diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eeda759ff18ccb86ce6a585fe41cb972ea3ae295..e718b32cb6c48d11e73600509a17db107f438708 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: clang-format-with-version-check name: clang-format description: Format files with ClangFormat. - entry: bash ./.clang_format.hook -i + entry: bash ./tools/codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: local @@ -52,7 +52,7 @@ repos: hooks: - id: copyright_checker name: copyright_checker - entry: python ./.copyright.hook + entry: python ./tools/codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/AUTHORS.md b/AUTHORS.md index 11f227be7148d8d6e055538347a8c31679406c84..8c4a113fc276783c945867ceae9612339b7f0bbc 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -22,6 +22,7 @@ | jczaja | Jacek Czaja | | JiayiFeng | Jia-Yi Feng | | kbinias | Krzysztof Binias | +| kexinzhao | Ke-Xin Zhao | | kuke | Yi-Bing Liu | | lcy-seso | Ying Cao | | lipeng-unisound | Peng Li | diff --git a/CMakeLists.txt b/CMakeLists.txt index cfaab206e1f321a55119d4a8d65c4a99d3819fff..4117f077219d3b8fc097631073eafa748ff918bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,12 +55,14 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) -option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_ANAKIN "Compile with Anakin library" OFF) +option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -147,7 +149,16 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) -include(external/grpc) + +if(WITH_DISTRIBUTE) + if(WITH_GRPC) + include(external/grpc) + else() + include(external/leveldb) + include(external/brpc) + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) @@ -183,7 +194,10 @@ set(EXTERNAL_LIBS if(WITH_GPU) include(cuda) include(tensorrt) -endif(WITH_GPU) + include(external/anakin) +else() + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) +endif() if(WITH_AMD_GPU) find_package(HIP) diff --git a/Dockerfile b/Dockerfile index 4d6165b79a1d94b8f27d7f3ee1b6e2cee5992d31..fc5069a6c080ed23317695e6822c4c46b5b5c7f9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y --allow-downgrades \ - git python-pip python-dev openssh-server bison \ + git python-pip python-dev python-opencv openssh-server bison \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index 46140a9d1be01a50cd74dab2799e3731e8d3debd..707fadb1fae97cefe8a41715cd57d71754abda41 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -1,11 +1,18 @@ FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 -RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop + +# Use UBUNTU_MIRROR can speed up apt-get speed. +# ARG UBUNTU_MIRROR +# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so -RUN pip install -U pip -RUN pip install -U kubernetes opencv-python paddlepaddle # IMPORTANT: # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. +# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ... + +RUN pip install -U pip +RUN pip install -U kubernetes paddlepaddle RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' @@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root +RUN chmod +x /usr/bin/paddle_k8s ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s +RUN pip install /*.whl && rm -f /*.whl ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py dataset.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/ +ADD models/ /workspace/models/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 1b0c7dce8bd6faab0c4c59caa1cbe337483cbd16..28cade4634bb62723bf5120169e202657f548234 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,14 +24,18 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: + * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: ```bash PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver + sleep 15 ``` * start trainers: ```bash @@ -42,6 +46,16 @@ Currently supported `--model` argument include: PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 ``` +## Prepare the RecordIO file to Achieve Better Performance + +Run the following command will generate RecordIO files like "mnist.recordio" under the path +and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size +at any time using `fluid.batch`. + +```bash +python -c 'from recordio_converter import *; prepare_mnist("data", 1)' +``` + ## Run Distributed Benchmark on Kubernetes Cluster You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 0000000000000000000000000000000000000000..68a3d42d7a8a8082730f4cae3b5d4ea33819ca2f --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,126 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 9d33a841cddb8d8b8e14c00ae7e9d600d5d2eb46..ece1102dce987cda994ff086b07f756498ce26e6 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,87 +24,7 @@ import paddle.fluid.core as core import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - # TODO(wuyi): add "--use_fake_data" option back. - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_false', - help='If set, test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): @@ -139,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -161,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -172,7 +97,7 @@ def dist_transpile(trainer_id): return train_program, fluid.default_startup_program() else: raise ValueError( - 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER' ) @@ -205,33 +130,57 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(startup_prog) - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) + + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - for batch_id, data in enumerate(train_reader()): + if not args.use_reader_op: + reader_generator = train_reader() + batch_id = 0 + data = None + while True: + if not args.use_reader_op: + data = next(reader_generator, None) + if data == None: + break + if iters == args.iterations: + break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if iters == args.iterations: - break - loss = exe.run(train_prog, - feed=feeder.feed(data), - fetch_list=[avg_loss]) + + if args.use_reader_op: + try: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + break + else: + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) iters += 1 - num_samples += len(data) + batch_id += 1 + # FIXME(wuyi): For use_reader_op, if the current + # pass is not the last, the last batch of this pass + # is also equal to args.batch_size. + if args.use_reader_op: + num_samples += args.batch_size * args.gpus + else: + num_samples += len(data) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % (pass_id, iters, np.mean(train_losses))) print_train_time(start_time, time.time(), num_samples) - print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))) + print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), # evaluation - if not args.no_test and batch_acc != None: + if not args.no_test and batch_acc and not args.use_reader_op: pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc) print(", Test Accuracy: %f" % pass_test_acc) @@ -245,10 +194,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + # generate fake: if args.use_fake_data: for var in feed_var_list: @@ -265,7 +218,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, "value": 1.0, "dtype": var.dtype}) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) @@ -282,12 +234,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_trainers=num_trainers, trainer_id=trainer_id) - feeder = fluid.DataFeeder(feed_var_list, place) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 start_time = time.time() - for batch_id, data in enumerate(train_reader()): + if not args.use_reader_op: + reader_generator = train_reader() + batch_id = 0 + data = None + while True: + if not args.use_reader_op: + data = next(reader_generator, None) + if data == None: + break + if iters == args.iterations: + break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") elif args.profile and pass_id == 0 and batch_id == 10: @@ -296,25 +257,30 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if iters == args.iterations: - break - if args.use_fake_data: - loss, = exe.run([avg_loss.name]) + if args.use_fake_data or args.use_reader_op: + try: + loss, = exe.run([avg_loss.name]) + except fluid.core.EnforceNotMet as ex: + break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) - if args.update_method == "pserver": - exe.bcast_params() - num_samples += len(data) + if args.use_reader_op: + num_samples += args.batch_size * args.gpus + else: + num_samples += len(data) iters += 1 if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) + batch_id += 1 + print_train_time(start_time, time.time(), num_samples) - if not args.no_test and batch_acc != None: + if not args.no_test and batch_acc and not args.use_reader_op: + # we have not implement record io for test + # skip test when use args.use_reader_op test_acc = test(startup_exe, infer_prog, test_reader, feeder, batch_acc) print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) - exit(0) def print_arguments(args): @@ -333,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples): (num_samples, train_elapsed, examples_per_sec)) +def print_paddle_envs(): + print('----------- Configuration envs -----------') + for k in os.environ: + if "PADDLE_" in k: + print "ENV %s:%s" % (k, os.environ[k]) + print('------------------------------------------------') + + def main(): args = parse_args() print_arguments(args) + print_paddle_envs() # the unique trainer id, starting from 0, needed by trainer # only @@ -354,7 +329,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py index 9da8a69af1d7b671b2648b1b3702776c1c0650b0..dfe8b5cdd58456902fa8ec355e9837dface3f7be 100644 --- a/benchmark/fluid/kube_gen_job.py +++ b/benchmark/fluid/kube_gen_job.py @@ -17,6 +17,7 @@ import copy import argparse import random import os +import copy from kube_templates import pserver, trainer, envs @@ -108,10 +109,9 @@ def gen_job(): tn_container["ports"][0]["containerPort"] = spreadport envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) - envs.append({"name": "TRAINERS", "value": str(args.trainers)}) - envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)}) + envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)}) envs.append({"name": "ENTRY", "value": args.entry}) - envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) # NOTE: these directories below are cluster specific, please modify # this settings before you run on your own cluster. @@ -166,17 +166,23 @@ def gen_job(): tn["spec"]["template"]["spec"]["volumes"] = volumes tn_container["volumeMounts"] = volumeMounts - ps_container["env"] = envs - ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) + ps_container["env"] = copy.deepcopy(envs) + ps_container["env"].append({ + "name": "PADDLE_TRAINING_ROLE", + "value": "PSERVER" + }) tn_container["env"] = envs if args.disttype == "pserver": tn_container["env"].append({ - "name": "TRAINING_ROLE", + "name": "PADDLE_TRAINING_ROLE", "value": "TRAINER" }) elif args.disttype == "nccl2" or args.disttype == "local": # NCCL2 have no training role, set to plain WORKER - tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) + tn_container["env"].append({ + "name": "PADDLE_TRAINING_ROLE", + "value": "WORKER" + }) os.mkdir(args.jobname) if args.disttype == "pserver": diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd27b21f83afae10b1d24833b81d57eb..17f6b03826ae818a3671ea7f9355a8e8c04b50be 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, return avg_cost, feeding_list -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - lod_t = core.LoDTensor() - lod_t.set(flattened_data, place) - lod_t.set_lod([lod]) - return lod_t, lod[-1] - - def lodtensor_to_ndarray(lod_tensor): dims = lod_tensor.get_dims() ndarray = np.zeros(shape=dims).astype('float32') @@ -197,6 +182,8 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): + if args.use_reader_op: + raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 decoder_size = 512 @@ -221,7 +208,7 @@ def get_model(args): train_batch_generator = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d264bfc12bdb159c06dae81db4949b9ee17268e2..8e740dc6896b7eeeb82170aa13d32987c4df5c48 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -20,6 +20,7 @@ import numpy as np import argparse import time import cProfile +import os import paddle import paddle.fluid as fluid @@ -65,19 +66,49 @@ def cnn_model(data): def get_model(args): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1, 1, 28, 28], (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if args.device == 'CPU' and args.cpus > 1: + places = fluid.layers.get_places(args.cpus) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + predict = cnn_model(pd.read_input(images)) + label = pd.read_input(label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + batch_acc = fluid.layers.accuracy(input=predict, label=label) + + pd.write_output(avg_cost) + pd.write_output(batch_acc) + + avg_cost, batch_acc = pd() + avg_cost = fluid.layers.mean(avg_cost) + batch_acc = fluid.layers.mean(batch_acc) + else: + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_acc = fluid.layers.accuracy(input=predict, label=label) # inference program inference_program = fluid.default_main_program().clone() @@ -88,7 +119,7 @@ def get_model(args): # Reader train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) + paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=args.batch_size) return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9dec8911ed64e09285fb461c4a12adb601535316..9ed1093c54a501cc93dbbf9c3651fe70914ce26b 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -19,6 +19,7 @@ from __future__ import print_function import functools import numpy as np import time +import os import cProfile, pstats, StringIO @@ -26,6 +27,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler +from recordio_converter import imagenet_train, imagenet_test def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): @@ -122,40 +124,85 @@ def get_model(args): else: dshape = [32, 32, 3] model = resnet_cifar10 - else: + train_reader = paddle.dataset.cifar.train10() + test_reader = paddle.dataset.cifar.test10() + elif args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] model = resnet_imagenet - - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - predict = model(input, class_dim) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) + train_reader = paddle.dataset.flowers.train() + test_reader = paddle.dataset.flowers.test() + elif args.data_set == "imagenet": + class_dim = 1000 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + if not args.data_path: + raise Exception( + "Must specify --data_path when training with imagenet") + train_reader = imagenet_train(args.data_path) + test_reader = imagenet_test(args.data_path) + + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + dshape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + input, label = fluid.layers.read_file(data_file) + else: + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if args.device == 'CPU' and args.cpus > 1: + places = fluid.layers.get_places(args.cpus) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + predict = model(pd.read_input(input), class_dim) + label = pd.read_input(label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + batch_acc = fluid.layers.accuracy(input=predict, label=label) + + pd.write_output(avg_cost) + pd.write_output(batch_acc) + + avg_cost, batch_acc = pd() + avg_cost = fluid.layers.mean(avg_cost) + batch_acc = fluid.layers.mean(batch_acc) + else: + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + batch_acc = fluid.layers.accuracy(input=predict, label=label) inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) + target_vars=[batch_acc]) optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - train_reader = paddle.batch( + batched_train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc + train_reader, buf_size=5120), + batch_size=args.batch_size * args.gpus, + drop_last=True) + batched_test_reader = paddle.batch( + train_reader, batch_size=args.batch_size, drop_last=True) + + return avg_cost, inference_program, optimizer, batched_train_reader,\ + batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 81a28b5f3aed0c325398b909d700c23df545824a..3231542a17ace99a17c9f9b9bdb3c2527637d9ef 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size): def get_model(args): + if args.use_reader_op: + raise Exception( + "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 emb_dim = 512 crop_size = 1500 @@ -115,25 +118,10 @@ def get_model(args): train_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), batch_size=args.batch_size) return loss, inference_program, adam, train_reader, test_reader, batch_acc - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = numpy.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 53856c5f7acd3a4e1476ec57154a880bb6f984c9..932601302d2f5d56b53e3462af886429034d8989 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import argparse import functools +import os def vgg16_bn_drop(input): @@ -65,9 +66,25 @@ def get_model(args): else: data_shape = [224, 224, 3] - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + data_shape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data( + name='data', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program net = vgg16_bn_drop(images) @@ -95,7 +112,7 @@ def get_model(args): paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6 --- /dev/null +++ b/benchmark/fluid/recordio_converter.py @@ -0,0 +1,164 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.dataset import mnist, cifar, flowers, image + + +def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data, + shape_label): + num_batches = 0 + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(py_reader(), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=shape_data), + fluid.layers.data( + name='label', shape=shape_label, dtype='int64'), + ], + place=fluid.CPUPlace()) + num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + outfilepath, reader, feeder) + return num_batches + + +def prepare_mnist(outpath, batch_size): + outfilepath = os.path.join(outpath, "mnist.recordio") + convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1]) + + +def prepare_cifar10(outpath, batch_size): + outfilepath = os.path.join(outpath, "cifar.recordio") + convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1]) + + +def prepare_flowers(outpath, batch_size): + outfilepath = os.path.join(outpath, "flowers.recordio") + convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224], + [1]) + + +def default_mapper(sample): + img, label = sample + img = image.simple_transform( + img, 256, 224, True, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +def imagenet_train(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "train.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + # shuffle all, this is slow + random.shuffle(imgfilelist) + + def train_reader(): + for idx, imgfile in enumerate(imgfilelist): + data = image.load_image( + os.path.join(data_dir, "train", imgfile.lower())) + label = [img2label[imgfile], ] + yield [data, label] + + return paddle.reader.map_readers(default_mapper, train_reader) + + +def imagenet_test(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "val.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + + def test_reader(): + for idx, imgfile in enumerate(imgfilelist): + base_path = os.path.join(data_dir, "val", imgfile.split(".")[0]) + image_path = ".".join([base_path, "jpeg"]) + data = image.load_image(image_path) + label = [img2label[imgfile], ] + yield [data, label] + + return paddle.reader.map_readers(default_mapper, test_reader) + + +# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with fluid.recordio_writer.create_recordio_writer( + filename, compressor, max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + print("written file: ", filename) + return counter + + +def prepare_imagenet(inpath, outpath, batch_size): + r = paddle.batch(imagenet_train(inpath), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name="image", shape=[3, 224, 224]), fluid.layers.data( + name="label", shape=[1], dtype='int64') + ], + place=fluid.CPUPlace()) + outpath = os.path.join(outpath, "imagenet.recordio") + convert_reader_to_recordio_files(outpath, 10000, r, feeder) diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..4309a3126c1d72fe1eb2d5ec423075aea4d3ec88 --- /dev/null +++ b/benchmark/fluid/run_fluid_benchmark.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 & + +sleep 15 + +CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 & + +CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 & diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 682614742cf1bd3130c638020a2545e16226d4d6..6a8b15a6b60a2e5635dc78fc877f0c8da9a2a998 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -92,6 +92,9 @@ if(WITH_GPU) if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") endif() + if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) + message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") + endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() elseif(WITH_AMD_GPU) @@ -115,6 +118,10 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") +if(WITH_DISTRIBUTE) + add_definitions(-DPADDLE_WITH_DISTRIBUTE) +endif() + if(WITH_GOLANG) # we need to symlink Paddle directory into GOPATH. If we # don't do it and we have code that depends on Paddle, go @@ -163,3 +170,7 @@ if(WITH_GOLANG) endif() endif(WITH_GOLANG) + +if(WITH_GRPC) + add_definitions(-DPADDLE_WITH_GRPC) +endif(WITH_GRPC) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f1cd9c99ebfe5dc5ee0d46d61f1e08256c27d9cd --- /dev/null +++ b/cmake/external/anakin.cmake @@ -0,0 +1,42 @@ +if (NOT WITH_ANAKIN) + return() +endif() + +set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH + "Anakin install path." FORCE) +set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") +set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") + +set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp) + +set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") + +# A helper function used in Anakin, currently, to use it, one need to recursively include +# nearly all the header files. +function(fetch_include_recursively root_dir) + if (IS_DIRECTORY ${root_dir}) + include_directories(${root_dir}) + endif() + + file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) + foreach(sub ${ALL_SUB}) + if (IS_DIRECTORY ${root_dir}/${sub}) + fetch_include_recursively(${root_dir}/${sub}) + endif() + endforeach() +endfunction() + +# download library +message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") +execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") +execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") +execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") +execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") +execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") + +if (WITH_ANAKIN) + message(STATUS "Anakin for inference is enabled") + message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") + fetch_include_recursively(${ANAKIN_INCLUDE}) + link_directories(${ANAKIN_LIBRARY}) +endif() diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8e2c913b2caae0c4eeb844d2b51a8975e81c1592 --- /dev/null +++ b/cmake/external/brpc.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) +SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) +SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) + +INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) + +# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") + +# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF +ExternalProject_Add( + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/brpc/brpc" + GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + PREFIX ${BRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DBRPC_WITH_GLOG=ON + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +ADD_DEPENDENCIES(brpc extern_brpc) + + +LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 4b6840578fd155027c895b6ed5d1f9133868f312..85f40585da29bab9a107f5546e64870975f4c2d3 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -40,11 +40,12 @@ ExternalProject_Add( # NOTE(wuyi): # this package is generated by following steps: # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. submodule update --init + # 2. git submodule update --init # 3. keep only zlib, cares, protobuf, boringssl under "third_party", # checkout and clean other dirs under third_party # 4. remove .git, and package the directory. - URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz" + URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" + URL_MD5 "1f268a2aff6759839dccd256adcc91cf" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fb5091731da02b497a14f119e944905eee4979d5 --- /dev/null +++ b/cmake/external/leveldb.cmake @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb) +SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) +SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE) +SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE) +INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_leveldb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LEVELDB_SOURCES_DIR} + URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" + URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + CONFIGURE_COMMAND "" + BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a + INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ + && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} + && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_leveldb snappy) + +ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) +ADD_DEPENDENCIES(leveldb extern_leveldb) + +LIST(APPEND external_project_dependencies leveldb) + diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 25c07850dda7b2f69c2207c37b9d2368632104ec..20dda35c5ccd98f5672d867c26ab97a215483543 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ELSE() MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() -SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result") +SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result") +SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") ExternalProject_Add( @@ -53,7 +54,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f" + GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 8af2765f58717408e3a1ef6b500bb01511bfd8d3..ce6a88b51dc98ac46dd3935f12658d60d364ba8c 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND}) "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) + ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_COMMIT "v0.2.20") @@ -112,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") ADD_LIBRARY(cblas STATIC ${dummyfile}) -TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + +IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + TARGET_LINK_LIBRARIES(cblas dynload_mklml) +ELSE() + TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) +ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9ddd05b3d9404df29ca1bf634105314b7e6a5b70..9c42044ec163e9db1dd21d5c3915b010c30fdf1c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,6 +195,15 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Only deps libmklml.so, not link + if("${cc_library_DEPS};" MATCHES "mklml;") + list(REMOVE_ITEM cc_library_DEPS mklml) + if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml") + list(APPEND cc_library_DEPS dynload_mklml) + endif() + add_dependencies(${TARGET_NAME} mklml) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -610,3 +619,21 @@ function(grpc_library TARGET_NAME) COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") endfunction() + + +function(brpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating brpc ${brpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}") + cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}") + cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") +endfunction() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 236a55d332a91c88d1c5515e7aca4142930a079f..cd44fe2542bfa8c53721d61b70778226e640d375 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -39,7 +39,7 @@ function(copy TARGET) message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") endif() math(EXPR len "${copy_lib_SRCS_len} - 1") - + add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) foreach(index RANGE ${len}) list(GET copy_lib_SRCS ${index} src) @@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid DSTS ${dst_dir}/${module} ${dst_dir}/${module} ) +if(WITH_CONTRIB) + set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") + copy(contrib_inference_lib DEPS paddle_inference_api + SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* + DSTS ${contrib_dst_dir} ${contrib_dst_dir} + ) +endif() + set(module "platform") copy(platform_lib DEPS profiler_py_proto SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst new file mode 100644 index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a --- /dev/null +++ b/doc/about/about_us.rst @@ -0,0 +1,53 @@ +========= +关于我们 +========= + +什么是PaddlePaddle +-------------------- + +- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 + +- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 + +- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 + +PaddlePaddle的技术特色 +------------------------- + +- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 + +- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 + +- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 + +提供基于PaddlePaddle的教育体系 +-------------------------------- + +- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 + +- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 + +- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 + + +提供基于PaddlePaddle的AI服务 +------------------------------ + +- EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 + +- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 + +- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 + +你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 +----------------------------------------------------------- + +- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 + +- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com + +我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 + + + +PaddlePaddle团队 diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst new file mode 100644 index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12 --- /dev/null +++ b/doc/fluid/api/average.rst @@ -0,0 +1,16 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============= +fluid.average +============= + +.. _api_fluid_average_WeightedAverage: + +WeightedAverage +--------------- + +.. autoclass:: paddle.fluid.average.WeightedAverage + :members: + :noindex: + diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst new file mode 100644 index 0000000000000000000000000000000000000000..115e0d24b39928cfc349f72e0a21d6374cd8cd75 --- /dev/null +++ b/doc/fluid/api/backward.rst @@ -0,0 +1,23 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============== +fluid.backward +============== + +.. _api_fluid_backward_append_backward: + +append_backward +--------------- + +.. autofunction:: paddle.fluid.backward.append_backward + :noindex: + +.. _api_fluid_backward_calc_gradient: + +calc_gradient +------------- + +.. autofunction:: paddle.fluid.backward.calc_gradient + :noindex: + diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst index 3ba096388fc87dda3096a9030fe5749e61112c06..aeefbb95a46e5d5ed46375e388a720fad2711779 100644 --- a/doc/fluid/api/clip.rst +++ b/doc/fluid/api/clip.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -clip -==== +========== +fluid.clip +========== + +.. _api_fluid_clip_ErrorClipByValue: ErrorClipByValue ---------------- @@ -12,6 +14,8 @@ ErrorClipByValue :members: :noindex: +.. _api_fluid_clip_GradientClipByValue: + GradientClipByValue ------------------- @@ -19,6 +23,8 @@ GradientClipByValue :members: :noindex: +.. _api_fluid_clip_GradientClipByNorm: + GradientClipByNorm ------------------ @@ -26,6 +32,8 @@ GradientClipByNorm :members: :noindex: +.. _api_fluid_clip_GradientClipByGlobalNorm: + GradientClipByGlobalNorm ------------------------ @@ -33,15 +41,3 @@ GradientClipByGlobalNorm :members: :noindex: -append_gradient_clip_ops ------------------------- - -.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops - :noindex: - -error_clip_callback -------------------- - -.. autofunction:: paddle.fluid.clip.error_clip_callback - :noindex: - diff --git a/doc/fluid/api/data.rst b/doc/fluid/api/data.rst deleted file mode 100644 index b56c7332cc284649c7e04328e51a7faa78593a39..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data.rst +++ /dev/null @@ -1,10 +0,0 @@ -================================== -Data Reader Interface and DataSets -================================== - -.. toctree:: - :maxdepth: 1 - - data/data_reader.rst - data/image.rst - data/dataset.rst diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst index 3df5c0307ffed9d101da58b385840b115920e906..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8 100644 --- a/doc/fluid/api/data_feeder.rst +++ b/doc/fluid/api/data_feeder.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -data_feeder -=========== +================= +fluid.data_feeder +================= + +.. _api_fluid_data_feeder_DataFeeder: DataFeeder ---------- diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst deleted file mode 100644 index c0dc9a0d1d9f2f70948dc3c905dca25d7dd43742..0000000000000000000000000000000000000000 --- a/doc/fluid/api/evaluator.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -========= -evaluator -========= - diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index f67a14c49f372e67d18ec8e6f87da01109376d22..db2842e7f23e74130a966bb347004bee1ccb08fd 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -executor -======== +============== +fluid.executor +============== + +.. _api_fluid_executor_Executor: Executor -------- @@ -12,24 +14,32 @@ Executor :members: :noindex: +.. _api_fluid_executor_global_scope: + global_scope ------------ .. autofunction:: paddle.fluid.executor.global_scope :noindex: +.. _api_fluid_executor_scope_guard: + scope_guard ----------- .. autofunction:: paddle.fluid.executor.scope_guard :noindex: -switch_scope ------------- +.. _api_fluid_executor__switch_scope: + +_switch_scope +------------- -.. autofunction:: paddle.fluid.executor.switch_scope +.. autofunction:: paddle.fluid.executor._switch_scope :noindex: +.. _api_fluid_executor_fetch_var: + fetch_var --------- diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..51cdfe0c2ed045a5b3247c4fdec9868d756eae86 --- /dev/null +++ b/doc/fluid/api/fluid.rst @@ -0,0 +1,378 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===== +fluid +===== + +.. _api_fluid_Block: + +Block +----- + +.. autoclass:: paddle.fluid.Block + :members: + :noindex: + +.. _api_fluid_Variable: + +Variable +-------- + +.. autoclass:: paddle.fluid.Variable + :members: + :noindex: + +.. _api_fluid_Program: + +Program +------- + +.. autoclass:: paddle.fluid.Program + :members: + :noindex: + +.. _api_fluid_Operator: + +Operator +-------- + +.. autoclass:: paddle.fluid.Operator + :members: + :noindex: + +.. _api_fluid_default_startup_program: + +default_startup_program +----------------------- + +.. autofunction:: paddle.fluid.default_startup_program + :noindex: + +.. _api_fluid_default_main_program: + +default_main_program +-------------------- + +.. autofunction:: paddle.fluid.default_main_program + :noindex: + +.. _api_fluid_program_guard: + +program_guard +------------- + +.. autofunction:: paddle.fluid.program_guard + :noindex: + +.. _api_fluid_get_var: + +get_var +------- + +.. autofunction:: paddle.fluid.get_var + :noindex: + +.. _api_fluid_Executor: + +Executor +-------- + +.. autoclass:: paddle.fluid.Executor + :members: + :noindex: + +.. _api_fluid_global_scope: + +global_scope +------------ + +.. autofunction:: paddle.fluid.global_scope + :noindex: + +.. _api_fluid_scope_guard: + +scope_guard +----------- + +.. autofunction:: paddle.fluid.scope_guard + :noindex: + +.. _api_fluid__switch_scope: + +_switch_scope +------------- + +.. autofunction:: paddle.fluid._switch_scope + :noindex: + +.. _api_fluid_fetch_var: + +fetch_var +--------- + +.. autofunction:: paddle.fluid.fetch_var + :noindex: + +.. _api_fluid_Go: + +Go +-- + +.. autoclass:: paddle.fluid.Go + :members: + :noindex: + +.. _api_fluid_make_channel: + +make_channel +------------ + +.. autofunction:: paddle.fluid.make_channel + :noindex: + +.. _api_fluid_channel_send: + +channel_send +------------ + +.. autofunction:: paddle.fluid.channel_send + :noindex: + +.. _api_fluid_channel_recv: + +channel_recv +------------ + +.. autofunction:: paddle.fluid.channel_recv + :noindex: + +.. _api_fluid_channel_close: + +channel_close +------------- + +.. autofunction:: paddle.fluid.channel_close + :noindex: + +.. _api_fluid_Select: + +Select +------ + +.. autoclass:: paddle.fluid.Select + :members: + :noindex: + +.. _api_fluid_Trainer: + +Trainer +------- + +.. autoclass:: paddle.fluid.Trainer + :members: + :noindex: + +.. _api_fluid_BeginEpochEvent: + +BeginEpochEvent +--------------- + +.. autoclass:: paddle.fluid.BeginEpochEvent + :members: + :noindex: + +.. _api_fluid_EndEpochEvent: + +EndEpochEvent +------------- + +.. autoclass:: paddle.fluid.EndEpochEvent + :members: + :noindex: + +.. _api_fluid_BeginStepEvent: + +BeginStepEvent +-------------- + +.. autoclass:: paddle.fluid.BeginStepEvent + :members: + :noindex: + +.. _api_fluid_EndStepEvent: + +EndStepEvent +------------ + +.. autoclass:: paddle.fluid.EndStepEvent + :members: + :noindex: + +.. _api_fluid_CheckpointConfig: + +CheckpointConfig +---------------- + +.. autoclass:: paddle.fluid.CheckpointConfig + :members: + :noindex: + +.. _api_fluid_Inferencer: + +Inferencer +---------- + +.. autoclass:: paddle.fluid.Inferencer + :members: + :noindex: + +.. _api_fluid_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_memory_optimize: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.memory_optimize + :noindex: + +.. _api_fluid_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.release_memory + :noindex: + +.. _api_fluid_ParallelExecutor: + +ParallelExecutor +---------------- + +.. autoclass:: paddle.fluid.ParallelExecutor + :members: + :noindex: + +.. _api_fluid_ExecutionStrategy: + +ExecutionStrategy +----------------- + +.. autoclass:: paddle.fluid.ExecutionStrategy + :members: + :noindex: + +.. _api_fluid_BuildStrategy: + +BuildStrategy +------------- + +.. autoclass:: paddle.fluid.BuildStrategy + :members: + :noindex: + +.. _api_fluid_create_lod_tensor: + +create_lod_tensor +----------------- + +.. autofunction:: paddle.fluid.create_lod_tensor + :noindex: + +.. _api_fluid_create_random_int_lodtensor: + +create_random_int_lodtensor +--------------------------- + +.. autofunction:: paddle.fluid.create_random_int_lodtensor + :noindex: + +.. _api_fluid_LoDTensor: + +LoDTensor +--------- + +.. autoclass:: paddle.fluid.LoDTensor + :members: + :noindex: + +.. _api_fluid_CPUPlace: + +CPUPlace +-------- + +.. autoclass:: paddle.fluid.CPUPlace + :members: + :noindex: + +.. _api_fluid_CUDAPlace: + +CUDAPlace +--------- + +.. autoclass:: paddle.fluid.CUDAPlace + :members: + :noindex: + +.. _api_fluid_CUDAPinnedPlace: + +CUDAPinnedPlace +--------------- + +.. autoclass:: paddle.fluid.CUDAPinnedPlace + :members: + :noindex: + +.. _api_fluid_Tensor: + +Tensor +------ + +.. autoclass:: paddle.fluid.Tensor + :members: + :noindex: + +.. _api_fluid_ParamAttr: + +ParamAttr +--------- + +.. autoclass:: paddle.fluid.ParamAttr + :members: + :noindex: + +.. _api_fluid_WeightNormParamAttr: + +WeightNormParamAttr +------------------- + +.. autoclass:: paddle.fluid.WeightNormParamAttr + :members: + :noindex: + +.. _api_fluid_DataFeeder: + +DataFeeder +---------- + +.. autoclass:: paddle.fluid.DataFeeder + :members: + :noindex: + +.. _api_fluid_Scope: + +Scope +----- + +.. autoclass:: paddle.fluid.Scope + :members: + :noindex: + diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py index 89ab880301b6ac687fd61f556f87f03792c37da3..02efce2bf8392c62a7600c272bedcadc6563f927 100644 --- a/doc/fluid/api/gen_doc.py +++ b/doc/fluid/api/gen_doc.py @@ -29,19 +29,27 @@ def parse_arg(): class DocGenerator(object): - def __init__(self, module_name, stream=sys.stdout): + def __init__(self, module_name=None, stream=sys.stdout): + if module_name == "": + module_name = None self.stream = stream - self.module_name = module_name - if not hasattr(fluid, module_name): - raise ValueError("Cannot find fluid.{0}".format(module_name)) + if module_name is None: + self.module_name = "fluid" else: - self.module = getattr(fluid, module_name) + self.module_name = "fluid." + module_name + if module_name is None: + self.module = fluid + else: + if not hasattr(fluid, module_name): + raise ValueError("Cannot find fluid.{0}".format(module_name)) + else: + self.module = getattr(fluid, module_name) self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! ''') - self._print_header_(module_name, dot='=', is_title=True) + self._print_header_(self.module_name, dot='=', is_title=True) def print_submodule(self, submodule_name): submodule = getattr(self.module, submodule_name) @@ -60,25 +68,29 @@ class DocGenerator(object): self._print_header_(name, dot='=', is_title=False) def print_item(self, name): - item = getattr(self.module, name) + item = getattr(self.module, name, None) + if item is None: + return if isinstance(item, types.TypeType): self.print_class(name) elif isinstance(item, types.FunctionType): self.print_method(name) else: - raise RuntimeError("Unsupported item {0}".format(name)) + pass def print_class(self, name): + self._print_ref_(name) self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1} + self.stream.write('''.. autoclass:: paddle.{0}.{1} :members: :noindex: '''.format(self.module_name, name)) def print_method(self, name): + self._print_ref_(name) self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1} + self.stream.write('''.. autofunction:: paddle.{0}.{1} :noindex: '''.format(self.module_name, name)) @@ -94,6 +106,10 @@ class DocGenerator(object): self.stream.write('\n') self.stream.write('\n') + def _print_ref_(self, name): + self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join( + self.module_name.split(".")), name)) + def main(): args = parse_arg() diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index 0f0539355559446fd91f659d61b636db214b5a40..b14ee29873c50fd011f6c48b754767ac8918252a 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,7 +1,9 @@ #!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst +python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst -for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer +for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler do python gen_doc.py ${module} > ${module}.rst done + +python gen_doc.py "" > fluid.rst diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst index 29cea9c68221b921939e8e09072d87f9f604e21b..359406819a993e7eaf2155c839373df44d97b103 100644 --- a/doc/fluid/api/index_en.rst +++ b/doc/fluid/api/index_en.rst @@ -1,10 +1,11 @@ -====================== -Fluid -====================== +============= +API Reference +============= .. toctree:: :maxdepth: 1 + fluid.rst layers.rst data_feeder.rst executor.rst @@ -18,3 +19,8 @@ Fluid regularizer.rst io.rst data.rst + transpiler.rst + recordio_writer.rst + backward.rst + average.rst + profiler.rst diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst index c49a98c744cdf907630ea8c74791ff2021d996e8..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607 100644 --- a/doc/fluid/api/initializer.rst +++ b/doc/fluid/api/initializer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -initializer -=========== +================= +fluid.initializer +================= + +.. _api_fluid_initializer_Constant: Constant -------- @@ -12,6 +14,8 @@ Constant :members: :noindex: +.. _api_fluid_initializer_Uniform: + Uniform ------- @@ -19,6 +23,8 @@ Uniform :members: :noindex: +.. _api_fluid_initializer_Normal: + Normal ------ @@ -26,6 +32,8 @@ Normal :members: :noindex: +.. _api_fluid_initializer_Xavier: + Xavier ------ @@ -33,18 +41,42 @@ Xavier :members: :noindex: +.. _api_fluid_initializer_Bilinear: + +Bilinear +-------- + +.. autoclass:: paddle.fluid.initializer.Bilinear + :members: + :noindex: + +.. _api_fluid_initializer_MSRA: + +MSRA +---- + +.. autoclass:: paddle.fluid.initializer.MSRA + :members: + :noindex: + +.. _api_fluid_initializer_force_init_on_cpu: + force_init_on_cpu ----------------- .. autofunction:: paddle.fluid.initializer.force_init_on_cpu :noindex: +.. _api_fluid_initializer_init_on_cpu: + init_on_cpu ----------- .. autofunction:: paddle.fluid.initializer.init_on_cpu :noindex: +.. _api_fluid_initializer_ConstantInitializer: + ConstantInitializer ------------------- @@ -52,6 +84,8 @@ ConstantInitializer :members: :noindex: +.. _api_fluid_initializer_UniformInitializer: + UniformInitializer ------------------ @@ -59,6 +93,8 @@ UniformInitializer :members: :noindex: +.. _api_fluid_initializer_NormalInitializer: + NormalInitializer ----------------- @@ -66,6 +102,8 @@ NormalInitializer :members: :noindex: +.. _api_fluid_initializer_XavierInitializer: + XavierInitializer ----------------- @@ -73,3 +111,21 @@ XavierInitializer :members: :noindex: +.. _api_fluid_initializer_BilinearInitializer: + +BilinearInitializer +------------------- + +.. autoclass:: paddle.fluid.initializer.BilinearInitializer + :members: + :noindex: + +.. _api_fluid_initializer_MSRAInitializer: + +MSRAInitializer +--------------- + +.. autoclass:: paddle.fluid.initializer.MSRAInitializer + :members: + :noindex: + diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst index 3e956f8302d261b52f9f76ff8eb4a01f9c6381f8..7cee0bc4d9aa2c51517d23a381f14a8f63cc3681 100644 --- a/doc/fluid/api/io.rst +++ b/doc/fluid/api/io.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -== -io -== +======== +fluid.io +======== + +.. _api_fluid_io_save_vars: save_vars --------- @@ -11,69 +13,115 @@ save_vars .. autofunction:: paddle.fluid.io.save_vars :noindex: +.. _api_fluid_io_save_params: + save_params ----------- .. autofunction:: paddle.fluid.io.save_params :noindex: +.. _api_fluid_io_save_persistables: + save_persistables ----------------- .. autofunction:: paddle.fluid.io.save_persistables :noindex: +.. _api_fluid_io_load_vars: + load_vars --------- .. autofunction:: paddle.fluid.io.load_vars :noindex: +.. _api_fluid_io_load_params: + load_params ----------- .. autofunction:: paddle.fluid.io.load_params :noindex: +.. _api_fluid_io_load_persistables: + load_persistables ----------------- .. autofunction:: paddle.fluid.io.load_persistables :noindex: +.. _api_fluid_io_save_inference_model: + save_inference_model -------------------- .. autofunction:: paddle.fluid.io.save_inference_model :noindex: +.. _api_fluid_io_load_inference_model: + load_inference_model -------------------- .. autofunction:: paddle.fluid.io.load_inference_model :noindex: +.. _api_fluid_io_get_inference_program: + get_inference_program --------------------- .. autofunction:: paddle.fluid.io.get_inference_program :noindex: +.. _api_fluid_io_save_checkpoint: + save_checkpoint --------------- .. autofunction:: paddle.fluid.io.save_checkpoint :noindex: +.. _api_fluid_io_load_checkpoint: + load_checkpoint --------------- .. autofunction:: paddle.fluid.io.load_checkpoint :noindex: +.. _api_fluid_io_clean_checkpoint: + clean_checkpoint ---------------- .. autofunction:: paddle.fluid.io.clean_checkpoint :noindex: +.. _api_fluid_io_load_persist_vars_without_grad: + +load_persist_vars_without_grad +------------------------------ + +.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad + :noindex: + +.. _api_fluid_io_save_persist_vars_without_grad: + +save_persist_vars_without_grad +------------------------------ + +.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad + :noindex: + +.. _api_fluid_io_get_latest_checkpoint_serial: + +get_latest_checkpoint_serial +---------------------------- + +.. autofunction:: paddle.fluid.io.get_latest_checkpoint_serial + :noindex: + diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index f78e6db3268e44d5f30d83508f07c4ed68106e48..264506a68ae17d081dd58ef4794bf7723f6d021c 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1,25 +1,31 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -====== -layers -====== +============ +fluid.layers +============ control_flow ============ +.. _api_fluid_layers_split_lod_tensor: + split_lod_tensor ---------------- .. autofunction:: paddle.fluid.layers.split_lod_tensor :noindex: +.. _api_fluid_layers_merge_lod_tensor: + merge_lod_tensor ---------------- .. autofunction:: paddle.fluid.layers.merge_lod_tensor :noindex: +.. _api_fluid_layers_BlockGuard: + BlockGuard ---------- @@ -27,6 +33,8 @@ BlockGuard :members: :noindex: +.. _api_fluid_layers_BlockGuardWithCompletion: + BlockGuardWithCompletion ------------------------ @@ -34,12 +42,7 @@ BlockGuardWithCompletion :members: :noindex: -StaticRNNMemoryLink -------------------- - -.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink - :members: - :noindex: +.. _api_fluid_layers_WhileGuard: WhileGuard ---------- @@ -48,6 +51,8 @@ WhileGuard :members: :noindex: +.. _api_fluid_layers_While: + While ----- @@ -55,6 +60,8 @@ While :members: :noindex: +.. _api_fluid_layers_Switch: + Switch ------ @@ -62,78 +69,104 @@ Switch :members: :noindex: +.. _api_fluid_layers_lod_rank_table: + lod_rank_table -------------- .. autofunction:: paddle.fluid.layers.lod_rank_table :noindex: +.. _api_fluid_layers_max_sequence_len: + max_sequence_len ---------------- .. autofunction:: paddle.fluid.layers.max_sequence_len :noindex: +.. _api_fluid_layers_lod_tensor_to_array: + lod_tensor_to_array ------------------- .. autofunction:: paddle.fluid.layers.lod_tensor_to_array :noindex: +.. _api_fluid_layers_array_to_lod_tensor: + array_to_lod_tensor ------------------- .. autofunction:: paddle.fluid.layers.array_to_lod_tensor :noindex: +.. _api_fluid_layers_increment: + increment --------- .. autofunction:: paddle.fluid.layers.increment :noindex: +.. _api_fluid_layers_array_write: + array_write ----------- .. autofunction:: paddle.fluid.layers.array_write :noindex: +.. _api_fluid_layers_create_array: + create_array ------------ .. autofunction:: paddle.fluid.layers.create_array :noindex: +.. _api_fluid_layers_less_than: + less_than --------- .. autofunction:: paddle.fluid.layers.less_than :noindex: +.. _api_fluid_layers_equal: + equal ----- .. autofunction:: paddle.fluid.layers.equal :noindex: +.. _api_fluid_layers_array_read: + array_read ---------- .. autofunction:: paddle.fluid.layers.array_read :noindex: +.. _api_fluid_layers_shrink_memory: + shrink_memory ------------- .. autofunction:: paddle.fluid.layers.shrink_memory :noindex: +.. _api_fluid_layers_array_length: + array_length ------------ .. autofunction:: paddle.fluid.layers.array_length :noindex: +.. _api_fluid_layers_IfElse: + IfElse ------ @@ -141,6 +174,8 @@ IfElse :members: :noindex: +.. _api_fluid_layers_DynamicRNN: + DynamicRNN ---------- @@ -148,6 +183,8 @@ DynamicRNN :members: :noindex: +.. _api_fluid_layers_ConditionalBlock: + ConditionalBlock ---------------- @@ -155,6 +192,8 @@ ConditionalBlock :members: :noindex: +.. _api_fluid_layers_StaticRNN: + StaticRNN --------- @@ -162,12 +201,16 @@ StaticRNN :members: :noindex: +.. _api_fluid_layers_reorder_lod_tensor_by_rank: + reorder_lod_tensor_by_rank -------------------------- .. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank :noindex: +.. _api_fluid_layers_ParallelDo: + ParallelDo ---------- @@ -175,12 +218,16 @@ ParallelDo :members: :noindex: +.. _api_fluid_layers_Print: + Print ----- .. autofunction:: paddle.fluid.layers.Print :noindex: +.. _api_fluid_layers_is_empty: + is_empty -------- @@ -190,6 +237,8 @@ is_empty device ====== +.. _api_fluid_layers_get_places: + get_places ---------- @@ -199,12 +248,16 @@ get_places io == +.. _api_fluid_layers_data: + data ---- .. autofunction:: paddle.fluid.layers.data :noindex: +.. _api_fluid_layers_BlockGuardServ: + BlockGuardServ -------------- @@ -212,6 +265,8 @@ BlockGuardServ :members: :noindex: +.. _api_fluid_layers_ListenAndServ: + ListenAndServ ------------- @@ -219,54 +274,80 @@ ListenAndServ :members: :noindex: +.. _api_fluid_layers_Send: + Send ---- .. autofunction:: paddle.fluid.layers.Send :noindex: +.. _api_fluid_layers_Recv: + +Recv +---- + +.. autofunction:: paddle.fluid.layers.Recv + :noindex: + +.. _api_fluid_layers_open_recordio_file: + open_recordio_file ------------------ .. autofunction:: paddle.fluid.layers.open_recordio_file :noindex: +.. _api_fluid_layers_open_files: + open_files ---------- .. autofunction:: paddle.fluid.layers.open_files :noindex: +.. _api_fluid_layers_read_file: + read_file --------- .. autofunction:: paddle.fluid.layers.read_file :noindex: +.. _api_fluid_layers_shuffle: + shuffle ------- .. autofunction:: paddle.fluid.layers.shuffle :noindex: +.. _api_fluid_layers_batch: + batch ----- .. autofunction:: paddle.fluid.layers.batch :noindex: +.. _api_fluid_layers_double_buffer: + double_buffer ------------- .. autofunction:: paddle.fluid.layers.double_buffer :noindex: +.. _api_fluid_layers_random_data_generator: + random_data_generator --------------------- .. autofunction:: paddle.fluid.layers.random_data_generator :noindex: +.. _api_fluid_layers_Preprocessor: + Preprocessor ------------ @@ -274,479 +355,709 @@ Preprocessor :members: :noindex: +.. _api_fluid_layers_load: + +load +---- + +.. autofunction:: paddle.fluid.layers.load + :noindex: + nn == +.. _api_fluid_layers_fc: + fc -- .. autofunction:: paddle.fluid.layers.fc :noindex: +.. _api_fluid_layers_embedding: + embedding --------- .. autofunction:: paddle.fluid.layers.embedding :noindex: +.. _api_fluid_layers_dynamic_lstm: + dynamic_lstm ------------ .. autofunction:: paddle.fluid.layers.dynamic_lstm :noindex: +.. _api_fluid_layers_dynamic_lstmp: + dynamic_lstmp ------------- .. autofunction:: paddle.fluid.layers.dynamic_lstmp :noindex: +.. _api_fluid_layers_dynamic_gru: + dynamic_gru ----------- .. autofunction:: paddle.fluid.layers.dynamic_gru :noindex: +.. _api_fluid_layers_gru_unit: + gru_unit -------- .. autofunction:: paddle.fluid.layers.gru_unit :noindex: +.. _api_fluid_layers_linear_chain_crf: + linear_chain_crf ---------------- .. autofunction:: paddle.fluid.layers.linear_chain_crf :noindex: +.. _api_fluid_layers_crf_decoding: + crf_decoding ------------ .. autofunction:: paddle.fluid.layers.crf_decoding :noindex: +.. _api_fluid_layers_cos_sim: + cos_sim ------- .. autofunction:: paddle.fluid.layers.cos_sim :noindex: +.. _api_fluid_layers_cross_entropy: + cross_entropy ------------- .. autofunction:: paddle.fluid.layers.cross_entropy :noindex: +.. _api_fluid_layers_square_error_cost: + square_error_cost ----------------- .. autofunction:: paddle.fluid.layers.square_error_cost :noindex: +.. _api_fluid_layers_chunk_eval: + chunk_eval ---------- .. autofunction:: paddle.fluid.layers.chunk_eval :noindex: +.. _api_fluid_layers_sequence_conv: + sequence_conv ------------- .. autofunction:: paddle.fluid.layers.sequence_conv :noindex: +.. _api_fluid_layers_conv2d: + conv2d ------ .. autofunction:: paddle.fluid.layers.conv2d :noindex: +.. _api_fluid_layers_conv3d: + +conv3d +------ + +.. autofunction:: paddle.fluid.layers.conv3d + :noindex: + +.. _api_fluid_layers_sequence_pool: + sequence_pool ------------- .. autofunction:: paddle.fluid.layers.sequence_pool :noindex: +.. _api_fluid_layers_sequence_softmax: + sequence_softmax ---------------- .. autofunction:: paddle.fluid.layers.sequence_softmax :noindex: +.. _api_fluid_layers_softmax: + softmax ------- .. autofunction:: paddle.fluid.layers.softmax :noindex: +.. _api_fluid_layers_pool2d: + pool2d ------ .. autofunction:: paddle.fluid.layers.pool2d :noindex: +.. _api_fluid_layers_pool3d: + +pool3d +------ + +.. autofunction:: paddle.fluid.layers.pool3d + :noindex: + +.. _api_fluid_layers_batch_norm: + batch_norm ---------- .. autofunction:: paddle.fluid.layers.batch_norm :noindex: +.. _api_fluid_layers_beam_search_decode: + beam_search_decode ------------------ .. autofunction:: paddle.fluid.layers.beam_search_decode :noindex: +.. _api_fluid_layers_conv2d_transpose: + conv2d_transpose ---------------- .. autofunction:: paddle.fluid.layers.conv2d_transpose :noindex: +.. _api_fluid_layers_conv3d_transpose: + +conv3d_transpose +---------------- + +.. autofunction:: paddle.fluid.layers.conv3d_transpose + :noindex: + +.. _api_fluid_layers_sequence_expand: + sequence_expand --------------- .. autofunction:: paddle.fluid.layers.sequence_expand :noindex: +.. _api_fluid_layers_lstm_unit: + lstm_unit --------- .. autofunction:: paddle.fluid.layers.lstm_unit :noindex: +.. _api_fluid_layers_reduce_sum: + reduce_sum ---------- .. autofunction:: paddle.fluid.layers.reduce_sum :noindex: +.. _api_fluid_layers_reduce_mean: + reduce_mean ----------- .. autofunction:: paddle.fluid.layers.reduce_mean :noindex: +.. _api_fluid_layers_reduce_max: + reduce_max ---------- .. autofunction:: paddle.fluid.layers.reduce_max :noindex: +.. _api_fluid_layers_reduce_min: + reduce_min ---------- .. autofunction:: paddle.fluid.layers.reduce_min :noindex: +.. _api_fluid_layers_reduce_prod: + reduce_prod ----------- .. autofunction:: paddle.fluid.layers.reduce_prod :noindex: +.. _api_fluid_layers_sequence_first_step: + sequence_first_step ------------------- .. autofunction:: paddle.fluid.layers.sequence_first_step :noindex: +.. _api_fluid_layers_sequence_last_step: + sequence_last_step ------------------ .. autofunction:: paddle.fluid.layers.sequence_last_step :noindex: +.. _api_fluid_layers_dropout: + dropout ------- .. autofunction:: paddle.fluid.layers.dropout :noindex: +.. _api_fluid_layers_split: + split ----- .. autofunction:: paddle.fluid.layers.split :noindex: +.. _api_fluid_layers_ctc_greedy_decoder: + ctc_greedy_decoder ------------------ .. autofunction:: paddle.fluid.layers.ctc_greedy_decoder :noindex: +.. _api_fluid_layers_edit_distance: + edit_distance ------------- .. autofunction:: paddle.fluid.layers.edit_distance :noindex: +.. _api_fluid_layers_l2_normalize: + l2_normalize ------------ .. autofunction:: paddle.fluid.layers.l2_normalize :noindex: +.. _api_fluid_layers_matmul: + matmul ------ .. autofunction:: paddle.fluid.layers.matmul :noindex: +.. _api_fluid_layers_topk: + topk ---- .. autofunction:: paddle.fluid.layers.topk :noindex: +.. _api_fluid_layers_warpctc: + warpctc ------- .. autofunction:: paddle.fluid.layers.warpctc :noindex: +.. _api_fluid_layers_sequence_reshape: + sequence_reshape ---------------- .. autofunction:: paddle.fluid.layers.sequence_reshape :noindex: +.. _api_fluid_layers_transpose: + transpose --------- .. autofunction:: paddle.fluid.layers.transpose :noindex: +.. _api_fluid_layers_im2sequence: + im2sequence ----------- .. autofunction:: paddle.fluid.layers.im2sequence :noindex: +.. _api_fluid_layers_nce: + nce --- .. autofunction:: paddle.fluid.layers.nce :noindex: +.. _api_fluid_layers_beam_search: + beam_search ----------- .. autofunction:: paddle.fluid.layers.beam_search :noindex: +.. _api_fluid_layers_row_conv: + row_conv -------- .. autofunction:: paddle.fluid.layers.row_conv :noindex: +.. _api_fluid_layers_multiplex: + multiplex --------- .. autofunction:: paddle.fluid.layers.multiplex :noindex: +.. _api_fluid_layers_layer_norm: + layer_norm ---------- .. autofunction:: paddle.fluid.layers.layer_norm :noindex: +.. _api_fluid_layers_softmax_with_cross_entropy: + softmax_with_cross_entropy -------------------------- .. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy :noindex: +.. _api_fluid_layers_smooth_l1: + smooth_l1 --------- .. autofunction:: paddle.fluid.layers.smooth_l1 :noindex: +.. _api_fluid_layers_one_hot: + one_hot ------- .. autofunction:: paddle.fluid.layers.one_hot :noindex: +.. _api_fluid_layers_autoincreased_step_counter: + autoincreased_step_counter -------------------------- .. autofunction:: paddle.fluid.layers.autoincreased_step_counter :noindex: +.. _api_fluid_layers_reshape: + reshape ------- .. autofunction:: paddle.fluid.layers.reshape :noindex: +.. _api_fluid_layers_lod_reset: + lod_reset --------- .. autofunction:: paddle.fluid.layers.lod_reset :noindex: +.. _api_fluid_layers_lrn: + lrn --- .. autofunction:: paddle.fluid.layers.lrn :noindex: +.. _api_fluid_layers_pad: + pad --- .. autofunction:: paddle.fluid.layers.pad :noindex: +.. _api_fluid_layers_label_smooth: + label_smooth ------------ .. autofunction:: paddle.fluid.layers.label_smooth :noindex: +.. _api_fluid_layers_roi_pool: + roi_pool -------- .. autofunction:: paddle.fluid.layers.roi_pool :noindex: +.. _api_fluid_layers_dice_loss: + dice_loss --------- .. autofunction:: paddle.fluid.layers.dice_loss :noindex: +.. _api_fluid_layers_image_resize: + +image_resize +------------ + +.. autofunction:: paddle.fluid.layers.image_resize + :noindex: + +.. _api_fluid_layers_image_resize_short: + +image_resize_short +------------------ + +.. autofunction:: paddle.fluid.layers.image_resize_short + :noindex: + +.. _api_fluid_layers_resize_bilinear: + resize_bilinear --------------- .. autofunction:: paddle.fluid.layers.resize_bilinear :noindex: +.. _api_fluid_layers_gather: + gather ------ .. autofunction:: paddle.fluid.layers.gather :noindex: +.. _api_fluid_layers_random_crop: + random_crop ----------- .. autofunction:: paddle.fluid.layers.random_crop :noindex: +.. _api_fluid_layers_mean_iou: + +mean_iou +-------- + +.. autofunction:: paddle.fluid.layers.mean_iou + :noindex: + +.. _api_fluid_layers_relu: + +relu +---- + +.. autofunction:: paddle.fluid.layers.relu + :noindex: + +.. _api_fluid_layers_log: + +log +--- + +.. autofunction:: paddle.fluid.layers.log + :noindex: + +.. _api_fluid_layers_crop: + +crop +---- + +.. autofunction:: paddle.fluid.layers.crop + :noindex: + ops === +.. _api_fluid_layers_mean: + mean ---- .. autofunction:: paddle.fluid.layers.mean :noindex: +.. _api_fluid_layers_mul: + mul --- .. autofunction:: paddle.fluid.layers.mul :noindex: +.. _api_fluid_layers_scale: + scale ----- .. autofunction:: paddle.fluid.layers.scale :noindex: +.. _api_fluid_layers_sigmoid_cross_entropy_with_logits: + sigmoid_cross_entropy_with_logits --------------------------------- .. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits :noindex: +.. _api_fluid_layers_elementwise_add: + elementwise_add --------------- .. autofunction:: paddle.fluid.layers.elementwise_add :noindex: +.. _api_fluid_layers_elementwise_div: + elementwise_div --------------- .. autofunction:: paddle.fluid.layers.elementwise_div :noindex: +.. _api_fluid_layers_elementwise_sub: + elementwise_sub --------------- .. autofunction:: paddle.fluid.layers.elementwise_sub :noindex: +.. _api_fluid_layers_elementwise_mul: + elementwise_mul --------------- .. autofunction:: paddle.fluid.layers.elementwise_mul :noindex: +.. _api_fluid_layers_elementwise_max: + elementwise_max --------------- .. autofunction:: paddle.fluid.layers.elementwise_max :noindex: +.. _api_fluid_layers_elementwise_min: + elementwise_min --------------- .. autofunction:: paddle.fluid.layers.elementwise_min :noindex: +.. _api_fluid_layers_elementwise_pow: + elementwise_pow --------------- .. autofunction:: paddle.fluid.layers.elementwise_pow :noindex: +.. _api_fluid_layers_clip: + clip ---- .. autofunction:: paddle.fluid.layers.clip :noindex: +.. _api_fluid_layers_clip_by_norm: + clip_by_norm ------------ .. autofunction:: paddle.fluid.layers.clip_by_norm :noindex: +.. _api_fluid_layers_logical_and: + logical_and ----------- .. autofunction:: paddle.fluid.layers.logical_and :noindex: +.. _api_fluid_layers_logical_or: + logical_or ---------- .. autofunction:: paddle.fluid.layers.logical_or :noindex: +.. _api_fluid_layers_logical_xor: + logical_xor ----------- .. autofunction:: paddle.fluid.layers.logical_xor :noindex: +.. _api_fluid_layers_logical_not: + logical_not ----------- .. autofunction:: paddle.fluid.layers.logical_not :noindex: -uniform_random --------------- - -.. autofunction:: paddle.fluid.layers.uniform_random - :noindex: +.. _api_fluid_layers_uniform_random_batch_size_like: uniform_random_batch_size_like ------------------------------ @@ -754,23 +1065,23 @@ uniform_random_batch_size_like .. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like :noindex: +.. _api_fluid_layers_gaussian_random: + gaussian_random --------------- .. autofunction:: paddle.fluid.layers.gaussian_random :noindex: +.. _api_fluid_layers_gaussian_random_batch_size_like: + gaussian_random_batch_size_like ------------------------------- .. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like :noindex: -cumsum ------- - -.. autofunction:: paddle.fluid.layers.cumsum - :noindex: +.. _api_fluid_layers_scatter: scatter ------- @@ -778,41 +1089,79 @@ scatter .. autofunction:: paddle.fluid.layers.scatter :noindex: +.. _api_fluid_layers_sum: + sum --- .. autofunction:: paddle.fluid.layers.sum :noindex: +.. _api_fluid_layers_slice: + +slice +----- + +.. autofunction:: paddle.fluid.layers.slice + :noindex: + +.. _api_fluid_layers_polygon_box_transform: + +polygon_box_transform +--------------------- + +.. autofunction:: paddle.fluid.layers.polygon_box_transform + :noindex: + +.. _api_fluid_layers_shape: + shape ----- .. autofunction:: paddle.fluid.layers.shape :noindex: +.. _api_fluid_layers_iou_similarity: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +.. _api_fluid_layers_maxout: + +maxout +------ + +.. autofunction:: paddle.fluid.layers.maxout + :noindex: + +.. _api_fluid_layers_sigmoid: + sigmoid ------- .. autofunction:: paddle.fluid.layers.sigmoid :noindex: +.. _api_fluid_layers_logsigmoid: + logsigmoid ---------- .. autofunction:: paddle.fluid.layers.logsigmoid :noindex: +.. _api_fluid_layers_exp: + exp --- .. autofunction:: paddle.fluid.layers.exp :noindex: -relu ----- - -.. autofunction:: paddle.fluid.layers.relu - :noindex: +.. _api_fluid_layers_tanh: tanh ---- @@ -820,71 +1169,87 @@ tanh .. autofunction:: paddle.fluid.layers.tanh :noindex: +.. _api_fluid_layers_tanh_shrink: + tanh_shrink ----------- .. autofunction:: paddle.fluid.layers.tanh_shrink :noindex: +.. _api_fluid_layers_softshrink: + softshrink ---------- .. autofunction:: paddle.fluid.layers.softshrink :noindex: +.. _api_fluid_layers_sqrt: + sqrt ---- .. autofunction:: paddle.fluid.layers.sqrt :noindex: +.. _api_fluid_layers_abs: + abs --- .. autofunction:: paddle.fluid.layers.abs :noindex: +.. _api_fluid_layers_ceil: + ceil ---- .. autofunction:: paddle.fluid.layers.ceil :noindex: +.. _api_fluid_layers_floor: + floor ----- .. autofunction:: paddle.fluid.layers.floor :noindex: +.. _api_fluid_layers_cos: + cos --- .. autofunction:: paddle.fluid.layers.cos :noindex: +.. _api_fluid_layers_sin: + sin --- .. autofunction:: paddle.fluid.layers.sin :noindex: +.. _api_fluid_layers_round: + round ----- .. autofunction:: paddle.fluid.layers.round :noindex: +.. _api_fluid_layers_reciprocal: + reciprocal ---------- .. autofunction:: paddle.fluid.layers.reciprocal :noindex: -log ---- - -.. autofunction:: paddle.fluid.layers.log - :noindex: +.. _api_fluid_layers_square: square ------ @@ -892,150 +1257,506 @@ square .. autofunction:: paddle.fluid.layers.square :noindex: +.. _api_fluid_layers_softplus: + softplus -------- .. autofunction:: paddle.fluid.layers.softplus :noindex: +.. _api_fluid_layers_softsign: + softsign -------- .. autofunction:: paddle.fluid.layers.softsign :noindex: +.. _api_fluid_layers_brelu: + brelu ----- .. autofunction:: paddle.fluid.layers.brelu :noindex: +.. _api_fluid_layers_leaky_relu: + leaky_relu ---------- .. autofunction:: paddle.fluid.layers.leaky_relu :noindex: +.. _api_fluid_layers_soft_relu: + soft_relu --------- .. autofunction:: paddle.fluid.layers.soft_relu :noindex: +.. _api_fluid_layers_elu: + elu --- .. autofunction:: paddle.fluid.layers.elu :noindex: +.. _api_fluid_layers_relu6: + relu6 ----- .. autofunction:: paddle.fluid.layers.relu6 :noindex: +.. _api_fluid_layers_pow: + pow --- .. autofunction:: paddle.fluid.layers.pow :noindex: +.. _api_fluid_layers_stanh: + stanh ----- .. autofunction:: paddle.fluid.layers.stanh :noindex: +.. _api_fluid_layers_hard_sigmoid: + +hard_sigmoid +------------ + +.. autofunction:: paddle.fluid.layers.hard_sigmoid + :noindex: + +.. _api_fluid_layers_swish: + +swish +----- + +.. autofunction:: paddle.fluid.layers.swish + :noindex: + +.. _api_fluid_layers_uniform_random: + +uniform_random +-------------- + +.. autofunction:: paddle.fluid.layers.uniform_random + :noindex: + +.. _api_fluid_layers_hard_shrink: + hard_shrink ----------- .. autofunction:: paddle.fluid.layers.hard_shrink :noindex: +.. _api_fluid_layers_cumsum: + +cumsum +------ + +.. autofunction:: paddle.fluid.layers.cumsum + :noindex: + +.. _api_fluid_layers_thresholded_relu: + thresholded_relu ---------------- .. autofunction:: paddle.fluid.layers.thresholded_relu :noindex: -hard_sigmoid ------------- +tensor +====== -.. autofunction:: paddle.fluid.layers.hard_sigmoid +.. _api_fluid_layers_create_tensor: + +create_tensor +------------- + +.. autofunction:: paddle.fluid.layers.create_tensor :noindex: -swish +.. _api_fluid_layers_create_parameter: + +create_parameter +---------------- + +.. autofunction:: paddle.fluid.layers.create_parameter + :noindex: + +.. _api_fluid_layers_create_global_var: + +create_global_var +----------------- + +.. autofunction:: paddle.fluid.layers.create_global_var + :noindex: + +.. _api_fluid_layers_cast: + +cast +---- + +.. autofunction:: paddle.fluid.layers.cast + :noindex: + +.. _api_fluid_layers_concat: + +concat +------ + +.. autofunction:: paddle.fluid.layers.concat + :noindex: + +.. _api_fluid_layers_sums: + +sums +---- + +.. autofunction:: paddle.fluid.layers.sums + :noindex: + +.. _api_fluid_layers_assign: + +assign +------ + +.. autofunction:: paddle.fluid.layers.assign + :noindex: + +.. _api_fluid_layers_fill_constant_batch_size_like: + +fill_constant_batch_size_like +----------------------------- + +.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like + :noindex: + +.. _api_fluid_layers_fill_constant: + +fill_constant +------------- + +.. autofunction:: paddle.fluid.layers.fill_constant + :noindex: + +.. _api_fluid_layers_argmin: + +argmin +------ + +.. autofunction:: paddle.fluid.layers.argmin + :noindex: + +.. _api_fluid_layers_argmax: + +argmax +------ + +.. autofunction:: paddle.fluid.layers.argmax + :noindex: + +.. _api_fluid_layers_ones: + +ones +---- + +.. autofunction:: paddle.fluid.layers.ones + :noindex: + +.. _api_fluid_layers_zeros: + +zeros ----- -.. autofunction:: paddle.fluid.layers.swish +.. autofunction:: paddle.fluid.layers.zeros + :noindex: + +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse + :noindex: + +learning_rate_scheduler +======================= + +.. _api_fluid_layers_exponential_decay: + +exponential_decay +----------------- + +.. autofunction:: paddle.fluid.layers.exponential_decay + :noindex: + +.. _api_fluid_layers_natural_exp_decay: + +natural_exp_decay +----------------- + +.. autofunction:: paddle.fluid.layers.natural_exp_decay + :noindex: + +.. _api_fluid_layers_inverse_time_decay: + +inverse_time_decay +------------------ + +.. autofunction:: paddle.fluid.layers.inverse_time_decay + :noindex: + +.. _api_fluid_layers_polynomial_decay: + +polynomial_decay +---------------- + +.. autofunction:: paddle.fluid.layers.polynomial_decay + :noindex: + +.. _api_fluid_layers_piecewise_decay: + +piecewise_decay +--------------- + +.. autofunction:: paddle.fluid.layers.piecewise_decay + :noindex: + +.. _api_fluid_layers_noam_decay: + +noam_decay +---------- + +.. autofunction:: paddle.fluid.layers.noam_decay + :noindex: + +.. _api_fluid_layers_append_LARS: + +append_LARS +----------- + +.. autofunction:: paddle.fluid.layers.append_LARS + :noindex: + +detection +========= + +.. _api_fluid_layers_prior_box: + +prior_box +--------- + +.. autofunction:: paddle.fluid.layers.prior_box + :noindex: + +.. _api_fluid_layers_multi_box_head: + +multi_box_head +-------------- + +.. autofunction:: paddle.fluid.layers.multi_box_head + :noindex: + +.. _api_fluid_layers_bipartite_match: + +bipartite_match +--------------- + +.. autofunction:: paddle.fluid.layers.bipartite_match + :noindex: + +.. _api_fluid_layers_target_assign: + +target_assign +------------- + +.. autofunction:: paddle.fluid.layers.target_assign + :noindex: + +.. _api_fluid_layers_detection_output: + +detection_output +---------------- + +.. autofunction:: paddle.fluid.layers.detection_output + :noindex: + +.. _api_fluid_layers_ssd_loss: + +ssd_loss +-------- + +.. autofunction:: paddle.fluid.layers.ssd_loss + :noindex: + +.. _api_fluid_layers_detection_map: + +detection_map +------------- + +.. autofunction:: paddle.fluid.layers.detection_map + :noindex: + +.. _api_fluid_layers_iou_similarity: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +.. _api_fluid_layers_box_coder: + +box_coder +--------- + +.. autofunction:: paddle.fluid.layers.box_coder + :noindex: + +metric_op +========= + +.. _api_fluid_layers_accuracy: + +accuracy +-------- + +.. autofunction:: paddle.fluid.layers.accuracy + :noindex: + +.. _api_fluid_layers_auc: + +auc +--- + +.. autofunction:: paddle.fluid.layers.auc :noindex: tensor ====== +.. _api_fluid_layers_create_tensor: + create_tensor ------------- .. autofunction:: paddle.fluid.layers.create_tensor :noindex: +.. _api_fluid_layers_create_parameter: + create_parameter ---------------- .. autofunction:: paddle.fluid.layers.create_parameter :noindex: +.. _api_fluid_layers_create_global_var: + create_global_var ----------------- .. autofunction:: paddle.fluid.layers.create_global_var :noindex: +.. _api_fluid_layers_cast: + cast ---- .. autofunction:: paddle.fluid.layers.cast :noindex: +.. _api_fluid_layers_concat: + concat ------ .. autofunction:: paddle.fluid.layers.concat :noindex: +.. _api_fluid_layers_sums: + sums ---- .. autofunction:: paddle.fluid.layers.sums :noindex: +.. _api_fluid_layers_assign: + assign ------ .. autofunction:: paddle.fluid.layers.assign :noindex: +.. _api_fluid_layers_fill_constant_batch_size_like: + fill_constant_batch_size_like ----------------------------- .. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like :noindex: +.. _api_fluid_layers_fill_constant: + fill_constant ------------- .. autofunction:: paddle.fluid.layers.fill_constant :noindex: +.. _api_fluid_layers_argmin: + +argmin +------ + +.. autofunction:: paddle.fluid.layers.argmin + :noindex: + +.. _api_fluid_layers_argmax: + +argmax +------ + +.. autofunction:: paddle.fluid.layers.argmax + :noindex: + +.. _api_fluid_layers_ones: + ones ---- .. autofunction:: paddle.fluid.layers.ones :noindex: +.. _api_fluid_layers_zeros: + zeros ----- .. autofunction:: paddle.fluid.layers.zeros :noindex: +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse + :noindex: + diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst index ddf07775d7ea293acd421b8549d03b277ff0611d..0f54b2e2eb7ead353215c5dbd529293794e37123 100644 --- a/doc/fluid/api/metrics.rst +++ b/doc/fluid/api/metrics.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======= -metrics -======= +============= +fluid.metrics +============= + +.. _api_fluid_metrics_MetricBase: MetricBase ---------- @@ -12,6 +14,8 @@ MetricBase :members: :noindex: +.. _api_fluid_metrics_CompositeMetric: + CompositeMetric --------------- @@ -19,6 +23,26 @@ CompositeMetric :members: :noindex: +.. _api_fluid_metrics_Precision: + +Precision +--------- + +.. autoclass:: paddle.fluid.metrics.Precision + :members: + :noindex: + +.. _api_fluid_metrics_Recall: + +Recall +------ + +.. autoclass:: paddle.fluid.metrics.Recall + :members: + :noindex: + +.. _api_fluid_metrics_Accuracy: + Accuracy -------- @@ -26,6 +50,8 @@ Accuracy :members: :noindex: +.. _api_fluid_metrics_ChunkEvaluator: + ChunkEvaluator -------------- @@ -33,6 +59,8 @@ ChunkEvaluator :members: :noindex: +.. _api_fluid_metrics_EditDistance: + EditDistance ------------ @@ -40,6 +68,8 @@ EditDistance :members: :noindex: +.. _api_fluid_metrics_DetectionMAP: + DetectionMAP ------------ @@ -47,6 +77,8 @@ DetectionMAP :members: :noindex: +.. _api_fluid_metrics_Auc: + Auc --- diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst index 7ae3187304f386a08c5cb8a4ba093423a58a7f36..059733af18517257b6821d95fd628a9e13e6e98e 100644 --- a/doc/fluid/api/nets.rst +++ b/doc/fluid/api/nets.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -nets -==== +========== +fluid.nets +========== + +.. _api_fluid_nets_simple_img_conv_pool: simple_img_conv_pool -------------------- @@ -11,18 +13,24 @@ simple_img_conv_pool .. autofunction:: paddle.fluid.nets.simple_img_conv_pool :noindex: +.. _api_fluid_nets_sequence_conv_pool: + sequence_conv_pool ------------------ .. autofunction:: paddle.fluid.nets.sequence_conv_pool :noindex: +.. _api_fluid_nets_glu: + glu --- .. autofunction:: paddle.fluid.nets.glu :noindex: +.. _api_fluid_nets_scaled_dot_product_attention: + scaled_dot_product_attention ---------------------------- diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005..8d792120f2f16a8c92606b343eb4c3d4368bed14 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========= -optimizer -========= +=============== +fluid.optimizer +=============== + +.. _api_fluid_optimizer_SGD: SGD --- @@ -12,6 +14,8 @@ SGD :members: :noindex: +.. _api_fluid_optimizer_Momentum: + Momentum -------- @@ -19,6 +23,8 @@ Momentum :members: :noindex: +.. _api_fluid_optimizer_Adagrad: + Adagrad ------- @@ -26,6 +32,8 @@ Adagrad :members: :noindex: +.. _api_fluid_optimizer_Adam: + Adam ---- @@ -33,6 +41,8 @@ Adam :members: :noindex: +.. _api_fluid_optimizer_Adamax: + Adamax ------ @@ -40,6 +50,8 @@ Adamax :members: :noindex: +.. _api_fluid_optimizer_DecayedAdagrad: + DecayedAdagrad -------------- @@ -47,6 +59,17 @@ DecayedAdagrad :members: :noindex: +.. _api_fluid_optimizer_Ftrl: + +Ftrl +---- + +.. autoclass:: paddle.fluid.optimizer.Ftrl + :members: + :noindex: + +.. _api_fluid_optimizer_SGDOptimizer: + SGDOptimizer ------------ @@ -54,6 +77,8 @@ SGDOptimizer :members: :noindex: +.. _api_fluid_optimizer_MomentumOptimizer: + MomentumOptimizer ----------------- @@ -61,6 +86,8 @@ MomentumOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdagradOptimizer: + AdagradOptimizer ---------------- @@ -68,6 +95,8 @@ AdagradOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdamOptimizer: + AdamOptimizer ------------- @@ -75,6 +104,8 @@ AdamOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdamaxOptimizer: + AdamaxOptimizer --------------- @@ -82,6 +113,8 @@ AdamaxOptimizer :members: :noindex: +.. _api_fluid_optimizer_DecayedAdagradOptimizer: + DecayedAdagradOptimizer ----------------------- @@ -89,6 +122,8 @@ DecayedAdagradOptimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + RMSPropOptimizer ---------------- @@ -96,6 +131,17 @@ RMSPropOptimizer :members: :noindex: +.. _api_fluid_optimizer_FtrlOptimizer: + +FtrlOptimizer +------------- + +.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_Adadelta: + Adadelta -------- @@ -103,6 +149,8 @@ Adadelta :members: :noindex: +.. _api_fluid_optimizer_ModelAverage: + ModelAverage ------------ @@ -110,6 +158,8 @@ ModelAverage :members: :noindex: +.. _api_fluid_optimizer_Optimizer: + Optimizer --------- @@ -117,3 +167,12 @@ Optimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + +RMSPropOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer + :members: + :noindex: + diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst index 8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075..33035bbc7ca5c8d000adeaf1cb79806a3ea64604 100644 --- a/doc/fluid/api/param_attr.rst +++ b/doc/fluid/api/param_attr.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========== -param_attr -========== +================ +fluid.param_attr +================ + +.. _api_fluid_param_attr_ParamAttr: ParamAttr --------- @@ -12,6 +14,8 @@ ParamAttr :members: :noindex: +.. _api_fluid_param_attr_WeightNormParamAttr: + WeightNormParamAttr ------------------- diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst index 39fda65863471a78895503184848a754828b71a1..c750a2d588df56728ac7f73051ab7a9e44dee232 100644 --- a/doc/fluid/api/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -profiler -======== +============== +fluid.profiler +============== + +.. _api_fluid_profiler_cuda_profiler: cuda_profiler ------------- @@ -11,24 +13,32 @@ cuda_profiler .. autofunction:: paddle.fluid.profiler.cuda_profiler :noindex: +.. _api_fluid_profiler_reset_profiler: + reset_profiler -------------- .. autofunction:: paddle.fluid.profiler.reset_profiler :noindex: +.. _api_fluid_profiler_profiler: + profiler -------- .. autofunction:: paddle.fluid.profiler.profiler :noindex: +.. _api_fluid_profiler_start_profiler: + start_profiler -------------- .. autofunction:: paddle.fluid.profiler.start_profiler :noindex: +.. _api_fluid_profiler_stop_profiler: + stop_profiler ------------- diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717 --- /dev/null +++ b/doc/fluid/api/recordio_writer.rst @@ -0,0 +1,23 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===================== +fluid.recordio_writer +===================== + +.. _api_fluid_recordio_writer_convert_reader_to_recordio_file: + +convert_reader_to_recordio_file +------------------------------- + +.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file + :noindex: + +.. _api_fluid_recordio_writer_convert_reader_to_recordio_files: + +convert_reader_to_recordio_files +-------------------------------- + +.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files + :noindex: + diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst index 756bc53baa0625aef48dad0c35e7ae57421a70d0..987eaea903520d91c284c8da7a8cb066a1648069 100644 --- a/doc/fluid/api/regularizer.rst +++ b/doc/fluid/api/regularizer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -regularizer -=========== +================= +fluid.regularizer +================= + +.. _api_fluid_regularizer_append_regularization_ops: append_regularization_ops ------------------------- @@ -11,12 +13,7 @@ append_regularization_ops .. autofunction:: paddle.fluid.regularizer.append_regularization_ops :noindex: -WeightDecayRegularizer ----------------------- - -.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer - :members: - :noindex: +.. _api_fluid_regularizer_L1Decay: L1Decay ------- @@ -25,6 +22,8 @@ L1Decay :members: :noindex: +.. _api_fluid_regularizer_L2Decay: + L2Decay ------- @@ -32,6 +31,8 @@ L2Decay :members: :noindex: +.. _api_fluid_regularizer_L1DecayRegularizer: + L1DecayRegularizer ------------------ @@ -39,6 +40,8 @@ L1DecayRegularizer :members: :noindex: +.. _api_fluid_regularizer_L2DecayRegularizer: + L2DecayRegularizer ------------------ diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..943d39331d26c05764c90cb24f6774997c976bfe --- /dev/null +++ b/doc/fluid/api/transpiler.rst @@ -0,0 +1,50 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +================ +fluid.transpiler +================ + +.. _api_fluid_transpiler_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_transpiler_memory_optimize: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.transpiler.memory_optimize + :noindex: + +.. _api_fluid_transpiler_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.transpiler.release_memory + :noindex: + +.. _api_fluid_transpiler_HashName: + +HashName +-------- + +.. autoclass:: paddle.fluid.transpiler.HashName + :members: + :noindex: + +.. _api_fluid_transpiler_RoundRobin: + +RoundRobin +---------- + +.. autoclass:: paddle.fluid.transpiler.RoundRobin + :members: + :noindex: + diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778 --- /dev/null +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -0,0 +1,130 @@ +# Python Data Feeding + +In the former implementation of Paddle Fluid, there are two ways to feed data: + +- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. + +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. + +In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue. + + +## Design of LoDTensorBlockingQueue +`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`. + +```C++ +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + private: + // `LoDTensorBlockingQueue` can only be constructed by + // `LoDTensorBlockingQueueHolder::InitOnce()` + LoDTensorBlockingQueue(size_t capacity, const std::vector& dims); + + public: + size_t Size() const { return queue_.Size(); } // Get the current size of the queue + + size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue + + void Close() { return queue_.Close(); } + + bool IsClosed() const { return queue_.IsClosed(); } + + // Block if Size() == Cap() + // Return false only when queue_.IsClosed() == true + bool Push(const std::vector &lod_tensor_vec); + + // Block if Size() == 0. + // *Success == false when queue_.IsClosed() == true + std::vector Pop(bool *success = nullptr); + + private: + // Use reader::BlockingQueue as the inner data structure + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + // Call the constructor of `LoDTensorBlockingQueue` to create queue_ + // `InitOnce` can only called once, otherwise an exception would raise + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE(queue_ == nullptr); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + const std::shared_ptr& GetQueue() const { return queue_; } + + private: + std::shared_ptr queue_; +}; +``` + +There are some major things that must be concerned: +- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. +- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. +- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input. + + +## Release of the GIL in pybind +`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel. + + +## Design of PyReader +`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object. +```C++ +class PyReader : public ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue); + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override { return; } + + private: + std::shared_ptr queue_; +}; +``` + + +## Design of CreatePyReaderOp +`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable. +```C++ +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE(queue_holder_var != nullptr); + auto* queue_holder = queue_holder_var + ->template GetMutable(); + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; +``` + +## Design of Python codes +The design of Python codes are as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned. +```Python +def py_reader(capacity, shapes): + queue_name = unique_name.generate("lod_tensor_blocking_queue") + var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue + out = create_var() + create_py_reader_op_with_queue_name( + inputs={'blocking_queue': queue_name}, + outputs={'Out':[out]}) + return out, feed_queue +``` diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md index b50f18f21df0787b9761bf0935ed7f4384ff0f98..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549 100644 --- a/doc/fluid/dev/api_doc_std_cn.md +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -1,8 +1,9 @@ # API注释撰写标准 -- [API注释模块](#API注释模块) -- [格式及示例](#格式及示例) -- [完整示例](#完整示例) +- [API注释撰写标准](#api) + - [API注释模块](#api) + - [格式及示例](#) + - [完整示例](#) ## API注释模块 @@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ## 完整示例 -fc 的完整注释见[示例](src/fc.py)。 +fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md index e57072d52fd162e92a3482aef33f99ab9394c532..f175b219750d1c765a6a111c2ec3aa732fa46175 100644 --- a/doc/fluid/dev/api_doc_std_en.md +++ b/doc/fluid/dev/api_doc_std_en.md @@ -1,8 +1,9 @@ # API Doc Standard -- [API Doc Structure](#API Doc Structure) -- [Format and Examples](#Format and Examples) -- [Complete Example](#Complete Example) +- [API Doc Standard](#api-doc-standard) + - [API Doc Structure](#api-doc-structure) + - [Format and Examples](#format-and-examples) + - [Complete Example](#complete-example) ## API Doc Structure @@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f ## Complete Example -Complete Example of fc please see [here](src/fc.py)。 +Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md index b99b90056b0a2e51f2668a6d27d94857bdc09c37..55326940ce7c7dbaa5bf19f1950f470527ddf4f0 100644 --- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md +++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md @@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book 第二步,启动Parameter Server: ```bash -PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py +PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py ``` 执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 第三步,启动Trainer: ```bash -PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py +PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py ``` 由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md index 55ce63ec193948424cd0b87f13d56b9cf6154dfc..92859e8f622d0c155128821c54252113c5016989 100644 --- a/doc/fluid/howto/cluster/fluid_recordio.md +++ b/doc/fluid/howto/cluster/fluid_recordio.md @@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id): ret_list.append(f) return ret_list -trainers = int(os.getenv("TRAINERS")) -trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) +trainers = int(os.getenv("PADDLE_TRAINERS")) +trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) data_file = fluid.layers.io.open_files( filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), thread_num=1, diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst index c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f..84005b54e07cf810649370d2c1f6b6c522434bf6 100644 --- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst +++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst @@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz `_ cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ ====================== ======================================== 从源码编译 diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..9b55a66ded8b48f7105c05f1462839a72ab5f904 --- /dev/null +++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md @@ -0,0 +1,89 @@ +## 堆内存分析和优化 + +计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 + + +目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。 + +因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。 + +本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。 + +gperftool主要支持以下四个功能: + +- thread-caching malloc +- heap-checking using tcmalloc +- heap-profiling using tcmalloc +- CPU profiler + +Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。 + +对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 + +## 使用流程 +#### 环境 +本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 + +#### 使用流程 + +- 安装google-perftools + +``` +apt-get install libunwind-dev +apt-get install google-perftools +``` + +- 安装pprof + +``` +go get -u github.com/google/pprof +``` + +- 设置运行环境 + +``` +export PPROF_PATH=/root/gopath/bin/pprof +export PPROF_BINARY_PATH=/root/gopath/bin/pprof +export LD_PRELOAD=/usr/lib/libtcmalloc.so.4 +``` + +- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。 + +``` +# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀 +# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump,默认1GB +env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py +``` + +随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下: + +``` +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap +``` + +- 使用pprof对heap文件进行分析。分析有两种模式: + - 完整模式。会对当前heap做一个分析,显示目前分配内存一些调用路径。 + + ``` + pprof --pdf python test.log.0012.heap + ``` + 上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少,所以被忽略了,这对于分配内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。 + + ![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png) + + - Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。 + ``` + pprof --pdf --base test.log.0010.heap python test.log.1045.heap + ``` + 生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf) + + 从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。 + + ![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png) + ![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png) + diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md new file mode 100644 index 0000000000000000000000000000000000000000..6b80b014b1b1dc50f425e1296f70984c9e9b1cbd --- /dev/null +++ b/doc/survey/dynamic_graph.md @@ -0,0 +1,378 @@ +# Automatic Differentiation with the Tape + +## Automatic Differentiation + +A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf). + +## The Tape + +Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass: + +1. from the forward pass program itself, or +1. from the execution trace of the forward pass program, which is often known as the *tape*. + +This article surveys systems that follow the latter strategy. + +## Dynamic Network + +When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*. + +Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/). + +## An Overview + +Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf) + +Consider the following code feedforward model. + +```python +x = Variable(randn(20, 1))) +label = Variable(randint(1)) +W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20)) +h = matmul(W_1, x) +pred = matmul(W_2, x) +loss = softmax(pred, label) +loss.backward() +``` + +### 1) Dynet uses List to encode the Tape + +During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`. + +
+ +digraph g { + graph [ + rankdir = "LR" + ]; + node [ + fontsize = "16" + shape = "ellipse" + ]; + edge []; + "node0" [ + label = " type: matmul | input: W_1, x | output: h" + shape = "record" + ]; + "node1" [ + label = " type: matmul | input: W_2, h | output: pred" + shape = "record" + ]; + "node2" [ + label = " type: softmax | input: pred, label | output: loss" + shape = "record" + ]; + "node0":f0 -> "node1":f0 []; + "node1":f0 -> "node2":f0 []; +} +
+ +![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) + +### 2) Pytorch uses Node Graph to encode the Tape + +The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. + +
+ +digraph g { + graph [ + rankdir = "LR" + ]; + + subgraph function { + node [ + fontsize = "16" + style = filled + shape = "record" + ]; + "matmul0" [ label = " type: matmul | prev_func: None" ]; + "matmul1" [ label = " type: matmul | prev_func: matmul" ]; + "softmax" [ label = " type: softmax | prev_func: matmul" ]; + } + + subgraph variable { + node [ + fontsize = "16" + shape = "Mrecord" + style = filled + fillcolor = white + ]; + "x" [ label = " x | creator: None" ]; + "label" [ label = " label | creator: None" ]; + "W_1" [ label = " W_1 | creator: None" ]; + "W_2" [ label = " W_2 | creator: None" ]; + "h" [ label = " h | creator: None" ]; + "pred" [ label = " pred | creator: matmul" ]; + "loss" [ label = " loss | creator: softmax" ]; + } + + subgraph data_flow { + "x":f0 -> "matmul0":f0; + "W_1":f0 -> "matmul0":f0; + "matmul0":f0 -> "h":f0; + + "h":f0 -> "matmul1":f0; + "W_2":f0 -> "matmul1":f0; + "matmul1":f0 -> "pred":f0; + + "pred":f0 -> "softmax":f0; + "label":f0 -> "softmax":f0; + "softmax":f0 -> "loss":f0; + } + + subgraph prev_func { + edge [color="red", arrowsize="0.6", penwidth="1", constraint=false]; + "matmul1":f1 -> "matmul0":f0; + "softmax":f1 -> "matmul1":f0; + label = "prev_func"; + } +} +
+ +![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) + +Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix. + +## Design choices + +### 1) Dynet's List vs Pytorch's Node Graph + +What's good about List: +1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator. +1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping. + +What's good about Node Graph: +1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet. +```python +result = BigNet(data) +loss = SmallNet(data) +loss.backward() +``` + +### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation + +Dynet builds the list in a symbolic matter. Consider the following example +```python +for epoch in range(num_epochs): + for in_words, out_label in training_data: + dy.renew_cg() + W = dy.parameter(W_p) + b = dy.parameter(b_p) + score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b) + loss_sym = dy.pickneglogsoftmax(score_sym, out_label) + loss_val = loss_sym.value() + loss_sym.backward() +``` +The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion. + +Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`. + + +## What can fluid learn from them? + +Please refer to `paddle/contrib/dynamic/`. + +# Appendix + +### Overview + +| Framework | Has Tape | Core in C++ | First Release Date | +|-----------|----------|-------------|--------------------| +| Autograd | No | No | Mar 5, 2015 | +| Chainer | No | No | Jun 5, 2015 | +| Pytorch | No | Yes | Aug 31, 2016 | +| Dynet | Yes | Yes | Oct 12, 2016 | + +### Source Code +#### Autograd +[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed. +```python +# User API +def make_grad(fun, x): + start_node = VJPNode.new_root() + end_value, end_node = trace(start_node, fun, x) + return backward_pass(g, end_node), end_value + +# trace the forward pass by creating VJPNodes +def trace(start_node, fun, x): + with trace_stack.new_trace() as t: + start_box = new_box(x, t, start_node) + end_box = fun(start_box) + return end_box._value, end_box._node + +def backward_pass(g, end_node): + outgrads = {end_node : (g, False)} + for node in toposort(end_node): + outgrad = outgrads.pop(node) + ingrads = node.vjp(outgrad[0]) + for parent, ingrad in zip(node.parents, ingrads): + outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad) + return outgrad[0] + +# Every VJPNode corresponds to a op_grad +class VJPNode(Node): + __slots__ = ['parents', 'vjp'] + def __init__(self, value, fun, args, kwargs, parent_argnums, parents): + self.parents = parents + vjpmaker = primitive_vjps[fun] + self.vjp = vjpmaker(parent_argnums, value, args, kwargs) +``` +#### Chainer +Example Code +```python +# (1) Function Set definition, creates FunctionNode +model = FunctionSet( + l1=F.Linear(784, 100), + l2=F.Linear(100, 100), + l3=F.Linear(100, 10)).to_gpu() + +# (2) Optimizer Setup +opt = optimizers.SGD() +opt.setup(model) + +# (3) Forward computation +def forward(x, t): + h1 = F.relu(model.l1(x)) + h2 = F.relu(model.l2(h1)) + y = model.l3(h2) + return F.softmax_cross_entropy(y, t) + +# (4) Training loop +for epoch in xrange(n_epoch): + for i in xrange(0, N, b_size): + x = Variable(to_gpu(...)) + t = Variable(to_gpu(...)) + opt.zero_grads() + loss = forward(x, t) + loss.backward() + opt.update() +``` +In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`. +```python +class FunctionNode(object): + ... + def apply(self, inputs): + outputs = self.forward(inputs) + ret = tuple([variable.Variable(y, requires_grad=requires_grad) + for y in outputs]) + # Topological ordering + self.rank = max([x.rank for x in inputs]) if input_vars else 0 + # Add backward edges + for y in ret: + y.creator_node = self + self.inputs = tuple([x.node for x in input_vars]) + self.outputs = tuple([y.node for y in ret]) + + return ret +``` +`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order. +```python +class VariableNode(object): + ... + def backward(self, retain_grad, loss_scale): + if self.creator_node is None: + return + + cand_funcs = [] + seen_set = set() + grads = {} + + # Initialize error by 1, if this is a loss variable + if self.data.size == 1 and self._grad_var is None: + self.grad = numpy.ones_like(self.data) + grads[self._node] = self._grad_var + + def add_cand(cand): + if cand not in seen_set: + # Negate since heapq is min-heap. This is a global variable + heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) + seen_set.add(cand) + + add_cand(self.creator_node) + + while cand_funcs: + _, _, func = heapq.heappop(cand_funcs) + gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad) + + for x, gx in enumerate(gxs): + if x in grads: + grads[x] += gx + else: + grads[x] = gx + + if x.creator_node is not None: + add_cand(x.creator_node) +``` + +#### PyTorch +Example Code +```python +x = Variable(torch.ones(5, 5)) +y = Variable(torch.ones(5, 5) * 4) +z = x ** 2 + x * 2 + x * y + y +z.backward(torch.ones(5, 5)) +``` +The trace is done by `Variable.creator` and `Function.previous_functions`. +```python +class Variable(object): + def __init__(self, tensor, creator=None, requires_grad=True): + if creator is None: + creator = Leaf(self, requires_grad) + self.data = tensor + self.creator = creator + self._grad = None + + def backward(self, gradient=None): + if gradient is None: + if self.data.numel() != 1: + raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable') + gradient = self.data.new(1).fill_(1) + self._execution_engine.run_backward(self, gradient) + +class Function(obejct): + # ... + def _do_forward(self, *input): + unpacked_input = tuple(arg.data for arg in input) + raw_output = self.forward(*unpacked_input) + + # mark output.creator = self for backward trace + output = tuple(Variable(tensor, self) for tensor in raw_output) + + self.previous_functions = [(arg.creator, id(arg)) for arg in input] + self.output_ids = {id(var): i for i, var in enumerate(output)} + return output + + def _do_backward(self, grad_output): + return self.backwaerd(grad_output) +``` +The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd. + +#### DyNet +Example code +```python +model = dy.model() +W_p = model.add_parameters((20, 100)) +b_p = model.add_parameters(20) +E = model.add_lookup_parameters((20000, 50)) +for epoch in range(num_epochs): + for in_words, out_label in training_data: + dy.renew_cg() # init tape + W = dy.parameter(W_p) + b = dy.parameter(b_p) + score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b) + loss_sym = dy.pickneglogsoftmax(score_sym, out_label) + loss_val = loss_sym.value() + loss_sym.backward() +``` +[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traverse the tape in the reverse order. +```c++ +void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) { + ... + for (int i = num_nodes - 1; i >= 0; --i) { + // each node corresponds to an op + node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai); + } + ... +} +``` diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst index 9ac972fb193a2fb525edc507f7ba1303d2c8eabe..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f 100644 --- a/doc/v2/api/config/evaluators.rst +++ b/doc/v2/api/config/evaluators.rst @@ -101,7 +101,7 @@ value_printer :noindex: Detection -===== +========== detection_map ------------- diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst index 1a6496968cae1fef88142ba9ca3f9e63a81b196d..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 100644 --- a/doc/v2/api/config/layer.rst +++ b/doc/v2/api/config/layer.rst @@ -11,7 +11,7 @@ Data layer data ---- -.. autoclass:: paddle.v2.layer.data +.. autofunction:: paddle.v2.layer.data :noindex: Fully Connected Layers @@ -21,12 +21,12 @@ Fully Connected Layers fc -- -.. autoclass:: paddle.v2.layer.fc +.. autofunction:: paddle.v2.layer.fc :noindex: selective_fc ------------ -.. autoclass:: paddle.v2.layer.selective_fc +.. autofunction:: paddle.v2.layer.selective_fc :noindex: Conv Layers @@ -34,34 +34,34 @@ Conv Layers conv_operator ------------- -.. autoclass:: paddle.v2.layer.conv_operator +.. autofunction:: paddle.v2.layer.conv_operator :noindex: conv_projection --------------- -.. autoclass:: paddle.v2.layer.conv_projection +.. autofunction:: paddle.v2.layer.conv_projection :noindex: conv_shift ---------- -.. autoclass:: paddle.v2.layer.conv_shift +.. autofunction:: paddle.v2.layer.conv_shift :noindex: img_conv -------- -.. autoclass:: paddle.v2.layer.img_conv +.. autofunction:: paddle.v2.layer.img_conv :noindex: .. _api_v2.layer_context_projection: context_projection ------------------ -.. autoclass:: paddle.v2.layer.context_projection +.. autofunction:: paddle.v2.layer.context_projection :noindex: row_conv -------- -.. autoclass:: paddle.v2.layer.row_conv +.. autofunction:: paddle.v2.layer.row_conv :noindex: Image Pooling Layer @@ -69,27 +69,27 @@ Image Pooling Layer img_pool -------- -.. autoclass:: paddle.v2.layer.img_pool +.. autofunction:: paddle.v2.layer.img_pool :noindex: spp --- -.. autoclass:: paddle.v2.layer.spp +.. autofunction:: paddle.v2.layer.spp :noindex: maxout ------ -.. autoclass:: paddle.v2.layer.maxout +.. autofunction:: paddle.v2.layer.maxout :noindex: roi_pool -------- -.. autoclass:: paddle.v2.layer.roi_pool +.. autofunction:: paddle.v2.layer.roi_pool :noindex: pad ---- -.. autoclass:: paddle.v2.layer.pad +.. autofunction:: paddle.v2.layer.pad :noindex: Norm Layer @@ -97,27 +97,27 @@ Norm Layer img_cmrnorm ----------- -.. autoclass:: paddle.v2.layer.img_cmrnorm +.. autofunction:: paddle.v2.layer.img_cmrnorm :noindex: batch_norm ---------- -.. autoclass:: paddle.v2.layer.batch_norm +.. autofunction:: paddle.v2.layer.batch_norm :noindex: sum_to_one_norm --------------- -.. autoclass:: paddle.v2.layer.sum_to_one_norm +.. autofunction:: paddle.v2.layer.sum_to_one_norm :noindex: cross_channel_norm ------------------ -.. autoclass:: paddle.v2.layer.cross_channel_norm +.. autofunction:: paddle.v2.layer.cross_channel_norm :noindex: row_l2_norm ----------- -.. autoclass:: paddle.v2.layer.row_l2_norm +.. autofunction:: paddle.v2.layer.row_l2_norm :noindex: Recurrent Layers @@ -125,22 +125,22 @@ Recurrent Layers recurrent --------- -.. autoclass:: paddle.v2.layer.recurrent +.. autofunction:: paddle.v2.layer.recurrent :noindex: lstmemory --------- -.. autoclass:: paddle.v2.layer.lstmemory +.. autofunction:: paddle.v2.layer.lstmemory :noindex: grumemory --------- -.. autoclass:: paddle.v2.layer.grumemory +.. autofunction:: paddle.v2.layer.grumemory :noindex: gated_unit ----------- -.. autoclass:: paddle.v2.layer.gated_unit +.. autofunction:: paddle.v2.layer.gated_unit :noindex: Recurrent Layer Group @@ -148,32 +148,32 @@ Recurrent Layer Group memory ------ -.. autoclass:: paddle.v2.layer.memory +.. autofunction:: paddle.v2.layer.memory :noindex: recurrent_group --------------- -.. autoclass:: paddle.v2.layer.recurrent_group +.. autofunction:: paddle.v2.layer.recurrent_group :noindex: lstm_step --------- -.. autoclass:: paddle.v2.layer.lstm_step +.. autofunction:: paddle.v2.layer.lstm_step :noindex: gru_step -------- -.. autoclass:: paddle.v2.layer.gru_step +.. autofunction:: paddle.v2.layer.gru_step :noindex: beam_search ------------ -.. autoclass:: paddle.v2.layer.beam_search +.. autofunction:: paddle.v2.layer.beam_search :noindex: get_output ---------- -.. autoclass:: paddle.v2.layer.get_output +.. autofunction:: paddle.v2.layer.get_output :noindex: Mixed Layer @@ -183,54 +183,54 @@ Mixed Layer mixed ----- -.. autoclass:: paddle.v2.layer.mixed +.. autofunction:: paddle.v2.layer.mixed :noindex: .. _api_v2.layer_embedding: embedding --------- -.. autoclass:: paddle.v2.layer.embedding +.. autofunction:: paddle.v2.layer.embedding :noindex: scaling_projection ------------------ -.. autoclass:: paddle.v2.layer.scaling_projection +.. autofunction:: paddle.v2.layer.scaling_projection :noindex: dotmul_projection ----------------- -.. autoclass:: paddle.v2.layer.dotmul_projection +.. autofunction:: paddle.v2.layer.dotmul_projection :noindex: dotmul_operator --------------- -.. autoclass:: paddle.v2.layer.dotmul_operator +.. autofunction:: paddle.v2.layer.dotmul_operator :noindex: full_matrix_projection ---------------------- -.. autoclass:: paddle.v2.layer.full_matrix_projection +.. autofunction:: paddle.v2.layer.full_matrix_projection :noindex: identity_projection ------------------- -.. autoclass:: paddle.v2.layer.identity_projection +.. autofunction:: paddle.v2.layer.identity_projection :noindex: slice_projection ------------------- -.. autoclass:: paddle.v2.layer.slice_projection +.. autofunction:: paddle.v2.layer.slice_projection :noindex: table_projection ---------------- -.. autoclass:: paddle.v2.layer.table_projection +.. autofunction:: paddle.v2.layer.table_projection :noindex: trans_full_matrix_projection ---------------------------- -.. autoclass:: paddle.v2.layer.trans_full_matrix_projection +.. autofunction:: paddle.v2.layer.trans_full_matrix_projection :noindex: Aggregate Layers @@ -245,51 +245,46 @@ AggregateLevel pooling ------- -.. autoclass:: paddle.v2.layer.pooling +.. autofunction:: paddle.v2.layer.pooling :noindex: .. _api_v2.layer_last_seq: last_seq -------- -.. autoclass:: paddle.v2.layer.last_seq +.. autofunction:: paddle.v2.layer.last_seq :noindex: .. _api_v2.layer_first_seq: first_seq --------- -.. autoclass:: paddle.v2.layer.first_seq +.. autofunction:: paddle.v2.layer.first_seq :noindex: sub_seq --------- -.. autoclass:: paddle.v2.layer.sub_seq +.. autofunction:: paddle.v2.layer.sub_seq :noindex: concat ------ -.. autoclass:: paddle.v2.layer.concat +.. autofunction:: paddle.v2.layer.concat :noindex: seq_concat ---------- -.. autoclass:: paddle.v2.layer.seq_concat +.. autofunction:: paddle.v2.layer.seq_concat :noindex: seq_slice --------- -.. autoclass:: paddle.v2.layer.seq_slice - :noindex: - -kmax_sequence_score -------------------- -.. autoclass:: paddle.v2.layer.kmax_sequence_score +.. autofunction:: paddle.v2.layer.seq_slice :noindex: sub_nested_seq -------------- -.. autoclass:: paddle.v2.layer.sub_nested_seq +.. autofunction:: paddle.v2.layer.sub_nested_seq :noindex: Reshaping Layers @@ -297,7 +292,7 @@ Reshaping Layers block_expand ------------ -.. autoclass:: paddle.v2.layer.block_expand +.. autofunction:: paddle.v2.layer.block_expand :noindex: .. _api_v2.layer_expand: @@ -309,22 +304,22 @@ ExpandLevel expand ------ -.. autoclass:: paddle.v2.layer.expand +.. autofunction:: paddle.v2.layer.expand :noindex: repeat ------ -.. autoclass:: paddle.v2.layer.repeat +.. autofunction:: paddle.v2.layer.repeat :noindex: rotate ------ -.. autoclass:: paddle.v2.layer.rotate +.. autofunction:: paddle.v2.layer.rotate :noindex: seq_reshape ----------- -.. autoclass:: paddle.v2.layer.seq_reshape +.. autofunction:: paddle.v2.layer.seq_reshape :noindex: Math Layers @@ -332,94 +327,94 @@ Math Layers addto ----- -.. autoclass:: paddle.v2.layer.addto +.. autofunction:: paddle.v2.layer.addto :noindex: linear_comb ----------- -.. autoclass:: paddle.v2.layer.linear_comb +.. autofunction:: paddle.v2.layer.linear_comb :noindex: interpolation ------------- -.. autoclass:: paddle.v2.layer.interpolation +.. autofunction:: paddle.v2.layer.interpolation :noindex: bilinear_interp --------------- -.. autoclass:: paddle.v2.layer.bilinear_interp +.. autofunction:: paddle.v2.layer.bilinear_interp :noindex: dropout -------- -.. autoclass:: paddle.v2.layer.dropout +.. autofunction:: paddle.v2.layer.dropout :noindex: dot_prod --------- -.. autoclass:: paddle.v2.layer.dot_prod +.. autofunction:: paddle.v2.layer.dot_prod :noindex: out_prod -------- -.. autoclass:: paddle.v2.layer.out_prod +.. autofunction:: paddle.v2.layer.out_prod :noindex: power ----- -.. autoclass:: paddle.v2.layer.power +.. autofunction:: paddle.v2.layer.power :noindex: scaling ------- -.. autoclass:: paddle.v2.layer.scaling +.. autofunction:: paddle.v2.layer.scaling :noindex: clip ---- -.. autoclass:: paddle.v2.layer.clip +.. autofunction:: paddle.v2.layer.clip :noindex: resize ------ -.. autoclass:: paddle.v2.layer.resize +.. autofunction:: paddle.v2.layer.resize :noindex: slope_intercept --------------- -.. autoclass:: paddle.v2.layer.slope_intercept +.. autofunction:: paddle.v2.layer.slope_intercept :noindex: tensor ------ -.. autoclass:: paddle.v2.layer.tensor +.. autofunction:: paddle.v2.layer.tensor :noindex: .. _api_v2.layer_cos_sim: cos_sim ------- -.. autoclass:: paddle.v2.layer.cos_sim +.. autofunction:: paddle.v2.layer.cos_sim :noindex: l2_distance ----------- -.. autoclass:: paddle.v2.layer.l2_distance +.. autofunction:: paddle.v2.layer.l2_distance :noindex: trans ----- -.. autoclass:: paddle.v2.layer.trans +.. autofunction:: paddle.v2.layer.trans :noindex: scale_shift ----------- -.. autoclass:: paddle.v2.layer.scale_shift +.. autofunction:: paddle.v2.layer.scale_shift :noindex: factorization_machine --------------------- -.. autoclass:: paddle.v2.layer.factorization_machine +.. autofunction:: paddle.v2.layer.factorization_machine :noindex: Sampling Layers @@ -427,17 +422,17 @@ Sampling Layers maxid ----- -.. autoclass:: paddle.v2.layer.max_id +.. autofunction:: paddle.v2.layer.max_id :noindex: sampling_id ----------- -.. autoclass:: paddle.v2.layer.sampling_id +.. autofunction:: paddle.v2.layer.sampling_id :noindex: multiplex --------- -.. autoclass:: paddle.v2.layer.multiplex +.. autofunction:: paddle.v2.layer.multiplex :noindex: .. _api_v2.layer_costs: @@ -447,97 +442,97 @@ Cost Layers cross_entropy_cost ------------------ -.. autoclass:: paddle.v2.layer.cross_entropy_cost +.. autofunction:: paddle.v2.layer.cross_entropy_cost :noindex: cross_entropy_with_selfnorm_cost -------------------------------- -.. autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost +.. autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost :noindex: multi_binary_label_cross_entropy_cost ------------------------------------- -.. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost +.. autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: classification_cost ------------------- -.. autoclass:: paddle.v2.layer.classification_cost +.. autofunction:: paddle.v2.layer.classification_cost :noindex: huber_regression_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_regression_cost +.. autofunction:: paddle.v2.layer.huber_regression_cost :noindex: huber_classification_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_classification_cost +.. autofunction:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost ----------- -.. autoclass:: paddle.v2.layer.lambda_cost +.. autofunction:: paddle.v2.layer.lambda_cost :noindex: square_error_cost ----------------- -.. autoclass:: paddle.v2.layer.square_error_cost +.. autofunction:: paddle.v2.layer.square_error_cost :noindex: rank_cost --------- -.. autoclass:: paddle.v2.layer.rank_cost +.. autofunction:: paddle.v2.layer.rank_cost :noindex: sum_cost --------- -.. autoclass:: paddle.v2.layer.sum_cost +.. autofunction:: paddle.v2.layer.sum_cost :noindex: crf --- -.. autoclass:: paddle.v2.layer.crf +.. autofunction:: paddle.v2.layer.crf :noindex: crf_decoding ------------ -.. autoclass:: paddle.v2.layer.crf_decoding +.. autofunction:: paddle.v2.layer.crf_decoding :noindex: ctc --- -.. autoclass:: paddle.v2.layer.ctc +.. autofunction:: paddle.v2.layer.ctc :noindex: warp_ctc -------- -.. autoclass:: paddle.v2.layer.warp_ctc +.. autofunction:: paddle.v2.layer.warp_ctc :noindex: nce --- -.. autoclass:: paddle.v2.layer.nce +.. autofunction:: paddle.v2.layer.nce :noindex: hsigmoid --------- -.. autoclass:: paddle.v2.layer.hsigmoid +.. autofunction:: paddle.v2.layer.hsigmoid :noindex: smooth_l1_cost -------------- -.. autoclass:: paddle.v2.layer.smooth_l1_cost +.. autofunction:: paddle.v2.layer.smooth_l1_cost :noindex: multibox_loss -------------- -.. autoclass:: paddle.v2.layer.multibox_loss +.. autofunction:: paddle.v2.layer.multibox_loss :noindex: detection_output ---------------- -.. autoclass:: paddle.v2.layer.detection_output +.. autofunction:: paddle.v2.layer.detection_output :noindex: Check Layer @@ -545,7 +540,7 @@ Check Layer eos --- -.. autoclass:: paddle.v2.layer.eos +.. autofunction:: paddle.v2.layer.eos :noindex: Activation @@ -553,5 +548,5 @@ Activation prelu -------- -.. autoclass:: paddle.v2.layer.prelu +.. autofunction:: paddle.v2.layer.prelu :noindex: diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst index b11cd449affd1dcd9d3f42492961469331350942..70c5c524aaf0a9ae003bf4340c3f268c225d4419 100644 --- a/doc/v2/api/index_en.rst +++ b/doc/v2/api/index_en.rst @@ -8,4 +8,3 @@ API model_configs.rst data.rst run_logic.rst - fluid/index.rst diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index de7e9eb75c3a053179f2d03ac887955bb4e0a6d2..6421c5308271c2508597d849c79709255caf349a 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 学习 Docker 有多难? - 理解 Docker 并不难,大概花十分钟看一下 `这篇文章 `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - 我可以用 IDE 吗? @@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 可以并行编译吗? - 是的。我们的 Docker image 运行一个 `Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - Docker 需要 sudo @@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 在 Windows/MacOS 上编译很慢 - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue `_ 。 + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 - 磁盘不够 - 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 `_ 来清理这些内容。 + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 .. _compile_deps: @@ -195,7 +195,7 @@ BLAS PaddlePaddle支持 `MKL `_ 和 `OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, -还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 +还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 如果关闭MKL,则会使用OpenBLAS作为BLAS库。 diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 853bdb21bbcf07ae1742d2196dbcfe4668828b7b..095da19cd41d29bfa72ab23abd24bec45f925a86 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版 "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index fecf6d3712feac3265100a6121901ba784f7d5cc..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md index add06e42f1bbd221b48eb83e4e84d4a7c89e7483..3244eedf918b93f9351258f1218dfb2d507c1a9c 100644 --- a/doc/v2/dev/contribute_to_paddle_cn.md +++ b/doc/v2/dev/contribute_to_paddle_cn.md @@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a") ➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" ``` -关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。 +关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。 ## 提交(commit) diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst index f292684fb5fe2df06db5239e7f43fdfa1dd2f2bd..0d644777287aea0a572adb6fa40f498f9c147af7 100644 --- a/doc/v2/faq/build_and_install/index_cn.rst +++ b/doc/v2/faq/build_and_install/index_cn.rst @@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包,可以用pip进行安装: 保存并关闭文件。 这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。 + +10. 通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` +------------------------------------------------------------------------------------------ +出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, +但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` +拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, +即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 + +**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 \ No newline at end of file diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 1e3bb7bf16f969255dba6f6ec7a6a70bbb1e07ee..0f56d648b1939e1d6af3368bb2423477a3b638fc 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -17,42 +17,9 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) -set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files") -set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library") - set(inference_deps paddle_inference_api paddle_fluid_api) -# if anakin is set enable anakin api implementation -if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) - set(ANAKIN_FOUND ON) -else() - set(ANAKIN_FOUND OFF) -endif() - -if (ANAKIN_FOUND) - # Anakin's code style doesn't follow google c style. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment - -Wno-error=reorder - -Wno-error=format - -Wno-error=switch - -Wno-error=return-type - -Wno-error=non-virtual-dtor - -Wno-error=cpp") - - message(STATUS "Anakin for inference is enabled") - message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - include_directories("${ANAKIN_INCLUDE}") - # Anakin's source path is a mass, need to set sub-directories trivially. - include_directories("${ANAKIN_INCLUDE}/saber") - link_directories("${ANAKIN_LIBRARY}") - - nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc) - target_link_libraries(inference_anakin_api anakin) - list(APPEND inference_deps inference_anakin_api) -endif() - - function(inference_api_test TARGET_NAME) if (WITH_TESTING) set(options "") @@ -83,9 +50,17 @@ cc_test(test_paddle_inference_api inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) -if (ANAKIN_FOUND) - nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc - DEPS ${inference_deps} protobuf) +if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI + # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, + # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to + # compile the libinference_anakin_api.a and compile with anakin.so. + nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_link_libraries(inference_anakin_api anakin anakin_saber_common) + cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api) + target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif() if(WITH_TESTING) diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt index 7b0fa77ad13c19f177e5b2446bcda6551471e45f..566c7d1a0784f12aaeb9398f6d911ffa2b69e8b2 100644 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ b/paddle/contrib/inference/demo/CMakeLists.txt @@ -14,3 +14,43 @@ # inference_api_test(simple_on_word2vec ARGS test_word2vec) + +set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") +set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) + +function(inference_download_test_demo TARGET) + if (NOT WITH_TESTING) + return() + endif() + set(options "") + set(oneValueArgs URL) + set(multiValueArgs SRCS) + cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}") + message(STATUS "inference demo ${test_dir}") + + if(NOT EXISTS "${test_dir}") + message(STATUS "Download ${TARGET} model from ${tests_URL}") + execute_process(COMMAND bash -c "mkdir -p ${test_dir}") + execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}") + execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz") + endif() + + cc_test(${TARGET} SRCS "${tests_SRCS}" + DEPS paddle_inference_api paddle_fluid + ARGS --data=${test_dir}/data.txt + --modeldir=${test_dir}/model + --refer=${test_dir}/result.txt) +endfunction() + +# disable mobilenet test +#inference_download_test_demo(mobilenet_inference_demo +# SRCS vis_demo.cc +# URL ${URL_ROOT}mobilenet.tar.gz) +inference_download_test_demo(se_resnext50_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}se_resnext50.tar.gz) +inference_download_test_demo(ocr_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}ocr.tar.gz) diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f1d256660299a68dc5d9d73dbe4a401a0e7d9680 --- /dev/null +++ b/paddle/contrib/inference/demo/README.md @@ -0,0 +1,36 @@ +# Infernce Demos + +Input data format: + +- Each line contains a single record +- Each record's format is + +``` +\t +``` + +Follow the C++ codes in `vis_demo.cc`. + +## MobileNet + +To execute the demo, simply run + +```sh +./mobilenet_inference_demo --modeldir --data +``` + +## SE-ResNeXt-50 + +To execute the demo, simply run + +```sh +./se_resnext50_inference_demo --modeldir --data +``` + +## OCR + +To execute the demo, simply run + +```sh +./ocr_inference_demo --modeldir --data +``` diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index ee865f37900fc84b87a2d050686a90b607f2c3d5..c253014642f39a042430992548a285cc7078a959 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/contrib/inference/paddle_inference_api.h" namespace paddle { @@ -40,10 +41,9 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::INT64}; // For simplicity, we set all the slots with the same data. @@ -55,17 +55,71 @@ void Main(bool use_gpu) { //# 4. Get output. ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length; - const size_t num_elements = outputs.front().data.length / sizeof(float); + LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data)[i]; + LOG(INFO) << static_cast(outputs.front().data.data())[i]; } } } +void MainThreads(int num_threads, bool use_gpu) { + // Multi-threads only support on CPU + // 0. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto main_predictor = + CreatePaddlePredictor(config); + + std::vector threads; + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // 1. clone a predictor which shares the same parameters + auto predictor = main_predictor->Clone(); + constexpr int num_batches = 3; + for (int batch_id = 0; batch_id < num_batches; ++batch_id) { + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; + std::vector inputs(4, tensor); + std::vector outputs; + // 3. Run + CHECK(predictor->Run(inputs, &outputs)); + + // 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "TID: " << tid << ", " + << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = + outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data())[i]; + } + } + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } + +#ifdef PADDLE_WITH_CUDA TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } +#endif } // namespace demo } // namespace paddle diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/contrib/inference/demo/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1 --- /dev/null +++ b/paddle/contrib/inference/demo/utils.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { +namespace demo { + +static void split(const std::string& str, + char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +/* + * Get a summary of a PaddleTensor content. + */ +static std::string SummaryTensor(const PaddleTensor& tensor) { + std::stringstream ss; + int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype); + + ss << "data[:10]\t"; + switch (tensor.dtype) { + case PaddleDType::INT64: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + case PaddleDType::FLOAT32: + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + return ss.str(); +} + +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/contrib/inference/demo/vis_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..45575f9a862de430236ae20cf498e542a45b1f4b --- /dev/null +++ b/paddle/contrib/inference/demo/vis_demo.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo for mobilenet, se-resnext50 and ocr. + */ + +#include +#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. +#include +#include +#include +#include "paddle/contrib/inference/demo/utils.h" +#include "paddle/contrib/inference/paddle_inference_api.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif + +namespace paddle { +namespace demo { + +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, + "", + "path of data; each line is a record, format is " + "'\t data; + std::vector shape; +}; + +void split(const std::string& str, char sep, std::vector* pieces); + +Record ProcessALine(const std::string& line) { + LOG(INFO) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + LOG(INFO) << "data size " << record.data.size(); + LOG(INFO) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + LOG(INFO) << "predictor output numel " << numel; + LOG(INFO) << "reference output numel " << refer.data.size(); + EXPECT_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + EXPECT_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + EXPECT_NEAR( + static_cast(output.data.data())[i], refer.data[i], 1e-5); + } + break; + } +} + +/* + * Use the native fluid engine to inference the demo. + */ +void Main(bool use_gpu) { + NativeConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = use_gpu; + config.device = 0; +#ifdef PADDLE_WITH_CUDA + config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; +#endif + + LOG(INFO) << "init predictor"; + auto predictor = + CreatePaddlePredictor(config); + + LOG(INFO) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input{ + .name = "xx", + .shape = record.shape, + .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), + .dtype = PaddleDType::FLOAT32}; + + LOG(INFO) << "run executor"; + std::vector output; + predictor->Run({input}, &output); + + LOG(INFO) << "output.size " << output.size(); + auto& tensor = output.front(); + LOG(INFO) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); } +#ifdef PADDLE_WITH_CUDA +TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); } +#endif +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/contrib/inference/high_level_api.md new file mode 100644 index 0000000000000000000000000000000000000000..eb92885052a453d8c837bbf6f6e984efb509332a --- /dev/null +++ b/paddle/contrib/inference/high_level_api.md @@ -0,0 +1,60 @@ +# Inference High-level APIs +This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly. + +The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment. + +## PaddleTensor +We provide the `PaddleTensor` data structure to give a general tensor interface. + +The definition is + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type. +The `name` field is used to specify the name of an input variable, +that is important when there are multiple inputs and need to distinguish which variable to set. + +## engine +The inference APIs has two different underlying engines + +- the native engine, which is consists of the native operators and framework, +- the Anakin engine, which has an Anakin library embedded. + +The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, +the Anakin engine is faster for some model, +but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported. + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. +}; +``` + +## PaddlePredictor and how to create one +The main interface is `PaddlePredictor,` there are following methods + +- `bool Run(const std::vector& inputs, std::vector* output_data)` + - take inputs and output `output_data.` +- `Clone` to clone a predictor from an existing one, with model parameter shared. + +There is a factory method to help create a predictor, and the user takes the ownership of this object. + +```c++ +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); +``` + +By specifying the engine kind and config, one can get a specific implementation. + +## Reference + +- [paddle_inference_api.h](./paddle_inference_api.h) +- [some demos](./demo) diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc index d67e1e7667800d6dd00cb8915b0d6dc7c664970b..ea46b3006f8d0964cc8229d3683ee7b602d6ef0d 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ b/paddle/contrib/inference/paddle_inference_api.cc @@ -13,3 +13,66 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { + +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + default: + // + assert(false); + return -1; + } +} + +PaddleBuf::PaddleBuf(PaddleBuf&& other) + : data_(other.data_), + length_(other.length_), + memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } + +PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + // only the buffer with external memory can be copied + assert(!other.memory_owned_); + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + return *this; +} + +void PaddleBuf::Resize(size_t length) { + // Only the owned memory can be reset, the external memory can't be changed. + if (length_ == length) return; + assert(memory_owned_); + Free(); + data_ = new char[length]; + length_ = length; + memory_owned_ = true; +} + +void PaddleBuf::Reset(void* data, size_t length) { + Free(); + memory_owned_ = false; + data_ = data; + length_ = length; +} + +void PaddleBuf::Free() { + if (memory_owned_ && data_) { + assert(length_ > 0); + delete static_cast(data_); + data_ = nullptr; + length_ = 0; + } +} + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index b5cd0d603f1391427bec392f9dcb33c99eef36b7..238d8c772ec875948701a1d8381e051ebeb7c2f5 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -15,12 +15,13 @@ limitations under the License. */ /* * This file contains the definition of a simple Inference API for Paddle. * - * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we * might release another API. */ #pragma once +#include #include #include #include @@ -32,12 +33,38 @@ enum PaddleDType { INT64, }; -struct PaddleBuf { - void* data; // pointer to the data memory. - size_t length; // number of memory bytes. +class PaddleBuf { + public: + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + PaddleBuf& operator=(const PaddleBuf&); + // Do not own the memory. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Own memory. + PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Resize to `length` bytes. + void Resize(size_t length); + // Reset to external memory. + void Reset(void* data, size_t length); + bool empty() const { return length_ == 0; } + void* data() const { return data_; } + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; }; struct PaddleTensor { + PaddleTensor() = default; std::string name; // variable name. std::vector shape; // TODO(Superjomn) for LoD support, add a vector> field if needed. @@ -63,11 +90,13 @@ class PaddlePredictor { struct Config; PaddlePredictor() = default; PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; // Predict an record. // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be alive until Run returns. caller should be - // responsible for releasing the memory of `output_data`. + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. virtual bool Run(const std::vector& inputs, std::vector* output_data) = 0; @@ -76,12 +105,11 @@ class PaddlePredictor { virtual std::unique_ptr Clone() = 0; // Destroy the Predictor. - virtual ~PaddlePredictor() {} + virtual ~PaddlePredictor() = default; // The common configs for all the predictors. struct Config { - std::string model_dir; // path to the model directory. - bool enable_engine{false}; // Enable to execute (part of) the model on + std::string model_dir; // path to the model directory. }; }; @@ -113,4 +141,6 @@ struct AnakinConfig : public PaddlePredictor::Config { template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); +int PaddleDtypeSize(PaddleDType dtype); + } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc index 865d7ac10db55ce9565f4b1a35defa2a3d1d40ef..ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h" +#include namespace paddle { @@ -24,8 +23,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( } bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { - // TODO(Superjomn) Tell anakin to support return code. - engine_.Build(config.model_file, config.max_batch_size); + if (!(graph_.load(config.model_file))) { + return false; + } + graph_.ResetBatchSize("input_0", config.max_batch_size); + // optimization for graph + if (!(graph_.Optimize())) { + return false; + } + // construct executer + executor_.init(graph_); return true; } @@ -38,24 +45,33 @@ bool PaddleInferenceAnakinPredictor::Run( << "'s type is not float"; return false; } - engine_.SetInputFromCPU( - input.name, static_cast(input.data.data), input.data.length); + auto d_tensor_in_p = executor_.get_in(input.name); + float *d_data_p = d_tensor_in_p->mutable_data(); + if (cudaMemcpy(d_data_p, + static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + LOG(ERROR) << "copy data from CPU to GPU error"; + return false; + } } - // TODO(Superjomn) Tell anakin to support return code. - engine_.Execute(); + executor_.prediction(); if (output_data->empty()) { LOG(ERROR) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { - auto *tensor = engine_.GetOutputInGPU(output.name); + auto *tensor = executor_.get_out(output.name); output.shape = tensor->shape(); + if (output.data.length() < tensor->valid_size() * sizeof(float)) { + output.data.Resize(tensor->valid_size() * sizeof(float)); + } // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data, - tensor->data(), - tensor->size(), + if (cudaMemcpy(output.data.data(), + tensor->mutable_data(), + tensor->valid_size() * sizeof(float), cudaMemcpyDeviceToHost) != 0) { LOG(ERROR) << "copy data from GPU to CPU error"; return false; @@ -64,9 +80,26 @@ bool PaddleInferenceAnakinPredictor::Run( return true; } -// TODO(Superjomn) To implement latter. +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return executor_; +} + +// the cloned new Predictor of anakin share the same net weights from original +// Predictor std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { - return nullptr; + VLOG(3) << "Anakin Predictor::clone"; + std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + // construct executer from other graph + auto anakin_predictor_p = + dynamic_cast(cls.get()); + if (!anakin_predictor_p) { + LOG(ERROR) << "fail to call Init"; + return nullptr; + } + anakin_predictor_p->get_executer().init(graph_); + + return std::move(cls); } // A factory to help create difference predictor. @@ -74,6 +107,7 @@ template <> std::unique_ptr CreatePaddlePredictor( const AnakinConfig &config) { + VLOG(3) << "Anakin Predictor create."; std::unique_ptr x( new PaddleInferenceAnakinPredictor(config)); return x; diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h index fe9f562e9d1d40c30585bcb68fa51e445bedb4aa..212ba41cdf8ff2feccb6b6498f9679d76a2efe7c 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h @@ -19,33 +19,42 @@ limitations under the License. */ #pragma once -// NOTE This header file do not have namespace. -// TODO(Superjomn) Tell Anakin to provide better APIs. -#include #include "paddle/contrib/inference/paddle_inference_api.h" +// from anakin +#include "framework/core/net/net.h" +#include "saber/saber_types.h" + namespace paddle { class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: + PaddleInferenceAnakinPredictor() {} + PaddleInferenceAnakinPredictor(const AnakinConfig& config); // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. - // TODO(Superjomn) should unify all the behaviors of output_data accross all - // the engines. bool Run(const std::vector& inputs, std::vector* output_data) override; std::unique_ptr Clone() override; + anakin::Net& + get_executer(); + + ~PaddleInferenceAnakinPredictor() override{}; + private: bool Init(const AnakinConfig& config); - anakin::AnakinEngine - engine_; + graph_; + anakin::Net + executor_; + AnakinConfig config_; }; } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc index 43324bc67cba16c36d9dbcb58ccde1c57293085e..f92e9d4190412f5847e353ef1dc0324cad668c9a 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc @@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/contrib/inference/paddle_inference_api.h" +#include +#include #include +#include "paddle/contrib/inference/paddle_inference_api.h" + +DEFINE_string(model, "", "Directory of the inference model."); + namespace paddle { -TEST(inference, anakin) { +AnakinConfig GetConfig() { AnakinConfig config; + config.model_file = FLAGS_model; + config.device = 0; + config.max_batch_size = 1; + return config; +} - auto engine = +TEST(inference, anakin) { + AnakinConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor(config); + + float data[1 * 3 * 224 * 224] = {1.0f}; + + PaddleTensor tensor{.name = "input_0", + .shape = std::vector({1, 3, 224, 224}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::FLOAT32}; + + // For simplicity, we set all the slots with the same data. + std::vector paddle_tensor_feeds; + paddle_tensor_feeds.emplace_back(std::move(tensor)); + + PaddleTensor tensor_out{.name = "prob_out", + .shape = std::vector({1000, 1}), + .data = PaddleBuf(), + .dtype = PaddleDType::FLOAT32}; + + std::vector outputs; + outputs.emplace_back(std::move(tensor_out)); + + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); + + float* data_o = static_cast(outputs[0].data.data()); + for (size_t j = 0; j < 1000; ++j) { + LOG(INFO) << "output[" << j << "]: " << data_o[j]; + } } } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index b52a43a463de702ef822f50a1cb7348ae5710c2b..d9129a704bc289ce1d416474537fc9234a07e5b8 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -54,7 +54,8 @@ std::string num2str(T a) { } } // namespace -bool NativePaddlePredictor::Init() { +bool NativePaddlePredictor::Init( + std::shared_ptr parent_scope) { VLOG(3) << "Predictor::init()"; if (config_.use_gpu) { @@ -62,9 +63,15 @@ bool NativePaddlePredictor::Init() { } else { place_ = paddle::platform::CPUPlace(); } - paddle::framework::InitDevices(false); + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + executor_.reset(new paddle::framework::Executor(place_)); - scope_.reset(new paddle::framework::Scope()); // Initialize the inference program if (!config_.model_dir.empty()) { @@ -83,13 +90,8 @@ bool NativePaddlePredictor::Init() { return false; } ctx_ = executor_->Prepare(*inference_program_, 0); - - // Create temporary variables first, so that the first batch do not need to - // create variables in the runtime. This is the logics of the old inference - // API. - // TODO(Superjomn) this should be modified when `Clone` is valid for - // multi-thread application. - executor_->CreateVariables(*inference_program_, scope_.get(), 0); + executor_->CreateVariables( + *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); // Get the feed_target_names and fetch_target_names feed_target_names_ = inference_program_->GetFeedTargetNames(); @@ -97,6 +99,13 @@ bool NativePaddlePredictor::Init() { return true; } +NativePaddlePredictor::~NativePaddlePredictor() { + if (sub_scope_) { + PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!"); + scope_->DeleteScope(sub_scope_); + } +}; + bool NativePaddlePredictor::Run(const std::vector &inputs, std::vector *output_data) { VLOG(3) << "Predictor::predict"; @@ -121,11 +130,12 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, } // Run the inference program // if share variables, we need not create variables - executor_->RunPreparedContext(ctx_.get(), - scope_.get(), - &feed_targets, - &fetch_targets, - false /* don't create variable eatch time */); + executor_->RunPreparedContext( + ctx_.get(), + sub_scope_ != nullptr ? sub_scope_ : scope_.get(), + &feed_targets, + &fetch_targets, + false /* don't create variable eatch time */); if (!GetFetch(fetchs, output_data)) { LOG(ERROR) << "fail to get fetchs"; return false; @@ -138,7 +148,7 @@ std::unique_ptr NativePaddlePredictor::Clone() { VLOG(3) << "Predictor::clone"; std::unique_ptr cls(new NativePaddlePredictor(config_)); - if (!dynamic_cast(cls.get())->Init()) { + if (!dynamic_cast(cls.get())->Init(scope_)) { LOG(ERROR) << "fail to call Init"; return nullptr; } @@ -168,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), - inputs[i].data.data, - inputs[i].data.length); + inputs[i].data.data(), + inputs[i].data.length()); feeds->push_back(input); } return true; @@ -231,10 +241,11 @@ bool NativePaddlePredictor::GetFetch( } outputs->at(i).shape = shape; - outputs->at(i).data.length = sizeof(float) * data.size(); - outputs->at(i).data.data = malloc(outputs->at(i).data.length); - std::memcpy( - outputs->at(i).data.data, data.data(), outputs->at(i).data.length); + auto &buffer = outputs->at(i).data; + if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) { + buffer.Resize(sizeof(float) * data.size()); + } + std::memcpy(buffer.data(), data.data(), buffer.length()); outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. } @@ -266,7 +277,7 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new NativePaddlePredictor(config)); - if (!dynamic_cast(predictor.get())->Init()) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return std::move(predictor); diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h index 84707e223d7aa3d1ebca933923e932b3973613ae..86d1db7bcc7567e104cd20c9f767ed4513f611f5 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/contrib/inference/paddle_inference_api_impl.h @@ -34,14 +34,15 @@ class NativePaddlePredictor : public PaddlePredictor { explicit NativePaddlePredictor(const NativeConfig &config) : config_(config) {} - bool Init(); + // will only create sub scope if have global scope + bool Init(std::shared_ptr parent_scope); bool Run(const std::vector &inputs, std::vector *output_data) override; std::unique_ptr Clone() override; - ~NativePaddlePredictor() override{}; + ~NativePaddlePredictor() override; private: bool SetFeed(const std::vector &input_datas, @@ -52,11 +53,13 @@ class NativePaddlePredictor : public PaddlePredictor { NativeConfig config_; platform::Place place_; std::unique_ptr executor_; - std::unique_ptr scope_; + std::shared_ptr scope_; std::unique_ptr ctx_; std::unique_ptr inference_program_; std::vector feed_target_names_; std::vector fetch_target_names_; + // Do not use unique_ptr, use parent scope to delete + framework::Scope *sub_scope_{nullptr}; }; } // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 1f960677163988be6f4c502738861bf86588f406..88c4e665a3daed0ed34b23b75d360acbd586401f 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include + #include "gflags/gflags.h" #include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -25,13 +27,12 @@ namespace paddle { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; - pt.data.data = t->data(); if (t->type() == typeid(int64_t)) { - pt.data.length = t->numel() * sizeof(int64_t); + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; } else if (t->type() == typeid(float)) { - pt.data.length = t->numel() * sizeof(float); + pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { LOG(FATAL) << "unsupported type."; @@ -45,14 +46,19 @@ NativeConfig GetConfig() { config.model_dir = FLAGS_dirname + "word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; +#ifdef PADDLE_WITH_CUDA config.use_gpu = true; +#else + config.use_gpu = false; +#endif config.device = 0; return config; } -TEST(paddle_inference_api_impl, word2vec) { +void MainWord2Vec(bool use_gpu) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor(config); + config.use_gpu = use_gpu; framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -72,8 +78,8 @@ TEST(paddle_inference_api_impl, word2vec) { std::vector outputs; ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_EQ(outputs.size(), 1UL); - size_t len = outputs[0].data.length; - float* data = static_cast(outputs[0].data.data); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); for (size_t j = 0; j < len / sizeof(float); ++j) { ASSERT_LT(data[j], 1.0); ASSERT_GT(data[j], -1.0); @@ -96,15 +102,13 @@ TEST(paddle_inference_api_impl, word2vec) { EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_GT(lod_data[i] - data[i], -1e-3); } - - free(outputs[0].data.data); } -TEST(paddle_inference_api_impl, image_classification) { +void MainImageClassification(bool use_gpu) { int batch_size = 2; - bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -126,12 +130,8 @@ TEST(paddle_inference_api_impl, image_classification) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - TestInference(config.model_dir, - cpu_feeds, - cpu_fetchs1, - repeat, - is_combined, - use_mkldnn); + TestInference( + config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined); auto predictor = CreatePaddlePredictor(config); std::vector paddle_tensor_feeds; @@ -140,13 +140,149 @@ TEST(paddle_inference_api_impl, image_classification) { std::vector outputs; ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_EQ(outputs.size(), 1UL); - size_t len = outputs[0].data.length; - float* data = static_cast(outputs[0].data.data); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); float* lod_data = output1.data(); for (size_t j = 0; j < len / sizeof(float); ++j) { EXPECT_NEAR(lod_data[j], data[j], 1e-3); } - free(data); } +void MainThreadsWord2Vec(bool use_gpu) { + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + auto main_predictor = CreatePaddlePredictor(config); + + // prepare inputs data and reference results + constexpr int num_jobs = 3; + std::vector> jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // each job has 4 words + jobs[i].resize(4); + for (size_t j = 0; j < 4; ++j) { + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + SetupLoDTensor(&jobs[i][j], lod, static_cast(0), dict_size - 1); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j])); + } + + // get reference result of each job + std::vector ref_feeds; + std::vector ref_fetches(1, &refs[i]); + for (auto& word : jobs[i]) { + ref_feeds.push_back(&word); + } + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs range + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + // check outputs correctness + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +void MainThreadsImageClassification(bool use_gpu) { + constexpr int num_jobs = 4; // each job run 1 batch + constexpr int batch_size = 1; + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + config.model_dir = + FLAGS_dirname + "image_classification_resnet.inference.model"; + + auto main_predictor = CreatePaddlePredictor(config); + std::vector jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // prepare inputs + std::vector> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor(&jobs[i], input_dims, 0.f, 1.f); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); + + // get reference result of each job + std::vector ref_feeds(1, &jobs[i]); + std::vector ref_fetches(1, &refs[i]); + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs correctness + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu_threads) { + MainThreadsWord2Vec(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu) { + MainThreadsImageClassification(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu_threads) { + MainThreadsImageClassification(false /*use_gpu*/); +} + +#ifdef PADDLE_WITH_CUDA +TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +TEST(inference_api_native, word2vec_gpu_threads) { + MainThreadsWord2Vec(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu) { + MainThreadsImageClassification(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu_threads) { + MainThreadsImageClassification(true /*use_gpu*/); +} + +#endif + } // namespace paddle diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ed1e70c6460b513c1d2e1add18ac037f71d36944..6286dda4a54991b7a1042aed9886fdcb694198ba 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -83,11 +83,16 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope -framework_proto glog lod_rank_table feed_fetch_method) +if(WITH_DISTRIBUTE) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +else() + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) +endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -27,6 +27,7 @@ enum class DataLayout { kNHWC = 0, kNCHW = 1, kAnyLayout = 2, + kMKLDNN = 3, // all layouts supported by MKLDNN internally }; inline DataLayout StringToDataLayout(const std::string& str) { @@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { return DataLayout::kNCHW; } else if (s == "ANYLAYOUT") { return DataLayout::kAnyLayout; + } else if (s == "MKLDNNLAYOUT") { + return DataLayout::kMKLDNN; } else { PADDLE_THROW("Unknown storage order string: %s", s); } @@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) { return "NCHW"; case DataLayout::kAnyLayout: return "ANY_LAYOUT"; + case DataLayout::kMKLDNN: + return "MKLDNNLAYOUT"; default: - PADDLE_THROW("unknown DataLayou %d", data_layout); + PADDLE_THROW("unknown DataLayout %d", data_layout); } } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 60ec60a427ba9046ce690eb75c27cd322fdd726d..bc48fd3b479157d4aea390cd5f4dc61ea46dca4b 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -16,6 +16,9 @@ #include #include "paddle/fluid/operators/math/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { @@ -88,5 +91,84 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->set_layout(expected_kernel_type.data_layout_); } +#ifdef PADDLE_WITH_MKLDNN +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; + +void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { + switch (type) { + case mkldnn::memory::data_type::f32: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::u8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s16: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s32: + return platform::to_void_cast(tensor.data()); + default: + PADDLE_THROW("wrong mkldnn type provided"); + } +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out) { + auto in_layout = kernel_type_for_var.data_layout_; + auto out_layout = expected_kernel_type.data_layout_; + + PADDLE_ENFORCE( + in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, + "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " + "non-MKLDNN"); + +#ifdef PADDLE_WITH_MKLDNN + PADDLE_ENFORCE(in.format() != memory::format::format_undef && + in.format() != memory::format::any, + "Input tensor should have specified memory format"); + + // Set default as NCHW in case not specified + out_layout = + out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); + std::vector out_tz = in_tz; + + memory::data_type in_type = ToMKLDNNDataType(in.type()); + PADDLE_ENFORCE(in_type != memory::data_type::data_undef, + "Input tensor type is not supported: ", in.type().name()); + memory::data_type out_type = in_type; + + auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + + void* in_data = GetDataFromTensor(in, in_type); + + // output tensor has the same dims as input. Reorder don't change dims + out->Resize(in.dims()); + + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); + + auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + + platform::Reorder(in_memory, out_memory); + + out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 06b638663dd334837a3bcb7737e507fcbc871c7a..67f91e4e48d3e11ed493c5e6943cb9071aff60c4 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" @@ -22,6 +23,57 @@ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_MKLDNN +using MKLDNNFormat = mkldnn::memory::format; +using MKLDNNDataType = mkldnn::memory::data_type; + +inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { + switch (layout) { + case DataLayout::kNHWC: + return MKLDNNFormat::nhwc; + case DataLayout::kNCHW: + return MKLDNNFormat::nchw; + default: + PADDLE_THROW("Fail to convert layout %s to MKLDNN format", + DataLayoutToString(layout)); + } +} + +inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { + switch (format) { + case MKLDNNFormat::nhwc: + return DataLayout::kNHWC; + case MKLDNNFormat::nchw: + return DataLayout::kNCHW; + default: + PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); + } +} + +inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { + static const std::map dict{ + {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT + {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT + {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, + {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, + {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; + auto iter = dict.find(type); + if (iter != dict.end()) return iter->second; + return MKLDNNDataType::data_undef; +} + +inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size, + MKLDNNFormat default_format) { + return (dims_size == 1 + ? mkldnn::memory::format::x + : dims_size == 2 ? mkldnn::memory::format::nc : default_format); +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out); + std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 9c277a27da5af34fc9fb18ca073e369c05ecdf22..5f15e20c78fd5a333523fe9e73542c037a161cae 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -33,11 +33,38 @@ void DataTransform(const OpKernelType& expected_kernel_type, Tensor in; in.ShareDataWith(input_tensor); Tensor out; + DataLayout lin = kernel_type_for_var.data_layout_; + DataLayout lout = expected_kernel_type.data_layout_; // do layout transform - if (NeedTransformLayout(expected_kernel_type.data_layout_, - kernel_type_for_var.data_layout_)) { - TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + if (NeedTransformLayout(lout, lin)) { + if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { + PADDLE_ENFORCE( + !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), + "No layout transform needed between two MKLDNN OPKernels"); + + if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { +#ifdef PADDLE_WITH_MKLDNN + // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel + // Just set layout/format. No real transform occur + + auto out_format = + MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin)); + + out.ShareDataWith(input_tensor); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(out_format); +#endif + } else { + // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel + // Do transform via MKLDNN lib + TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in, + &out); + } + } else { + // Case3 - transfrom between Non-MKLDNN OPKernels + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + } transformed = true; PassTensorData(&out, &in); } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index b6b93cf422a60c1d8e9cb8b477efd562f9fe4758..60382faffb8e53870658b2d1ff83abc4008cb4cf 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -28,6 +28,9 @@ struct DataTypeMap { }; static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex static DataTypeMap& gDataTypeMap() { static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); return *g_data_type_map_; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 1bcd8412eb2d618b923bcd0557d118af62271f4a..3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -7,26 +7,32 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) +cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) +cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) if(WITH_GPU) - nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda) - set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) + nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda variable_visitor) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) else() - set(multi_devices_graph_builder_deps) + cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) +cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) + + +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope @@ -36,5 +42,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) +cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc similarity index 61% rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc rename to paddle/fluid/framework/details/all_reduce_op_handle.cc index 95aa599cd3e403e9cc66b2b5ad35d0d214d1ab5b..b335d3a0d364c916e19574de8d3ed89aaec7de41 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -11,46 +11,65 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" namespace paddle { namespace framework { namespace details { -NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( - const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs) + +#ifdef PADDLE_WITH_CUDA +AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs) : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { - for (auto &p : places_) { - this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p); + if (nccl_ctxs_) { + for (auto &p : places_) { + this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); + } } } +#else +AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif -void NCCLAllReduceOpHandle::RunImpl() { - if (inputs_.size() == 1) { +void AllReduceOpHandle::RunImpl() { + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { // Wait input done WaitInputVarGenerated(); - - auto &var_name = static_cast(this->inputs_[0])->name_; - int dtype = -1; - size_t numel = 0; + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *s = local_scopes_[i]; auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - - auto &lod_tensor = local_scope.FindVar(var_name)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); } if (platform::is_gpu_place(lod_tensors[0]->place())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = places_[i]; @@ -66,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; - auto &nccl_ctx = nccl_ctxs_.at(dev_id); + auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; all_reduce_calls.emplace_back([=] { @@ -81,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() { call(); } }); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif } else { // Special handle CPU only Operator's gradient. Like CRF auto &trg = *this->local_scopes_[0] ->FindVar(kLocalExecScopeName) ->Get() - ->Var() + ->FindVar(out_var_handles[0]->name_) ->GetMutable(); // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); VisitDataType(ToDataType(lod_tensors[0]->type()), func); - for (size_t i = 0; i < local_scopes_.size(); ++i) { + for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; - auto *var = scope.FindVar(var_name); + auto *var = scope.FindVar(out_var_handles[i]->name_); auto *dev_ctx = dev_ctxes_[p]; RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { @@ -109,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } } -std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; } +std::string AllReduceOpHandle::Name() const { return "all_reduce"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h similarity index 68% rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.h rename to paddle/fluid/framework/details/all_reduce_op_handle.h index a0c321843e3fc5abcbd1ef2ce2e153250269aa7d..fdd250b0d3eb166249271a95f7592b9fadee5265 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,17 +20,23 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { namespace details { -struct NCCLAllReduceOpHandle : public OpHandleBase { - NCCLAllReduceOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs); - +struct AllReduceOpHandle : public OpHandleBase { +#ifdef PADDLE_WITH_CUDA + AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs); +#else + AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places); +#endif std::string Name() const override; // Delay and buffer nccl_all_reduce together can significantly increase @@ -41,9 +47,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { void RunImpl() override; private: - const std::vector &local_scopes_; - const std::vector &places_; - const platform::NCCLContextMap &nccl_ctxs_; + std::vector local_scopes_; + std::vector places_; +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; +#endif }; } // namespace details diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index d5ca061944f33939cea59a5275e691b1966194fa..1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() { int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; + int type = platform::ToNCCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + for (auto out_var_handle : out_var_handles) { Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) ->FindVar(out_var_handle->name_); @@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() { send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { - send_recv_buffer = - VariableVisitor::GetMutableTensor(out_var).mutable_data( - out_var_handle->place_); + send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) + .Resize(in_tensor.dims()) + .mutable_data(out_var_handle->place_); } - int type = platform::ToNCCLDataType(in_tensor.type()); - size_t numel = static_cast(in_tensor.numel()); broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE(platform::dynload::ncclBcast( diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 629aa00cb817c4b1446e7b750ca62a7c6b1db670..8036f756b6d6506684c109ab881d546f38176a10 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase { void RunImpl() override; private: - const std::vector &local_scopes_; - const std::vector &places_; + std::vector local_scopes_; + std::vector places_; #ifdef PADDLE_WITH_CUDA const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 91bdfe6134ffbd1404336c9d6d1222a505084b2b..64e83acb4dc1995800c4ca3caf81668b24a7c9fe 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -14,6 +14,8 @@ #pragma once +#include + namespace paddle { namespace framework { namespace details { @@ -29,6 +31,8 @@ struct BuildStrategy { ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + + std::string debug_graphviz_path_{""}; }; } // namespace details diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index e8d510ec955602b5a3f73ca06caa121886eb150b..716d674fa29bad9321fc20979775c06f26bf4679 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,8 +20,9 @@ namespace details { struct ExecutionStrategy { size_t num_threads_{0}; - bool use_event_{true}; + bool use_cuda_{true}; bool allow_op_delay_{false}; + size_t num_iteration_per_drop_scope_{100}; }; } // namespace details diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..018c9bff71e553d8a3641f06f10b350453676b24 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseVarsOpHandle::RunImpl() { + WaitInputVarGenerated(place_); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); + PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); + + auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); + + auto out_var_handle = out_var_handles[0]; + auto out_var = scope->Var(out_var_handle->name_); + + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); + + int64_t s = 0; + for (size_t i = 1; i < out_var_handles.size(); ++i) { + auto out_name = out_var_handles[i]->name_; + auto out_t = scope->Var(out_name)->GetMutable(); + auto numel = this->inputs_numel_.at(out_name); + out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); + s += numel; + } + this->RunAndRecordEvent([] {}); +} + +std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FuseVarsOpHandle : public OpHandleBase { + public: + FuseVarsOpHandle(Scope *local_scope, const platform::Place &place, + const std::unordered_map &inputs_numel, + const std::type_index &var_type) + : local_scope_(local_scope), + place_(place), + inputs_numel_(inputs_numel), + type_(var_type) { + total_numel_ = 0; + for (auto in_numel : inputs_numel) { + PADDLE_ENFORCE_GT(in_numel.second, 0); + total_numel_ += in_numel.second; + } + } + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + private: + Scope *local_scope_; + const platform::Place place_; + const std::unordered_map inputs_numel_; + const std::type_index type_; + int64_t total_numel_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 17baacd13eecac8f410631fe9e94788da4fff848..cc7b94d0653e34c8ac711a7db7ab6ab1a9ac46a2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" @@ -26,14 +27,6 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#endif - -DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot", - "the ssa graph path only print with GLOG_v=10," - "default /tmp/graph.dot"); - namespace paddle { namespace framework { namespace details { @@ -64,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( for (auto &p : params) { grad_names_.insert(GradVarName(p)); } + balance_vars_.resize(places_.size(), 0); } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, @@ -93,7 +87,7 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( for (auto *op : program.Block(0).AllOps()) { // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string - if (op->Type() == "send_vars") { + if (op->Type() == "send") { auto op_vars = op->InputArgumentNames(); send_vars.reserve(send_vars.size() + std::distance(op_vars.begin(), op_vars.end())); @@ -147,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( checker(op.InputArgumentNames(), recv_vars); } +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { - std::unordered_map all_vars; for (auto *var : program.Block(0).AllVars()) { - all_vars[var->Name()] = var; + all_vars_.emplace(var->Name(), var); } auto graph = new SSAGraph(); @@ -168,35 +181,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( auto send_vars = FindDistTrainSendVars(program); auto recv_vars = FindDistTrainRecvVars(program); - std::vector> var_name_on_devices; std::vector> bcast_var_name_set; - var_name_on_devices.resize(places_.size()); bcast_var_name_set.resize(places_.size()); size_t cur_device_id = 0; - std::vector balance_grads(places_.size(), 0); - - auto get_appropriate_dev = [&](std::string &g_name) -> size_t { - auto var_desc = all_vars.at(g_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GE(numel, 0); - auto smallest = - std::min_element(std::begin(balance_grads), std::end(balance_grads)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_grads), smallest)); - balance_grads[dev_id] += numel; - return dev_id; - }; - bool is_forwarding = true; + for (auto *op : program.Block(0).AllOps()) { if (boost::get( op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - // append rpc op if program is distributed trainer main program. - // always use the first device CreateRPCOp(&result, *op); } else if (IsDistTrainOp(*op, send_vars, recv_vars)) { CreateDistTrainOp(&result, *op); @@ -206,53 +200,63 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( BuildStrategy::GradientScaleStrategy::kCustomized) { CreateScaleLossGradOp(&result); } + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); - if (op_dev_id == -1) { // var on all device - CreateComputationalOps(&result, *op, places_.size()); - } else { + int op_dev_id = GetOpDeviceID(*op); + if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, *op, op_dev_id); for (auto &var_name : op->OutputArgumentNames()) { - var_name_on_devices[op_dev_id].emplace(var_name); + var_name_on_devices_.emplace(var_name, op_dev_id); } - } - if (!is_forwarding && places_.size() > 1) { - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(op->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = - boost::get>(op->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = get_appropriate_dev(g_name); - CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices[cur_device_id].emplace(g_name); - bcast_var_name_set[cur_device_id].emplace(p_name); - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(all_vars, g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertNCCLAllReduceOp(&result, g_name); - } - break; + } else { + // This op runs on all devices, and its output may have parameter's + // gradients. + CreateComputationalOps(&result, *op, places_.size()); + + if (!is_forwarding && places_.size() > 1) { + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + if (static_cast(boost::get(op->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward))) { + try { + auto backward_vars = + boost::get>(op->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + var_name_on_devices_.emplace(g_name, cur_device_id); + bcast_var_name_set[cur_device_id].emplace(p_name); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; + } } + } catch (boost::bad_get e) { } - } catch (boost::bad_get e) { } } } @@ -268,7 +272,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } /* Dependency graph has been constructed. However, there are still data - harzaeds need to be handled. + hazards need to be handled. */ PolishGraphToSupportDataHazards(&result); @@ -277,24 +281,30 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( */ AddOutputToLeafOps(&result); - if (VLOG_IS_ON(10)) { - std::ofstream fout(FLAGS_ssa_graph_path); - PrintGraphviz(*graph, fout); - } - return std::unique_ptr(graph); } -bool MultiDevSSAGraphBuilder::IsSparseGradient( - const std::unordered_map &all_vars, - const std::string &og) const { - PADDLE_ENFORCE(all_vars.count(og) != 0); - if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { return true; } return false; } +void MultiDevSSAGraphBuilder::SetCommunicationContext( + OpHandleBase *op_handle, const platform::Place &p) const { +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_ == nullptr) { + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + } +#else + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif +} + void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const { @@ -309,15 +319,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { - auto &vars = result->vars_.at(i).at(p_name); auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->vars_.at(i).at(p_name); auto *out_var = new VarHandle(vars.size(), i, p_name, p); vars.emplace_back(out_var); op_handle->AddOutput(out_var); -#ifndef ADDLE_WITH_CUDA - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif } } @@ -329,27 +336,28 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result, CreateOpHandleIOs(result, op, dev_id); } -void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( - SSAGraph *result, const std::string &og) const { +void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, + const std::string &og) const { #ifdef PADDLE_WITH_CUDA result->ops_.emplace_back( - new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); + new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); +#else + result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_)); +#endif auto *op_handle = result->ops_.back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; + SetCommunicationContext(op_handle, p); auto &vars = result->vars_[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); - auto var = new VarHandle(vars.size() - 1, i, og, p); + auto var = new VarHandle(vars.size(), i, og, p); vars.emplace_back(var); op_handle->AddOutput(var); } -#else - PADDLE_ENFORCE("Not implemented"); -#endif } bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( @@ -364,31 +372,32 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID( - const std::vector> &var_name_on_devices, - const OpDesc &op) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int var_dev_id = -1; - for (auto &var_name : op.InputArgumentNames()) { - if (var_dev_id != -1) break; - for (size_t i = 0; i < var_name_on_devices.size(); ++i) { - if (var_name_on_devices[i].count(var_name)) { - var_dev_id = static_cast(i); - break; - } + for (auto &varname : op.InputArgumentNames()) { + int dev_id = GetVarDeviceID(varname); + if (dev_id != -1) { + return dev_id; } } - return var_dev_id; + return -1; +} + +int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { + auto got = var_name_on_devices_.find(varname); + return got == var_name_on_devices_.end() ? -1 : got->second; } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA - auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]); + auto *communication_dev_ctx = + nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i]) + : platform::DeviceContextPool::Instance().Get(places_[i]); #else auto *communication_dev_ctx = platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); @@ -433,24 +442,22 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, auto *op_handle = result->ops_.back().get(); for (size_t i = 0; i < places_.size(); ++i) { - auto &vars = result->vars_[i][og]; -#ifndef PADDLE_WITH_CUDA auto &p = places_[i]; - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif + SetCommunicationContext(op_handle, p); + auto &vars = result->vars_[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); } auto &vars = result->vars_[dst_dev_id][og]; - auto var = - new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]); + auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]); vars.emplace_back(var); op_handle->AddOutput(var); return var; } +// Find the first occurence of `prev_op_name` and make current `op` depend +// on it. void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->ops_) { @@ -465,35 +472,85 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const { - CreateComputationalOp(result, op, 0); + int op_dev_id = -1; + if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); + for (auto &varname : op.InputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else if (op.Type() == "concat") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else { + PADDLE_ENFORCE( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", op.Type()); + + CreateComputationalOp(result, op, op_dev_id); if (op.Type() == "concat") { ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); } } +// Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op) const { - auto &p = places_[0]; - auto *s = local_scopes_[0]; - result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type())); + int op_dev_id = -1; + if (op.Type() == "send") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + // the variable name which contains .block means it was splited by + // split_byref op + // so that we can balance the variable blocks to all the pserver instances. + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && + op.InputArgumentNames()[0].find(".block") == std::string::npos) { + op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); + for (auto &varname : op.InputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } + } else if (op.Type() == "recv") { + op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames()); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else { + // send_barrier and fetch_barrier op can be scheduled on device 0 + op_dev_id = 0; + } + + PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", + op.Type()); + + result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id], + op.Type(), places_[op_dev_id])); if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send_vars"); + ConnectOp(result, result->ops_.back().get(), "send"); } else if (op.Type() == "recv") { ConnectOp(result, result->ops_.back().get(), "send_barrier"); } else if (op.Type() == "fetch_barrier") { ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send_vars") { + } else if (op.Type() == "send") { // do nothing } else { PADDLE_THROW( "rpc op should be in [" - "send_vars, send_barrier. recv, fetch_barrier]"); + "send, send_barrier. recv, fetch_barrier]"); } - // TODO(Yancey1989): schedule rpc op on different place may - // increate throughput - CreateOpHandleIOs(result, op, 0); + CreateOpHandleIOs(result, op, op_dev_id); } bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 544cbe585c7423b5f3eb98ee698ca5668376f1ca..0b6347bf51dc1c347073a0fdcf4ddd91865d846d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #endif std::unique_ptr Build(const ProgramDesc &program) const override; + int GetVarDeviceID(const std::string &varname) const override; private: void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - size_t place_id) const; + size_t device_id) const; private: std::string loss_var_name_; @@ -96,21 +97,26 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID( - const std::vector> &var_name_on_devices, - const OpDesc &op) const; + int GetOpDeviceID(const OpDesc &op) const; - void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; - bool IsSparseGradient( - const std::unordered_map &all_vars, - const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; private: BuildStrategy strategy_; + mutable std::unordered_map all_vars_; + mutable std::unordered_map var_name_on_devices_; + mutable std::vector balance_vars_; + + void SetCommunicationContext(OpHandleBase *op_handle, + const platform::Place &p) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 6b064650b4f09737836bda4a43fa421720077929..1f84c3b9e2d7ee9ae51959988fceeb3451b7b3b8 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/op_handle_base.h" +#include namespace paddle { namespace framework { @@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() { #endif } -void OpHandleBase::Run(bool use_event) { +void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_event) { + if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); @@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) { } } #else - PADDLE_ENFORCE(!use_event); + PADDLE_ENFORCE(!use_cuda); #endif RunImpl(); @@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { } } +size_t OpHandleBase::NoDummyInputSize() const { + size_t cnt = 0; + for (auto *in : inputs_) { + if (dynamic_cast(in) == nullptr) { + ++cnt; + } + } + return cnt; +} + bool OpHandleBase::NeedWait(VarHandleBase *in_var) { return in_var && in_var->generated_op_; } @@ -112,11 +122,16 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event std::function method = callback; - + // NOTE(zcd): device context must be ordered here because RecordEvent + // will use a mutex to ensure the safe of multi-threads. + std::map ordered_ctxes; for (auto &p : dev_ctxes_) { + ordered_ctxes.emplace(p.second, p.first); + } + for (auto &p : ordered_ctxes) { method = [method, p, this]() { - static_cast(p.second)->RecordEvent( - events_.at(boost::get(p.first).device), + static_cast(p.first)->RecordEvent( + events_.at(boost::get(p.second).device), method); }; } diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 8f94206a87dbae8a81727ca48718886bbabbe25c..fbd90a3296bca92b097cab925b218b91e7f4752f 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -36,7 +36,7 @@ class OpHandleBase { virtual std::string Name() const = 0; - void Run(bool use_event); + void Run(bool use_cuda); virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx); @@ -80,6 +80,8 @@ class OpHandleBase { const std::vector &Outputs() const { return outputs_; } + size_t NoDummyInputSize() const; + protected: void RunAndRecordEvent(const std::function &callback); diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 2b95a284990da8f9b7c16d6e4221eb1ed061f74b..a6ffb37313a88120bc9e8d5ce326f60aeebdff69 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -37,7 +37,9 @@ struct ReduceLoDTensor { PADDLE_ENFORCE_NE(t0.numel(), 0); dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - std::copy(t0.data(), t0.data() + t0.numel(), dst); + if (dst != t0.data()) { + std::copy(t0.data(), t0.data() + t0.numel(), dst); + } for (size_t i = 1; i < src_tensors_.size(); ++i) { auto &t = *src_tensors_[i]; diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index c652a2f4eb0f9b73cb19ebbd9d0809210b280ad3..4d14334cdfe06e2e805c2577458d6689e6324cc7 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -32,8 +32,8 @@ namespace framework { namespace details { struct ReduceOpHandle : public OpHandleBase { - const std::vector &local_scopes_; - const std::vector &places_; + std::vector local_scopes_; + std::vector places_; #ifdef PADDLE_WITH_CUDA const platform::NCCLContextMap *nccl_ctxs_; diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 7f4da4c01de1010467d839ee5490c5e0d02d8c24..586465f99fd94117c821be2952bffda385fbcf75 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -19,12 +19,12 @@ namespace framework { namespace details { RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, - const Scope *local_scope, const platform::Place &place, - const std::string &name) + const Scope *local_scope, const std::string &name, + const platform::Place &place) : op_(framework::OpRegistry::CreateOp(op_desc)), local_scope_(local_scope), - place_(place), - name_(name) {} + name_(name), + place_(place) {} void RPCOpHandle::RunImpl() { // TODO(wuyi): need further analysis whether wait VarDummyHandle. diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index d28b7721720d808a8d81701c3811eae16121fb41..ae38c7fe19e102a330455d89a1068414a7835fab 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -29,7 +29,7 @@ namespace details { struct RPCOpHandle : public OpHandleBase { RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const platform::Place& place, const std::string& name); + const std::string& name, const platform::Place& place); std::string Name() const override; @@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase { private: std::unique_ptr op_; const Scope* local_scope_; - const platform::Place& place_; const std::string name_; + platform::Place place_; }; } // namespace details diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb4e7ec52f907f9403e21ec2734d61824f51a58b --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include +#include +#include "paddle/fluid/framework/executor.h" + +namespace paddle { +namespace framework { +namespace details { +ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( + ExecutionStrategy strategy, std::vector local_scopes, + std::vector var_infos, std::vector places, + std::unique_ptr &&underlying_executor) + : strategy_(std::move(strategy)), + underlying_executor_(std::move(underlying_executor)), + local_scopes_(std::move(local_scopes)), + var_infos_(std::move(var_infos)), + places_(std::move(places)) {} + +FeedFetchList ScopeBufferedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + if (drop_scope_counter_ == 0) { + // Create local scopes. + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + auto &scope = *it; + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &info : var_infos_) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } + } + } + } + + auto fetch_data = underlying_executor_->Run(fetch_tensors); + drop_scope_counter_ += 1; + if (!fetch_tensors.empty() || + drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : local_scopes_) { + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + } + } + return fetch_data; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..20df7a4722d589ffd168f842e927cff8411096bb --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" +namespace paddle { +namespace framework { +namespace details { + +struct VariableInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + +class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { + public: + ScopeBufferedSSAGraphExecutor( + ExecutionStrategy strategy, std::vector local_scopes, + std::vector var_infos, std::vector places, + std::unique_ptr&& underlying_executor); + FeedFetchList Run(const std::vector& fetch_tensors) override; + + private: + size_t drop_scope_counter_{0}; + + ExecutionStrategy strategy_; + std::unique_ptr underlying_executor_; + std::vector local_scopes_; + std::vector var_infos_; + std::vector places_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 6a567527550883add08031e50aa8de2b204cf13d..88a21f48879a15450051ad94ed76e1c48bf23014 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include namespace paddle { namespace framework { @@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, op_handle->AddOutput(var); } -template -void IterAllVar(const SSAGraph &graph, Callback callback) { - for (auto &each : graph.vars_) { - for (auto &pair1 : each) { - for (auto &pair2 : pair1.second) { - callback(*pair2); - } - } - } - - for (auto &var : graph.dep_vars_) { - callback(*var); - } -} - -void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) { - size_t var_id = 0; - std::unordered_map vars; - - sout << "digraph G {\n"; - - IterAllVar(graph, [&](const VarHandleBase &var) { - auto *var_ptr = &var; - auto *var_handle_ptr = dynamic_cast(var_ptr); - auto *dummy_ptr = dynamic_cast(var_ptr); - - size_t cur_var_id = var_id++; - vars[var_ptr] = cur_var_id; - - if (var_handle_ptr) { - sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ - << "\\n" - << var_handle_ptr->place_ << "\\n" - << var_handle_ptr->version_ << "\"]" << std::endl; - } else if (dummy_ptr) { - sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; - } - }); - - size_t op_id = 0; - for (auto &op : graph.ops_) { - std::string op_name = "op_" + std::to_string(op_id++); - sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" - << std::endl; - for (auto in : op->Inputs()) { - std::string var_name = "var_" + std::to_string(vars[in]); - sout << var_name << " -> " << op_name << std::endl; - } - - for (auto out : op->Outputs()) { - std::string var_name = "var_" + std::to_string(vars[out]); - sout << op_name << " -> " << var_name << std::endl; - } - } - - sout << "}\n"; -} - void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { for (auto &op : graph->ops_) { if (!op->Outputs().empty()) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 64e5d93081eb76c56898bbeb530e37364619fdbb..18612c3c1b62cf4c2ebdc221c301c59ec81c2da7 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -30,6 +30,7 @@ class SSAGraphBuilder { SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; + virtual int GetVarDeviceID(const std::string &var_name) const = 0; DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); @@ -55,8 +56,6 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); static void AddOutputToLeafOps(SSAGraph *graph); - - static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout); }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4b49d3de6da2e5fd7836668619e42d10bb6b35a --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" +#include +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" +#include "paddle/fluid/framework/details/ssa_graph_printer.h" + +namespace paddle { +namespace framework { +namespace details { +std::unique_ptr SSAGraphBuilderFactory::Create() { + std::unique_ptr res( +#ifdef PADDLE_WITH_CUDA + new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, + local_scopes_, nccl_ctxs_, strategy_) +#else + new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, + local_scopes_, strategy_) +#endif + ); // NOLINT + + if (!strategy_.debug_graphviz_path_.empty()) { + std::unique_ptr fout( + new std::ofstream(strategy_.debug_graphviz_path_)); + PADDLE_ENFORCE(fout->good()); + std::unique_ptr graphviz_printer( + new GraphvizSSAGraphPrinter()); + res.reset(new SSAGraghBuilderWithPrinter( + std::move(fout), std::move(graphviz_printer), std::move(res))); + } + res.reset(new SSAGraghBuilderWithChecker(std::move(res))); + + return res; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..91a119de83ed3d1573803e48faf86c874eed98d6 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +class Scope; +namespace details { + +class SSAGraphBuilderFactory { + public: + SSAGraphBuilderFactory(const std::vector& places, + const std::string& loss_var_name, + const std::unordered_set& param_names, + const std::vector& local_scopes, + const BuildStrategy& strategy) + : places_(places), + loss_var_name_(loss_var_name), + param_names_(param_names), + local_scopes_(local_scopes), + strategy_(strategy) { +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_ = nullptr; +#endif + } + +#ifdef PADDLE_WITH_CUDA + void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { + nccl_ctxs_ = nccl_ctxs; + } +#endif + + std::unique_ptr Create(); + + private: + std::vector places_; + std::string loss_var_name_; + std::unordered_set param_names_; + std::vector local_scopes_; + BuildStrategy strategy_; + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap* nccl_ctxs_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc new file mode 100644 index 0000000000000000000000000000000000000000..da5428946ee588e8eac1f78929dc0432df532975 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph.h" +#include +#include "paddle/fluid/framework/details/ssa_graph_checker.h" + +namespace paddle { +namespace framework { +namespace details { + +bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h new file mode 100644 index 0000000000000000000000000000000000000000..331aa9d2b5864c470dbd5e29ef6faccffdcf781c --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +#include + +namespace paddle { +namespace framework { +namespace details { +struct SSAGraph; + +class SSAGraghBuilderWithChecker : public SSAGraphBuilder { + public: + explicit SSAGraghBuilderWithChecker( + std::unique_ptr&& builder) + : builder_(std::move(builder)) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } + + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + + bool IsValidGraph(const SSAGraph* graph) const; + + private: + std::unique_ptr builder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc index 8da6ca889b89999e0f6f974503cea476c9de97f3..09b97bd0d98dc4ad1124dcbc495cff921bf03efc 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -17,10 +17,6 @@ namespace paddle { namespace framework { namespace details { - -SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr &&graph) - : graph_(std::move(graph)) {} - SSAGraphExecutor::~SSAGraphExecutor() {} } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index a8833b7388ab907020a260d356f1484ffd227658..958086033607a4ed8fb840f5b14fe5779625bd82 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -28,15 +28,11 @@ class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); public: - // Steal graph inside - explicit SSAGraphExecutor(std::unique_ptr &&graph); + SSAGraphExecutor() {} virtual ~SSAGraphExecutor(); virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; - - protected: - std::unique_ptr graph_; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc new file mode 100644 index 0000000000000000000000000000000000000000..22a40ca4b25cdd8ed9856b6c71bffc79561edcac --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_printer.h" +#include +#include "paddle/fluid/framework/details/ssa_graph.h" + +namespace paddle { +namespace framework { +namespace details { + +template +static inline void IterAllVar(const SSAGraph &graph, Callback callback) { + for (auto &each : graph.vars_) { + for (auto &pair1 : each) { + for (auto &pair2 : pair1.second) { + callback(*pair2); + } + } + } + + for (auto &var : graph.dep_vars_) { + callback(*var); + } +} + +void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, + std::ostream &sout) const { + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + IterAllVar(graph, [&](const VarHandleBase &var) { + auto *var_ptr = &var; + auto *var_handle_ptr = dynamic_cast(var_ptr); + auto *dummy_ptr = dynamic_cast(var_ptr); + + size_t cur_var_id = var_id++; + vars[var_ptr] = cur_var_id; + + if (var_handle_ptr) { + sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ + << "\\n" + << var_handle_ptr->place_ << "\\n" + << var_handle_ptr->version_ << "\"]" << std::endl; + } else if (dummy_ptr) { + sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; + } + }); + + size_t op_id = 0; + for (auto &op : graph.ops_) { + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" + << std::endl; + for (auto in : op->Inputs()) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : op->Outputs()) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h new file mode 100644 index 0000000000000000000000000000000000000000..09b0333ef2cb43a306133aa5af98d37c11454d4d --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +struct SSAGraph; +class SSAGraphPrinter { + public: + virtual ~SSAGraphPrinter() {} + virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0; +}; + +class GraphvizSSAGraphPrinter : public SSAGraphPrinter { + public: + void Print(const SSAGraph& graph, std::ostream& sout) const override; +}; + +class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { + public: + SSAGraghBuilderWithPrinter(std::ostream& sout, + std::unique_ptr&& printer, + std::unique_ptr&& builder) + : printer_(std::move(printer)), + builder_(std::move(builder)), + stream_ref_(sout) {} + + SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, + std::unique_ptr&& printer, + std::unique_ptr&& builder) + : printer_(std::move(printer)), + builder_(std::move(builder)), + stream_ptr_(std::move(sout)), + stream_ref_(*stream_ptr_) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + printer_->Print(*graph, stream_ref_); + return graph; + } + + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + + private: + std::unique_ptr printer_; + std::unique_ptr builder_; + std::unique_ptr stream_ptr_; + std::ostream& stream_ref_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 815f739371e77d953a28be99b38ec1b8ff26506c..b1706eb12d080364d04108c7ef4da31e1e7c1deb 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -21,7 +21,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::unique_ptr &&graph) - : SSAGraphExecutor(std::move(graph)), + : graph_(std::move(graph)), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), local_scopes_(local_scopes), @@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { + std::lock_guard l(exception_mu_); if (exception_) { auto exp = *exception_; exception_.reset(); @@ -185,17 +186,21 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ready_vars->Push(var); } } + void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - op->Run(strategy_.use_event_); + if (VLOG_IS_ON(10)) { + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); + } + op->Run(strategy_.use_cuda_); VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EnforceNotMet ex) { + std::lock_guard l(exception_mu_); exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { LOG(FATAL) << "Unknown exception catched"; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 1f7f88d75218e757e4555ad093f3cd6558f624dd..90430be996758364387b552019762d9c2e9dfe45 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -51,10 +51,12 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: + std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; + std::mutex exception_mu_; std::unique_ptr exception_; std::atomic running_ops_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 3d68c5fb870d5b575f97eeb286528544402b8ed9..ae98fccc9600a2a75f12fa516c982bec0ef13f9f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,10 +20,14 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/grpc_client.h" +#endif #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); namespace paddle { namespace framework { @@ -43,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { Executor::Executor(const platform::Place& place) : place_(place) {} +#ifdef PADDLE_WITH_DISTRIBUTE +void Executor::Complete() { + ::paddle::operators::distributed::RPCClient::GetInstance< + ::paddle::operators::distributed::GRPCClient>() + ->SendComplete(); +} +#endif + void InitializeVariable(Variable* var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); @@ -115,6 +127,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); + if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -214,6 +227,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& feed_holder_name, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); + if (FLAGS_use_mkldnn) EnableMKLDNN(program); bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -225,7 +239,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, unique_ptr_of_copy_program.reset(new ProgramDesc(program)); copy_program = unique_ptr_of_copy_program.get(); } - auto* global_block = copy_program->MutableBlock(0); if (!has_feed_ops) { @@ -282,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id) { - auto* ctx = new ExecutorPrepareContext(program, block_id); + std::unique_ptr ctx( + new ExecutorPrepareContext(program, block_id)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - return std::unique_ptr(ctx); + return ctx; } std::vector> Executor::Prepare( @@ -307,7 +321,8 @@ std::vector> Executor::Prepare( } void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + bool keep_kids) { Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -317,8 +332,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } for (auto& op : ctx->ops_) { - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); + // NOTE! Please do not delete this line, it's usefull because the debug + // string before and after op.run are different, after run the output + // will have right shape which is usefull for debug. + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " @@ -326,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } platform::DeviceContextPool::Instance().Get(place_)->Wait(); - if (create_vars && create_local_scope) { + if (local_scope != scope) { scope->DeleteScope(local_scope); } else { - // Delete the local scopes created in operators. - scope->DropKids(); + if (!keep_kids) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + scope->DropKids(); + } } + if (FLAGS_benchmark) { VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "Memory used after deleting local scope: " @@ -378,5 +405,22 @@ void Executor::RunPreparedContext( } } +void Executor::EnableMKLDNN(const ProgramDesc& program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + auto* block = const_cast(program).MutableBlock(bid); + for (auto* op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#else + LOG(WARNING) + << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 0c3c23611d95e0da67cabfb8fb2755a4a52c991b..3aa5ffef69cd29681f248e915575c5715ad0d3fa 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -44,6 +44,13 @@ class Executor { explicit Executor(const platform::Place& place); +#ifdef PADDLE_WITH_DISTRIBUTE + /* + * Sending signal to pserver to mark current trainer stop. + */ + void Complete(); +#endif + /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope * @@ -71,7 +78,7 @@ class Executor { void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope = true, - bool create_vars = true); + bool create_vars = true, bool keep_kids = false); void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map* feed_targets, @@ -81,6 +88,8 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + void EnableMKLDNN(const ProgramDesc& program); + private: const platform::Place place_; }; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index d35125fe8c3c8018c38650dc87b2b1474ded6058..2cf14bd371831ab682166f4256d6966b5ab278c8 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -27,6 +27,7 @@ enum AttrType { BOOLEANS = 7; BLOCK = 8; LONG = 9; + BLOCKS = 10; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -46,6 +47,7 @@ message OpDesc { repeated bool bools = 11; optional int32 block_idx = 12; optional int64 l = 13; + repeated int32 blocks_idx = 14; }; message Var { @@ -71,6 +73,7 @@ message OpProto { optional bool duplicable = 3 [ default = false ]; optional bool intermediate = 4 [ default = false ]; optional bool dispensable = 5 [ default = false ]; + optional string reuse = 6; } // AttrProto describes the C++ type Attribute. diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc index 85beae775b96c3b7e08a2795bcd0ec79b24faeb4..a1094976f6c0965ac0a601d7e37575969146fdab 100644 --- a/paddle/fluid/framework/init.cc +++ b/paddle/fluid/framework/init.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" @@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); +#ifndef PADDLE_WITH_MKLDNN + operators::math::SetNumThreads(1); +#endif } void InitGLOG(const std::string &prog_name) { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index a56674cbe216e312c4394ef537140122352dc785..d29d8ce1c561e45980d10c17c984ca2ed3b453f3 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { } std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); - if (!platform::is_cpu_place(t.place())) { LoDTensor tt; framework::TensorCopy(t, platform::CPUPlace(), &tt); @@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - os << t.data()[i] << " "; + if (t.type().hash_code() == typeid(float).hash_code()) { + os << t.data()[i] << " "; + } else if (t.type().hash_code() == typeid(int64_t).hash_code()) { + os << t.data()[i] << " "; + } else { + PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); + } } return os; @@ -410,5 +414,38 @@ void LoDTensor::MergeLoDTensor( } } +LoD ConvertToLengthBasedLoD(const LoD &offset_lod) { + LoD length_lod; + length_lod.reserve(offset_lod.size()); + for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) { + std::vector level; + if (offset_lod[lvl].size() > 0) { + level.reserve(offset_lod[lvl].size() - 1); + } + for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) { + level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]); + } + length_lod.push_back(level); + } + return length_lod; +} + +LoD ConvertToOffsetBasedLoD(const LoD &length_lod) { + LoD offset_lod; + offset_lod.reserve(length_lod.size()); + for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) { + std::vector level; + level.reserve(length_lod[lvl].size() + 1); + size_t tmp = 0; + level.push_back(tmp); + for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) { + tmp += length_lod[lvl][idx]; + level.push_back(tmp); + } + offset_lod.push_back(level); + } + return offset_lod; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 1159fee39b0737402c60448dcbe69e7535c9d6e1..4a2729373b5c63176ed1e856f4acf29fd1e73254 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -226,5 +226,19 @@ extern void WriteToRecordIO(recordio::Writer* writer, extern std::vector ReadFromRecordIO( recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx); +/* + * Convert between length-based LoD and offset-based LoD. + * The implementation of LoDTensor class use offset-based LoD. + * However, we want to expose the more user-friendly length-based + * LoD to the Python side instead. + * + * Example: + * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]] + * then length_lod = [[2, 1], [3, 2, 4]] + */ +LoD ConvertToLengthBasedLoD(const LoD& offset_lod); + +LoD ConvertToOffsetBasedLoD(const LoD& length_lod); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 2ceffc93319359683e87e7fec2d18784c9bf02f3..38d3cd96d65f0a54b0ea87b4c677013f3802adfb 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -26,6 +26,20 @@ namespace paddle { namespace framework { +TEST(LoD, PrintLoDTensor) { + LoDTensor tensor1; + tensor1.mutable_data(platform::CPUPlace()); + tensor1.data()[0] = 0.2; + tensor1.data()[1] = 0.5; + LOG(INFO) << tensor1; + + LoDTensor tensor2; + tensor2.mutable_data(platform::CPUPlace()); + tensor2.data()[0] = 1; + tensor2.data()[1] = 2; + LOG(INFO) << tensor2; +} + TEST(LoD, data) { LoD lod{{0, 1, 2}}; lod.push_back({0, 2, 4, 5}); @@ -37,7 +51,7 @@ TEST(LoD, data) { } } -TEST(LodExpand, test) { +TEST(LoD, ExpandLoD) { LoD lod{{0, 2}}; LoDTensor tensor; tensor.set_lod(lod); @@ -228,6 +242,38 @@ TEST(LoD, CheckAbsLoD) { ASSERT_FALSE(CheckAbsLoD(abs_lod0)); } +TEST(LoD, ConvertToLengthBasedLoD) { + LoD offset_lod; + offset_lod.push_back(std::vector({0, 2})); + offset_lod.push_back(std::vector({0, 1, 3})); + offset_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD length_lod = ConvertToLengthBasedLoD(offset_lod); + + LoD expected; + expected.push_back(std::vector({2})); + expected.push_back(std::vector({1, 2})); + expected.push_back(std::vector({2, 2, 1})); + + EXPECT_EQ(length_lod, expected); +} + +TEST(LoD, ConvertToOffsetBasedLoD) { + LoD length_lod; + length_lod.push_back(std::vector({2})); + length_lod.push_back(std::vector({1, 2})); + length_lod.push_back(std::vector({2, 2, 1})); + + LoD offset_lod = ConvertToOffsetBasedLoD(length_lod); + + LoD expected; + expected.push_back(std::vector({0, 2})); + expected.push_back(std::vector({0, 1, 3})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(offset_lod, expected); +} + template static void TestRecordIO() { LoDTensor tensor; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f92769192c218eb7cdc2350ff6e4721b45005806..a190199f1cb1361f67f20c755b8e7ef52c284adc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { need_update_ = true; } +void OpDesc::SetBlocksAttr(const std::string &name, + std::vector blocks) { + this->attrs_[name] = blocks; + need_update_ = true; +} + void OpDesc::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; @@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } + void operator()(const std::vector &v) const { + std::vector blocks_idx; + for (auto blk : v) { + blocks_idx.push_back(blk->ID()); + } + VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); + } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(int64_t v) const { attr_->set_l(v); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index a02d3e269129596f65a2fb346e76c1af7fbead95..74dd8ec002005dd080424b48b5db1a2574a6974f 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -77,6 +77,8 @@ class OpDesc { void SetBlockAttr(const std::string &name, BlockDesc *block); + void SetBlocksAttr(const std::string &name, std::vector blocks); + Attribute GetAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const; diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index b99e82f8c4358b60a014c6fc7c61c9bbb8683834..f1261dee0319440995951d1bee145404186a8ad4 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -17,12 +17,11 @@ limitations under the License. */ namespace paddle { namespace framework { -static OpInfoMap* g_op_info_map = nullptr; - +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex OpInfoMap& OpInfoMap::Instance() { - if (g_op_info_map == nullptr) { - g_op_info_map = new OpInfoMap(); - } + static OpInfoMap* g_op_info_map = new OpInfoMap(); return *g_op_info_map; } } // namespace framework diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index fab20d75f5a45257f243333c1998d7b2549a25f9..f51a184e7bae2283f335fe9462a77b9c5fb831a5 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) { } inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { - return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; + bool ret = + (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); +#ifdef PADDLE_WITH_MKLDNN + // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa + ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN); + ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN); +#endif + return ret; } inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index ae9f4efd44acdcdff2806deea6826e4089459a78..001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -21,6 +21,7 @@ namespace framework { void OpProtoAndCheckerMaker::Validate() { validated_ = true; CheckNoDuplicatedInOutAttrs(); + CheckReuseVars(); } OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( @@ -56,6 +57,24 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { } } +void OpProtoAndCheckerMaker::CheckReuseVars() { + std::unordered_set names; + for (auto& input : proto_->inputs()) { + names.insert(input.name()); + } + auto checker = [&](const std::string& name, const std::string& reused) { + PADDLE_ENFORCE( + names.count(reused), + "Output [%s] reuse Input [%s], but the input is not registered.", name, + reused); + }; + for (auto& output : proto_->outputs()) { + if (output.has_reuse()) { + checker(output.name(), output.reuse()); + } + } +} + void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, OpAttrChecker* attr_checker) { proto_ = proto; diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 8493b9d8b326c71a33b95bf95e5fc1743c686eb7..92f86bb5de520878d0a7b8d7214620580242c061 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include + #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" @@ -64,6 +66,11 @@ class OpProtoAndCheckerMaker { var_->set_dispensable(true); return *this; } + + VariableBuilder &Reuse(const std::string &name) { + var_->set_reuse(name); + return *this; + } }; VariableBuilder AddInput(const std::string &name, const std::string &comment); @@ -89,6 +96,8 @@ class OpProtoAndCheckerMaker { void CheckNoDuplicatedInOutAttrs(); void Validate(); + void CheckReuseVars(); + proto::OpProto *proto_; OpAttrChecker *op_checker_; bool validated_{false}; diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index a8030d377fdb4d4aef74b315e21792dad10fac96..58f70cb39c0d96ed3b9ff35ea132ba75a37f5405 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -47,3 +47,23 @@ TEST(ProtoMaker, DuplicatedInOut) { ASSERT_THROW(proto_maker(&op_proto, &op_checker), paddle::platform::EnforceNotMet); } + +class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + AddOutput("NoOut", "output of test op").Reuse("NotExists"); + } +}; + +TEST(ProtoMaker, InplaceOutput) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestInplaceProtoMaker proto_maker; + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); + // proto_maker(&op_proto, &op_checker); + // proto_maker.Make(); + // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 748317438b44bc4af84f13b25f8e4f88386388fb..43ab227a9478707445892c14723801992d0041aa 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), - DataLayout::kAnyLayout, StringToLibraryType(library_type)); + StringToDataLayout(data_layout), + StringToLibraryType(library_type)); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); constexpr auto size = std::tuple_size>::value; @@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const {} }; -// User can register many kernel in one place. The data type could be different. +// User can register many kernel in one place. The data type could be +// different. template class OpKernelRegistrar : public Registrar { public: @@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register OperatorKernel. */ -#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + __reg_op_kernel_##op_type##_##library_type##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ - #LIBRARY_TYPE); \ - int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ return 0; \ } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f87d5521492418d2daf5b7fba1500c4bb31e10f5..122ee1dab35b8c7d42392a983b5b15b7c1be7869 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static int GetRowSize(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return -1; + } + + if (var->IsType()) { + return var->Get().rows().size(); + } + + return -1; +} + static LoD GetLoD(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); auto default_lod = LoD({{}}); @@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { + VLOG(10) << "- " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } RunImpl(scope, place); + VLOG(10) << "+ " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; if (scope) { + int row_size = GetRowSize(*scope, input.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } @@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; if (scope) { + int row_size = GetRowSize(*scope, output.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } ss << "[" << GetDims(*scope, output.second[i], true) << "]"; ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } @@ -293,6 +316,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { } } +bool ExecutionContext::HasInput(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; + } + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto arg = ins[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + +bool ExecutionContext::HasOutput(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; + } + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto arg = outs[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); @@ -444,10 +499,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); - // TODO(dzhwinter) : reuse ShareLoD in most operators. - // Need to call ShareLayout explicitly in sequence related ops. - // Shall we have a better method to shared info between in/out Tensor? - out_tensor->set_layout(in_tensor.layout()); +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); } void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, @@ -646,8 +716,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); - PADDLE_ENFORCE(tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same.", Type()); + PADDLE_ENFORCE( + tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same. Get %d != %d", Type(), + data_type, tmp); data_type = tmp; } } @@ -665,7 +737,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( OpKernelType OperatorWithKernel::GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place()); + return OpKernelType(expected_kernel_type.data_type_, tensor.place(), + tensor.layout()); } } // namespace framework diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2f480e00c100d579e100de17d3feb957f5ef6167..b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,9 +191,9 @@ class ExecutionContext { return op_.Attr(name); } - bool HasInput(const std::string& name) const { return op_.HasInputs(name); } + bool HasInput(const std::string& name) const; - bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); } + bool HasOutput(const std::string& name) const; size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 50c3468d556bfe05d6c41906cf35cb671f711b1e..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -22,7 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -42,9 +43,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif - - std::vector> var_types_; - bool own_local_scope; + bool own_local_scope_; + bool use_cuda_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -61,69 +61,95 @@ ParallelExecutor::ParallelExecutor( size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; + member_->use_cuda_ = exec_strategy.use_cuda_; // Step 1. Bcast the params to devs. // Create local scopes if (local_scopes.empty()) { - member_->own_local_scope = true; + member_->own_local_scope_ = true; member_->local_scopes_.emplace_back(member_->global_scope_); for (size_t i = 1; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&scope->NewScope()); } } else { - member_->own_local_scope = false; + member_->own_local_scope_ = false; PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); for (size_t i = 0; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); } } + if (member_->use_cuda_) { // Bcast Parameters to all GPUs #ifdef PADDLE_WITH_CUDA - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; - if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); - } - member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); + ncclUniqueId *nccl_id = nullptr; + if (nccl_id_var != nullptr) { + nccl_id = nccl_id_var->GetMutable(); + } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( + member_->places_, nccl_id, num_trainers, trainer_id)); +#else + PADDLE_THROW("Not compiled with CUDA"); #endif - if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 && - local_scopes.empty()) { // Is CUDA + } + + if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToGPUs(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp -#ifdef PADDLE_WITH_CUDA - details::MultiDevSSAGraphBuilder builder( + // Step 2. Create vars in each scope; + std::vector var_infos; + for (auto *var : main_program.Block(0).AllVars()) { + var_infos.emplace_back(); + var_infos.back().name_ = var->Name(); + var_infos.back().type_ = var->GetType(); + var_infos.back().persistable_ = var->Persistable(); + } + + // Step 3. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + details::SSAGraphBuilderFactory builder_factory( member_->places_, loss_var_name, params, member_->local_scopes_, - member_->nccl_ctxs_.get(), build_strategy); + build_strategy); + if (member_->use_cuda_) { +#ifdef PADDLE_WITH_CUDA + builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); #else - details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_, - build_strategy); + PADDLE_THROW("Not compiled with CUDA"); #endif - auto graph = builder.Build(main_program); + } + builder_ = builder_factory.Create(); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, + builder_->Build(main_program))); - // Step 3. Create vars in each scope; - for (auto *var : main_program.Block(0).AllVars()) { - member_->var_types_.emplace_back(var->Name(), var->GetType(), - var->Persistable()); - } + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, std::move(var_infos), + member_->places_, std::move(member_->executor_))); } void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { -#ifdef PADDLE_WITH_CUDA - auto *main_scope = member_->local_scopes_[0]; + // the the initializing bcast, all vars would be bcast from device(0), + // otherwise + // bcast from the specified device. + bool initializing = builder_.get() == nullptr ? true : false; for (auto &var : vars) { - auto *main_var = main_scope->FindVar(var); + int var_dev_id = + builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + if (!initializing && var_dev_id == -1) continue; + + framework::Variable *main_var = nullptr; + if (initializing) { + main_var = member_->local_scopes_[0]->FindVar(var); + } else { + main_var = member_->local_scopes_[var_dev_id]->FindVar(var); + } + if (main_var == nullptr || !main_var->IsType()) { continue; } @@ -131,13 +157,16 @@ void ParallelExecutor::BCastParamsToGPUs( auto &main_tensor = main_var->Get(); auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { +#ifdef PADDLE_WITH_CUDA + std::vector buffers; size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; - if (i == 0) { + + if ((initializing && i == 0) || + (!initializing && static_cast(i) == var_dev_id)) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -145,10 +174,32 @@ void ParallelExecutor::BCastParamsToGPUs( t->Resize(dims); buffer = t->mutable_data(place, main_tensor.type()); } - auto &nccl_ctx = member_->nccl_ctxs_->at(place); - platform::dynload::ncclBcast(buffer, numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + buffers.push_back(buffer); + } + + PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + "variables' buffer size to bcast NOT equal to places"); + { + platform::NCCLGroupGuard guard; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); + if (initializing) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } else { + if (var_dev_id >= 0) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, + var_dev_id, nccl_ctx.comm_, + nccl_ctx.stream()); + } + } + } + member_->nccl_ctxs_->WaitAll(); } + +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif } else { platform::CPUPlace cpu; for (size_t i = 1; i < member_->places_.size(); ++i) { @@ -159,52 +210,15 @@ void ParallelExecutor::BCastParamsToGPUs( paddle::framework::TensorCopy(main_tensor, cpu, t); } } - member_->nccl_ctxs_->WaitAll(); } -#else - PADDLE_THROW("Not compiled with CUDA"); -#endif } void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - // Create local scopes. - for (auto it = member_->local_scopes_.rbegin(); - it != member_->local_scopes_.rend(); ++it) { - auto &scope = *it; - Scope &local_scope = scope->NewScope(); - *scope->Var(details::kLocalExecScopeName)->GetMutable() = - &local_scope; - - for (auto &name_type_pair : member_->var_types_) { - if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) { - continue; - } - - if (std::get<2>(name_type_pair)) { // Persistable - InitializeVariable(scope->Var(std::get<0>(name_type_pair)), - std::get<1>(name_type_pair)); - } else { - InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)), - std::get<1>(name_type_pair)); - } - } - } - auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; - - // Wait All computational streams - for (auto p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &scope : member_->local_scopes_) { - auto &local_scope = - *scope->Var(details::kLocalExecScopeName)->GetMutable(); - scope->DeleteScope(local_scope); - } } void ParallelExecutor::FeedTensorsIntoLocalScopes( @@ -242,7 +256,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { - if (member_->own_local_scope) { + if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { member_->global_scope_->DeleteScope(member_->local_scopes_[i]); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5247e790649e76567f4527d54499d6e95dac5c27..058f83f07c26224e3180d140630c08a24c40cd80 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -19,12 +19,14 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" + namespace paddle { namespace framework { @@ -68,6 +70,7 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; + std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 3a413941df964c8d9454fafc6030c377c10f9fb1..64d4ceab624312ed366d7e835072899f1f033a88 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -35,14 +35,15 @@ class ReaderBase { class DecoratedReader : public ReaderBase { public: - explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { + explicit DecoratedReader(const std::shared_ptr& reader) + : ReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } void ReInit() override { reader_->ReInit(); } protected: - ReaderBase* reader_; + std::shared_ptr reader_; }; class FileReader : public ReaderBase { @@ -64,7 +65,7 @@ class ReaderHolder { public: void Reset(ReaderBase* reader) { reader_.reset(reader); } - ReaderBase* Get() const { return reader_.get(); } + std::shared_ptr Get() const { return reader_; } void ReadNext(std::vector* out) { PADDLE_ENFORCE_NOT_NULL(reader_); @@ -76,7 +77,7 @@ class ReaderHolder { } private: - std::unique_ptr reader_; + std::shared_ptr reader_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9091713158c8071d5386f14250e3c546284e7fd0..50f374e3703a97f6c1fdb4b14fdeb0b603f9ac86 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -34,13 +34,7 @@ DEFINE_bool( namespace paddle { namespace framework { -Scope::~Scope() { - DropKids(); - for (auto& kv : vars_) { - VLOG(3) << "Destroy variable " << kv.first; - delete kv.second; - } -} +Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { std::unique_lock lock(mutex_); @@ -49,45 +43,37 @@ Scope& Scope::NewScope() const { } Variable* Scope::Var(const std::string& name) { - auto* v = FindVarLocally(name); - if (v != nullptr) return v; - v = new Variable(); - vars_[name] = v; - VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); - return v; + std::unique_lock lock(mutex_); + return VarInternal(name); } Variable* Scope::Var(std::string* name) { - auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + std::unique_lock lock(mutex_); + auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { - *name = var_name; + *name = new_name; } - return Var(var_name); + return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - auto var = FindVarLocally(name); - if (var != nullptr) { - return var; - } - return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); + std::unique_lock lock(mutex_); + return FindVarInternal(name); } const Scope* Scope::FindScope(const Variable* var) const { - for (auto& kv : vars_) { - if (kv.second == var) { - return this; - } - } - return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); + std::unique_lock lock(mutex_); + return FindScopeInternal(var); } + void Scope::DropKids() { + std::unique_lock lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } std::vector Scope::LocalVarNames() const { + std::unique_lock lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -110,10 +96,10 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { + std::unique_lock lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { - delete it->second; it = vars_.erase(it); } else { ++it; @@ -123,25 +109,60 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { + std::unique_lock lock(mutex_); + RenameInternal(origin_name, new_name); +} + +std::string Scope::Rename(const std::string& origin_name) const { + std::unique_lock lock(mutex_); + auto new_name = string::Sprintf("%p.%d", this, vars_.size()); + RenameInternal(origin_name, new_name); + return new_name; +} + +Variable* Scope::VarInternal(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + + v = new Variable(); + vars_[name].reset(v); + VLOG(3) << "Create variable " << name; + v->name_ = &(vars_.find(name)->first); + return v; +} + +const Scope* Scope::FindScopeInternal(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second.get() == var) { + return this; + } + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); +} + +void Scope::RenameInternal(const std::string& origin_name, + const std::string& new_name) const { auto origin_it = vars_.find(origin_name); PADDLE_ENFORCE(origin_it != vars_.end(), "Cannot find original variable with name %s", origin_name); auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name] = origin_it->second; + vars_[new_name].reset(origin_it->second.release()); vars_.erase(origin_it); } -std::string Scope::Rename(const std::string& origin_name) const { - auto var_name = string::Sprintf("%p.%d", this, vars_.size()); - Rename(origin_name, var_name); - return var_name; +Variable* Scope::FindVarInternal(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); } Variable* Scope::FindVarLocally(const std::string& name) const { auto it = vars_.find(name); - if (it != vars_.end()) return it->second; + if (it != vars_.end()) return it->second.get(); return nullptr; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index abc82e452d732638a2f7315022074850f299a7ea..e246241c0abfbc7bdcaf38d073cc58fc36a4f737 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -47,15 +47,18 @@ class Scope { Scope& NewScope() const; /// Create a variable with given name if it doesn't exist. + /// Caller doesn't own the returned Variable. Variable* Var(const std::string& name); /// Create a variable with a scope-unique name. + /// Caller doesn't own the returned Variable. Variable* Var(std::string* name = nullptr); void EraseVars(const std::vector& var_names); /// Find a variable in the scope or any of its ancestors. Returns /// nullptr if cannot find. + /// Caller doesn't own the returned Variable. Variable* FindVar(const std::string& name) const; const Scope* parent() const { return parent_; } @@ -78,13 +81,30 @@ class Scope { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; - Variable* FindVarLocally(const std::string& name) const; + protected: + mutable std::unordered_map> vars_; private: // Call Scope::NewScope for a sub-scope. explicit Scope(Scope const* parent) : parent_(parent) {} - mutable std::unordered_map vars_; + // Called by Var. + Variable* VarInternal(const std::string& name); + + // Called by FindScope. + const Scope* FindScopeInternal(const Variable* var) const; + + // Called by Rename. + void RenameInternal(const std::string& origin_name, + const std::string& new_name) const; + + // Called by FindVar recursively. + Variable* FindVarInternal(const std::string& name) const; + + // Called by FindVarInternal and Var. + Variable* FindVarLocally(const std::string& name) const; + + // Scope in `kids_` are owned by this class. mutable std::list kids_; Scope const* parent_{nullptr}; diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index e97ada06f06d0538f17160220e3aa3f4ffc55520..c7286dacf01659f3af0927a71856e5a6496cb877 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -15,5 +15,102 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" namespace paddle { -namespace framework {} +namespace framework { +extern size_t SizeOfType(std::type_index type); +void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +size_t Tensor::memory_size() const { + return holder_ == nullptr ? 0UL : holder_->size() - offset_; +} + +void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_ENFORCE_GE(numel(), 0, + "When calling this method, the Tensor's numel must be " + "equal or larger than zero. " + "Please check Tensor::Resize has been called first."); + int64_t size = numel() * SizeOfType(type); + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_gpu_place(place) || + platform::is_cuda_pinned_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW( + "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); + } +#else + if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_cuda_pinned_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing."); + return mutable_data(place, holder_->type()); +} + +Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +Tensor Tensor::Slice(int begin_idx, int end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +const DDim& Tensor::dims() const { return dims_; } + +int64_t Tensor::numel() const { return product(dims_); } +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6f878541e6de1deec1829145b1b325ecd176a034..ef224d68f1fc561f45e9d7a81425e62655457648 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -34,6 +34,28 @@ namespace framework { class LoDTensor; class Tensor { +#ifdef PADDLE_WITH_MKLDNN + + public: + inline mkldnn::memory::format format() const { return format_; } + + inline void set_format(const mkldnn::memory::format format) { + format_ = format; + } + + protected: + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; +#endif + public: template friend struct EigenTensor; @@ -54,26 +76,24 @@ class Tensor { /*! Return a pointer to mutable memory block. */ template - inline T* data(); + T* data(); /*! Return a pointer to constant memory block. */ template - inline const T* data() const; + const T* data() const; - inline bool IsInitialized() const; - - inline void switch_place(platform::Place new_place); + bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. */ template - inline T* mutable_data(platform::Place place); + T* mutable_data(platform::Place place); - inline void* mutable_data(platform::Place place, std::type_index type); + void* mutable_data(platform::Place place, std::type_index type); - inline void* mutable_data(platform::Place place); + void* mutable_data(platform::Place place); /** * @brief Return a pointer to mutable memory block. @@ -84,19 +104,19 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(DDim dims, platform::Place place); + T* mutable_data(DDim dims, platform::Place place); /*! Return the dimensions of the memory block. */ - inline const DDim& dims() const; + const DDim& dims() const; /*! Return the numel of the memory block. */ - inline int64_t numel() const; + int64_t numel() const; /*! Resize the dimensions of the memory block. */ - inline Tensor& Resize(const DDim& dims); + Tensor& Resize(const DDim& dims); /*! The internal of two tensors share the same memory block. */ - inline Tensor& ShareDataWith(const Tensor& src); + Tensor& ShareDataWith(const Tensor& src); /** * @brief Return a sub-tensor of the given tensor. @@ -106,7 +126,7 @@ class Tensor { * @param[in] end_idx The index of the end row(exclusive) to slice. * The index number begins from 0. */ - inline Tensor Slice(int begin_idx, int end_idx) const; + Tensor Slice(int begin_idx, int end_idx) const; platform::Place place() const { PADDLE_ENFORCE_NOT_NULL( @@ -123,11 +143,11 @@ class Tensor { // memory size returns the holding memory size in byte. size_t memory_size() const; - inline void check_memory_size() const; + void check_memory_size() const; - inline DataLayout layout() const { return layout_; } + DataLayout layout() const { return layout_; } - inline void set_layout(const DataLayout layout) { layout_ = layout; } + void set_layout(const DataLayout layout) { layout_ = layout; } private: /** @@ -197,8 +217,10 @@ class Tensor { * N,C,H,W for respectively the batch size, the number of * feature maps, the height. */ - - DataLayout layout_ = DataLayout::kNHWC; + // Fix me: here just change the default layout to kNCHW + // it doesn't fix the real issue, i.e. feeder should set up tensor layout + // according to actual input data + DataLayout layout_ = DataLayout::kNCHW; /** * @brief A PlaceHolder may be shared by more than one tensor. @@ -210,15 +232,6 @@ class Tensor { size_t offset_; }; -inline void Tensor::switch_place(platform::Place new_place) { - if (holder_->place() == new_place) { - return; - } - - // TODO(tonyyang-svail): do memcpy here. - PADDLE_THROW("Not Implemented"); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 2f19ec0f0a9338e2b96d1f64eac45387bae4d1eb..96114678a9992f2975c4173c7cc003114f04d8df 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -20,21 +20,6 @@ limitations under the License. */ namespace paddle { namespace framework { -extern size_t SizeOfType(std::type_index type); -inline void Tensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()), memory_size(), - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory.\n" - "or maybe the required data-type mismatches the data already stored."); -} - -inline size_t Tensor::memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; -} - template inline const T* Tensor::data() const { check_memory_size(); @@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) { return reinterpret_cast(mutable_data(place, typeid(T))); } -inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { - if (holder_ != nullptr) { - holder_->set_type(type); - } - PADDLE_ENFORCE_GE(numel(), 0, - "When calling this method, the Tensor's numel must be " - "equal or larger than zero. " - "Please check Tensor::Resize has been called first."); - int64_t size = numel() * SizeOfType(type); - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_) { - if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_gpu_place(place) || - platform::is_cuda_pinned_place(place)) { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW( - "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); - } -#else - if (platform::is_gpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_cuda_pinned_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } - } -#endif - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -inline void* Tensor::mutable_data(platform::Place place) { - PADDLE_ENFORCE(this->holder_ != nullptr, - "Cannot invoke mutable data if current hold nothing."); - return mutable_data(place, holder_->type()); -} - -inline Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); - *this = src; - return *this; -} - -inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { - check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - "The start row index must be greater than 0."); - PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); - PADDLE_ENFORCE_LT( - begin_idx, end_idx, - "The start row index must be lesser than the end row index."); - - if (dims_[0] == 1) { - return *this; - } else { - size_t base = numel() / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - dst.set_layout(layout_); - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); - return dst; - } -} - -inline Tensor& Tensor::Resize(const DDim& dims) { - dims_ = dims; - return *this; -} - -inline const DDim& Tensor::dims() const { return dims_; } - -inline int64_t Tensor::numel() const { return product(dims_); } - inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; res.ShareDataWith(src); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) { TEST(Tensor, Layout) { framework::Tensor src; - ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW); src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 4879209ece9fdfea91e484a4118c00a2a2a2b4f7..e099e40f121ff13657e563eb608feecbca0551be 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -35,7 +35,8 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDesc*, int64_t>; + std::vector, BlockDesc*, int64_t, + std::vector>; using AttributeMap = std::unordered_map; diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 9faf5bb3036775a2ba0c08d3d6ea17ffa73753c6..2bb2c8135d8c317388e1a0d711589a390c7e8924 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,17 +1,32 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) -cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc - DEPS paddle_fluid) +cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc + fluid_to_data_flow_graph_pass.cc + data_flow_graph_to_fluid_pass.cc + tensorrt_subgraph_pass.cc + dfg_graphviz_draw_pass.cc + DEPS framework_proto) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) -cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) -set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec) +function (inference_analysis_test TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) -cc_test(test_subgraph_splitter - SRCS subgraph_splitter_tester.cc - DEPS analysis paddle_fluid tensor - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) -set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec) + cc_test(${TARGET} + SRCS "${analysis_test_SRCS}" + DEPS analysis + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) + set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) +endfunction(inference_analysis_test) + +inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) +inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) +inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) +#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) +inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) diff --git a/paddle/fluid/inference/analysis/argument.cc b/paddle/fluid/inference/analysis/argument.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb0263d5d98e86b612696ebde66d17fb2543809b --- /dev/null +++ b/paddle/fluid/inference/analysis/argument.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/argument.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h new file mode 100644 index 0000000000000000000000000000000000000000..f7f4e03968a723df1718bd3752bdd1c3430d02be --- /dev/null +++ b/paddle/fluid/inference/analysis/argument.h @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines the class Argument, which is the input and output of the + * analysis module. All the fields that needed either by Passes or PassManagers + * are contained in Argument. + * + * TODO(Superjomn) Find some way better to contain the fields when it grow too + * big. + */ + +#pragma once + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * The argument definition of both Pass and PassManagers. + * + * All the fields should be registered here for clearness. + */ +struct Argument { + // The graph that process by the Passes or PassManagers. + std::unique_ptr main_dfg; + + // The original program desc. + std::unique_ptr origin_program_desc; +}; + +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ + if (UNLIKELY(!(field__))) { \ + LOG(ERROR) << "field " << #field__ << " should be set."; \ + return false; \ + } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 4220451e3caee62caa51af5bc33d6dd3fd891018..c30a7c26cecbe67f0ca73223e06b2095584aca94 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/inference/analysis/node.h" namespace paddle { namespace inference { @@ -57,19 +58,7 @@ std::string DataFlowGraph::DotString() const { // Add nodes for (size_t i = 0; i < nodes.size(); i++) { const Node &node = nodes.Get(i); - switch (node.type()) { - case Node::Type::kValue: - dot.AddNode(node.repr(), node.dot_attrs()); - break; - case Node::Type::kFunction: - dot.AddNode(node.repr(), node.dot_attrs()); - break; - case Node::Type::kFunctionBlock: - dot.AddNode(node.repr(), node.dot_attrs()); - break; - default: - PADDLE_THROW("unsupported Node type %d", static_cast(node.type())); - } + dot.AddNode(node.repr(), node.dot_attrs()); } // Add edges diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7d4cca2132d11eb89eee5a71ed0a3cc7381e1ff --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/framework/proto_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { + +bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument) + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) + desc_ = argument->origin_program_desc.get(); + // Here some logic from program_desc.cc and will not add new interfaces into + // framework::ProgramDesc class, use some UT to assure the correctness. + auto* block = desc_->mutable_blocks()->Add(); + block->set_idx(framework::kRootBlockIndex); + block->set_parent_idx(framework::kNoneBlockIndex); + return true; +} + +bool DataFlowGraphToFluidPass::Finalize() { return true; } + +void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { + auto traits = GraphTraits(graph); + for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { + if (it->deleted()) continue; + switch (it->type()) { + case Node::Type::kFunction: + LOG(INFO) << "add function " << it->name(); + AddFluidOp(&(*it)); + break; + case Node::Type::kFunctionBlock: + AddEngineOp(&(*it)); + break; + default: + continue; + } + } +} + +void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { + LOG(INFO) << "processing func " << node->name(); + auto* ori_op = static_cast(node->pb_desc()); + // currently only the main block is analyzed. + auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto* op = main_block->add_ops(); + LOG(INFO) << "to copy the op"; + *op = *ori_op; // copy the attributes, by default, these will not be changed + // by analysis phrase. + // The inputs and outputs of the existing ops are not changed by tensorrt + // subgraph pass. + // NOTE It might be changed by other passes in the long run. +} + +void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { + // auto* ori_op = static_cast(node->extra_info()); + // auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + // auto* op = main_block->add_ops(); + // TODO(Superjomn) Here need to expose some arguments for default setting. +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..cbb05f622cc29c99c57e649b1c57cf3e54541191 --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +/* + * This file implements the transformation from fluid ProgramDesc to data flow + * graph. + */ + +#pragma once + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { +class DataFlowGraphToFluidPass final : public DataFlowGraphPass { + public: + DataFlowGraphToFluidPass() = default; + + bool Initialize(Argument *argument) override; + bool Finalize() override; + + void Run(DataFlowGraph *graph) override; + + std::string repr() const override { return "DFG to fluid"; } + std::string description() const override { + return "Transform a DFG to a Fluid ProgramDesc"; + } + + Pass *CreatePrinterPass(std::ostream &os, + const std::string &banner) const override { + return nullptr; + } + + protected: + // Add a Fluid Op into the ProgramDesc. + void AddFluidOp(Node *node); + // Add a EngineOp into the ProgramDesc. + void AddEngineOp(Node *node); + + private: + framework::proto::ProgramDesc *desc_; +}; +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index dcee75cee50ede1d2b660e88e06544440bd5ef77..d8fc5e580a98f76233f01fdc4d7987311f78ee45 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -27,13 +27,12 @@ namespace inference { namespace analysis { TEST_F(DFG_Tester, Test) { - framework::proto::ProgramDesc new_desc; DataFlowGraph graph; FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; - pass0.Initialize(desc); - pass1.Initialize(&new_desc); + ASSERT_TRUE(pass0.Initialize(&argument)); + ASSERT_TRUE(pass1.Initialize(&argument)); pass0.Run(&graph); pass1.Run(&graph); diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..afffb3feb0c515faa554d0d4919c442ca4515294 --- /dev/null +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { + auto content = Draw(graph); + std::ofstream file(GenDotPath()); + file.write(content.c_str(), content.size()); + file.close(); + LOG(INFO) << "draw dot to " << GenDotPath(); +} + +std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { + Dot dot; + // Add nodes + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (config_.display_deleted_node || !node.deleted()) { + dot.AddNode(node.repr(), node.dot_attrs()); + } + } + // Add edges + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (!config_.display_deleted_node && node.deleted()) continue; + for (auto &in : node.inlinks) { + if (!config_.display_deleted_node && in->deleted()) continue; + for (auto &in : node.inlinks) { + dot.AddEdge(in->repr(), node.repr(), {}); + } + } + } + return dot.Build(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..93ebff59ae9691394858f32c822a5e70f3345581 --- /dev/null +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file create an DFG_GraphvizDrawPass which helps to draw a data flow + * graph's structure using graphviz. + */ + +#pragma once + +#include +#include +#include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Output a dot file and write to some place. + */ +class DFG_GraphvizDrawPass : public DataFlowGraphPass { + public: + struct Config { + Config(const std::string &dir, const std::string &id, + bool display_deleted_node = false) + : dir(dir), id(id), display_deleted_node(display_deleted_node) {} + + // The directory to store the .dot or .png files. + const std::string dir; + // The identifier for this dot file. + const std::string id; + // Whether to display deleted nodes, default false. + const bool display_deleted_node; + }; + + DFG_GraphvizDrawPass(const Config &config) : config_(config) {} + + bool Initialize(Argument *argument) override { return true; } + void Run(DataFlowGraph *graph) override; + bool Finalize() override { return Pass::Finalize(); } + + std::string repr() const override { return "DFG graphviz drawer"; } + std::string description() const override { + return "Debug a DFG by draw with graphviz"; + } + + private: + // Path of the dot file to output. + std::string GenDotPath() const { + return config_.dir + "/" + "graph_" + config_.id + ".dot"; + } + + std::string Draw(DataFlowGraph *graph); + + Config config_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4b5c5fd2201cc9ff56d7ee8d8921376c2c9c59e --- /dev/null +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" + +#include +#include +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { + auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); + DFG_GraphvizDrawPass pass(config); + pass.Initialize(&argument); + pass.Run(&dfg); + + // test content + std::ifstream file("./graph_test.dot"); + ASSERT_TRUE(file.is_open()); + + std::string line; + int no{0}; + while (std::getline(file, line)) { + no++; + } + // DFG is sensitive to ProgramDesc, be careful to change the existing models. + ASSERT_EQ(no, 112); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 9f67c989cca4a936cd320b73efaae277263fb3e2..5f62eef52876ac68dfab00348f422a46de123cfe 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -21,19 +21,23 @@ namespace paddle { namespace inference { namespace analysis { -FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {} - -bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); } - -bool FluidToDataFlowGraphPass::Initialize( - const framework::proto::ProgramDesc &desc) { - desc_ = &desc; +bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); + PADDLE_ENFORCE(argument); + if (!argument->main_dfg) { + LOG(INFO) << "Init DFG"; + argument->main_dfg.reset(new DataFlowGraph); + } + desc_ = argument->origin_program_desc.get(); return true; } bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(desc_); // insert vars std::unordered_map var2id; auto &main_block = desc_->blocks(framework::kRootBlockIndex); @@ -41,7 +45,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { const auto &var = main_block.vars(i); auto *v = graph->nodes.Create(Node::Type::kValue); v->SetName(var.name()); - v->SetExtraInfo(const_cast(static_cast(&var))); + v->SetPbDesc(const_cast(static_cast(&var))); var2id[var.name()] = v->id(); } for (int i = 0; i < main_block.ops_size(); i++) { @@ -51,7 +55,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { static_cast(o)->SetFuncType(op.type()); // Link to the original protobuf message's memory, make it easier to // generate from a data flow graph to fluid ProgramDesc. - o->SetExtraInfo(const_cast(static_cast(&op))); + o->SetPbDesc(const_cast(static_cast(&op))); // set inputs and outputs // TODO(Superjomn) make sure the InputNames is the real variable name. for (int j = 0; j < op.inputs_size(); j++) { diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index 33517e57becdffc0416f204247eac5feadb7ed82..176faf0220cc98bf2c0384af75125d4bc493e753 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -34,13 +34,18 @@ namespace analysis { */ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: - FluidToDataFlowGraphPass(); - bool Initialize() override; - bool Initialize(const framework::proto::ProgramDesc &desc) override; + FluidToDataFlowGraphPass() = default; + + bool Initialize(Argument *argument) override; bool Finalize() override; void Run(DataFlowGraph *graph) override; + std::string repr() const override { return "fluid-to-data-flow-graph"; } + std::string description() const override { + return "transform a fluid ProgramDesc to a data flow graph."; + } + Pass *CreatePrinterPass(std::ostream &os, const std::string &banner) const override; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index 817d32c92cdbdc234eef9ed5156891c2b11ced4c..cfbbc284e491bd62a6108d6d14e7896a57d1b63e 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -23,11 +23,11 @@ namespace analysis { TEST_F(DFG_Tester, Init) { FluidToDataFlowGraphPass pass; - pass.Initialize(); - pass.Initialize(desc); + pass.Initialize(&argument); DataFlowGraph graph; pass.Run(&graph); - ASSERT_GT(graph.nodes.size(), 0); + // Analysis is sensitive to ProgramDesc, careful to change the original model. + ASSERT_EQ(graph.nodes.size(), 37); pass.Finalize(); LOG(INFO) << '\n' << graph.DotString(); } diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 153dca576bd6734d62f00c4a7cb9b503506b33e2..f0039e113159fdcc0cc1c209a8bc899bc82984c1 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -18,6 +18,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -60,6 +62,7 @@ struct DataTypeNamer { SET_TYPE(int); SET_TYPE(bool); SET_TYPE(float); + SET_TYPE(void *); } std::unordered_map> data_; }; +template +T &GetFromScope(const framework::Scope &scope, const std::string &name) { + framework::Variable *var = scope.FindVar(name); + PADDLE_ENFORCE(var != nullptr); + return *var->GetMutable(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index fe060526080b1ee01aa98f2ff06fb2191eddf9da..3339b5044df0cf91d00aa9ddad310d4bf263bc3c 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -40,6 +40,9 @@ Node *NodeMap::Create(Node::Type type) { case Node::Type::kValue: nodes_.emplace_back(new Value); break; + case Node::Type::kFunctionBlock: + nodes_.emplace_back(new FunctionBlock); + break; default: PADDLE_THROW("Not supported node type."); } diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 7972ca25c92186a8c55a76de645f4fdbb089e8d3..8c2e6d88b9605d9923d002f73b60cd92b5e551b7 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -71,12 +71,17 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - Attr &attr(const std::string &name) { return attrs_[name]; } + Attr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } - bool deleted() const { return deleted_; } + // The Protobuf description is set/get with a void* to decouple Node interface + // from a specific kind of Protobuf message. + void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } + void *pb_desc() const { return attr("pb_desc").Pointer(); } + void SetDeleted() { deleted_ = true; } + bool deleted() const { return deleted_; } void SetName(const std::string &name) { name_ = name; } const std::string &name() const { return name_; } @@ -84,29 +89,25 @@ class Node { void SetType(Type type) { type_ = type; } Type type() const { return type_; } - void *extra_info() const { return extra_info_; } - void SetExtraInfo(void *extra_info) { extra_info_ = extra_info; } - // Input links. std::vector inlinks; // Output links. std::vector outlinks; // A helper class to maintain the status from Pass. - // TODO(superjomn) add a checker here to ensure the T is primary. struct Attr { // NOTE T should be a primary type or a struct combined by several primary // types. // NOTE the STL containers should not use here. // Some usages - // Attr attr; - // T data; - // attr.data.assign((char*)data, sizeof(data)); + // Attr attr; + // attr.Bool() = true; bool &Bool() { return As(); } float &Float() { return As(); } int32_t &Int32() { return As(); } int64_t &Int64() { return As(); } + void *&Pointer() { return As(); } private: template @@ -130,6 +131,7 @@ class Node { size_t type_hash_{std::numeric_limits::max()}; }; + // Type checks. bool IsFunction() const { return type_ == Node::Type::kFunction; } bool IsValue() const { return type_ == Node::Type::kValue; } bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; } @@ -148,9 +150,6 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - - void *extra_info_; - mutable std::unordered_map attrs_; }; diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index aa0e8667b5e4a9e6156c25fcad03bb8eee3287f6..65632b749177add9dcb297bffad1e85f68a80b02 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/node.h" @@ -30,19 +31,24 @@ namespace analysis { class Pass { public: Pass() = default; - virtual ~Pass() {} + virtual ~Pass() = default; // Virtual method overridden by subclasses to do only necessary initialization // before any pass is run. - virtual bool Initialize() { return false; } + // virtual bool Initialize() { return false; } // There is some passes such as FlowToDataFlowGraphPass that needs a // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it // only couple with the proto file. - virtual bool Initialize(const framework::proto::ProgramDesc &desc) { - return false; - } + // virtual bool Initialize(const framework::proto::ProgramDesc &desc) { return + // false; } // There are some Passes such as DataFlowGraphToFluidPass that will output a // ProgramDesc. - virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; } + // virtual bool Initialize(framework::proto::ProgramDesc *desc) { return + // false; } + + // Mutable Pass. + virtual bool Initialize(Argument *argument) { return false; } + // Readonly Pass. + virtual bool Initialize(const Argument &argument) { return false; } // Virtual method overriden by subclasses to do any necessary clean up after // all passes have run. @@ -50,7 +56,9 @@ class Pass { // Get a Pass appropriate to print the Node this pass operates on. virtual Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const = 0; + const std::string &banner) const { + return nullptr; + } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } @@ -60,6 +68,11 @@ class Pass { virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; } // Run on a single DataFlowGraph. virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; } + + // Human-readable short representation. + virtual std::string repr() const = 0; + // Human-readable long description. + virtual std::string description() const = 0; }; // NodePass process on any Node types. diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..b17c0e0d724ebeea7b84bf63024cd141891a78b4 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void DfgPassManager::RunAll() { + PADDLE_ENFORCE(argument_); + for (auto& pass : data_) { + VLOG(4) << "Running pass [" << pass->repr() << "]"; + pass->Run(argument_->main_dfg.get()); + } +} + +void NodePassManager::RunAll() { + PADDLE_ENFORCE(argument_); + PADDLE_ENFORCE(argument_->main_dfg.get()); + auto trait = + GraphTraits(argument_->main_dfg.get()).nodes_in_DFS(); + for (auto& node : trait) { + for (auto& pass : data_) { + pass->Run(&node); + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..7841c4b9d08001264af9f3a248a96814d1c273c4 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass_manager.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the logic of pass management. The analysis for inference is + * a pipeline of Passes, a PassManager is a agency that helps to manage the + * executation of the Passes. + * + * There are two modes of Passes, the first one is called NodePass and takes + * an Node as input and output; the second one is called DFGPass and takes a + * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in + * the same pipeline, there are two kinds of PassManagers, both takes a DFG as + * input and output a DFG, but the Passes inside are different: + * + * 1. NodePassManager: the passes inside are all NodePasses, it can have + * different graph trivial algorithm, for example, DFS_NodePassManager will + * trigger the passes in depth first order; + * 2. DfgPassManager: the passes inside are all DfgPasses. + */ + +#pragma once + +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * PassManager is the base class for all pass managers, a pass manager has + * several Pass-es registered, and execute them in the linear order. + */ +class PassManager : public OrderedRegistry { + public: + PassManager() = default; + // Call all the passes' Initialize methods. The desc and data_flow_graph are + // globally shared, so pass them as the arguemnts for all the pass managers. + virtual bool Initialize(const Argument& argument) { return false; } + + virtual bool Initialize(Argument* argument) { + argument_ = argument; + for (auto& pass : data_) { + LOG(INFO) << "Initializing pass " << pass->repr(); + if (!pass->Initialize(argument)) { + LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; + return false; + } + } + return true; + } + + // Call all the passes' Finalize methods. + virtual bool Finalize() { + for (auto& pass : data_) { + if (!pass->Finalize()) { + LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]"; + return false; + } + } + return true; + } + + // Run all the passes. + virtual void RunAll() = 0; + + // Short identifier. + virtual std::string repr() const = 0; + // Long description. + virtual std::string description() const = 0; + + virtual ~PassManager() = default; + + protected: + Argument* argument_{nullptr}; +}; + +/* + * A pass manager that process a DFG. + */ +class DfgPassManager : public PassManager { + public: + DfgPassManager() = default; + + void RunAll() override; + + virtual ~DfgPassManager() = default; +}; + +/* + * A pass manager that process a Node each time. + */ +class NodePassManager : public PassManager { + public: + NodePassManager() = default; + + void RunAll() override; + + virtual ~NodePassManager() = default; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..7af6a199514636224f0b8303abea7d398400d278 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +#include + +namespace paddle { +namespace inference { +namespace analysis { + +class TestDfgPassManager final : public DfgPassManager { + public: + TestDfgPassManager() = default; + virtual ~TestDfgPassManager() = default; + // Short identifier. + std::string repr() const override { return "test-pass-manager"; } + // Long description. + std::string description() const override { return "test doc"; } +}; + +class TestNodePassManager final : public NodePassManager { + public: + virtual ~TestNodePassManager() = default; + + std::string repr() const override { return "test-node-pass-manager"; } + std::string description() const override { return "test doc"; } +}; + +class TestNodePass final : public NodePass { + public: + virtual ~TestNodePass() = default; + + bool Initialize(Argument* argument) override { return true; } + + void Run(Node* node) override { + LOG(INFO) << "- Processing node " << node->repr(); + } + + std::string repr() const override { return "test-node"; } + std::string description() const override { return "some doc"; } +}; + +TEST_F(DFG_Tester, DFG_pass_manager) { + TestDfgPassManager manager; + DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); + + manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass); + manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); + manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + + ASSERT_TRUE(manager.Initialize(&argument)); + manager.RunAll(); +} + +TEST_F(DFG_Tester, Node_pass_manager) { + // Pre-process: initialize the DFG with the ProgramDesc first. + FluidToDataFlowGraphPass pass0; + pass0.Initialize(&argument); + pass0.Run(argument.main_dfg.get()); + + TestNodePassManager manager; + manager.Register("test-node-pass", new TestNodePass); + ASSERT_TRUE(manager.Initialize(&argument)); + manager.RunAll(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 0644c0db12e3daabba76dbaad33847f5624b157a..8134494f8bccb132f2ed7d1ba1fb615a298596ed 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -19,22 +19,23 @@ namespace paddle { namespace inference { namespace analysis { +SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { + if (node->type() != Node::Type::kFunction) return false; + const auto* func = static_cast(node); + if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || + func->func_type() == "conv2d" || func->func_type() == "mul" || + func->func_type() == "sigmoid" || func->func_type() == "softmax") { + LOG(INFO) << "sub-graph marked " << node->repr(); + return true; + } + return false; +}; + TEST_F(DFG_Tester, Split) { auto desc = LoadProgramDesc(); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); - SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { - if (node->type() != Node::Type::kFunction) return false; - const auto* func = static_cast(node); - if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || - func->func_type() == "conv2d" || func->func_type() == "mul" || - func->func_type() == "sigmoid" || func->func_type() == "softmax") { - LOG(INFO) << "sub-graph marked " << node->repr(); - return true; - } - return false; - }; ASSERT_GT(dfg.nodes.size(), 5UL); auto subgraphs = SubGraphSplitter(&dfg, teller)(); @@ -62,6 +63,28 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } +TEST_F(DFG_Tester, Fuse) { + auto desc = LoadProgramDesc(); + auto dfg = ProgramDescToDFG(desc); + + size_t count0 = dfg.nodes.size(); + + SubGraphFuse fuse(&dfg, teller); + fuse(); + + int count1 = 0; + for (auto& node : dfg.nodes.nodes()) { + if (node->deleted()) { + LOG(INFO) << "deleted " << node->repr(); + } + count1 += node->deleted(); + } + + // At least one nodes should be deleted. + ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock + ASSERT_EQ(6UL, count1); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7f40d43c922a328febd343cea7240fcb09f3d02 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TensorRTSubGraphPass::TensorRTSubGraphPass( + const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller) + : node_inside_subgraph_teller_(teller) {} + +void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { + SubGraphFuse(graph, node_inside_subgraph_teller_); +} + +} // namespace analysis +} // namespace inference + +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..79e9e2bcc9e626a102dfdab6f1f50c8d58f9bbdd --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/inference/analysis/node.h" +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Parse the graph and replace TensorRT supported nodes with SubGraphNode + */ +class TensorRTSubGraphPass : public DataFlowGraphPass { + public: + // Tell whether to transform a sub-graph into TensorRT. + using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; + + TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); + + bool Initialize(Argument* argument) override { return true; } + + // This class get a sub-graph as input and determine whether to transform this + // sub-graph into TensorRT. + void Run(DataFlowGraph* graph) override; + + private: + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +} // namespace analysis +} // namespace inference +} // paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..d12dcf0d0fe7f9354f7ed1aac924aeab3403e9b8 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(model_dir, "", "inference test model dir"); + +TEST(TensorRTSubGraph, single_pass) { + auto desc = LoadProgramDesc(); + auto dfg = ProgramDescToDFG(desc); + + SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { + if (node->type() != Node::Type::kFunction) return false; + const auto* func = static_cast(node); + if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || + func->func_type() == "conv2d" || func->func_type() == "mul" || + func->func_type() == "sigmoid" || func->func_type() == "softmax") { + LOG(INFO) << "sub-graph marked " << node->repr(); + return true; + } + return false; + }; + + DFG_GraphvizDrawPass::Config config{"./", "test"}; + DFG_GraphvizDrawPass dfg_pass(config); + dfg_pass.Initialize(); + + DFG_GraphvizDrawPass dfg_pass1(config); + dfg_pass1.Initialize(); + + dfg_pass.Run(&dfg); + + TensorRTSubGraphPass trt_pass(std::move(teller)); + trt_pass.Initialize(); + + trt_pass.Run(&dfg); + + dfg_pass1.Run(&dfg); + + // Check the TRT op's block desc + for (auto node : dfg.nodes.nodes()) { + if (node->IsFunctionBlock()) { + } + } +} + +TEST(TensorRTSubGraph, pass_manager) {} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 722fa99a48a5f2b0e778904de0c35977d0ee3cc0..ce1191a567a4198f003520c40bf02487c48c56eb 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -15,33 +15,46 @@ limitations under the License. */ #pragma once #include #include +#include #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/io.h" namespace paddle { namespace inference { + +// Read ProgramDesc from a __model__ file, defined in io.cc +extern void ReadBinaryFile(const std::string& filename, std::string* contents); + namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); static framework::proto::ProgramDesc LoadProgramDesc( const std::string& model_dir = FLAGS_inference_model_dir) { - paddle::platform::CPUPlace place; - paddle::framework::Executor executor(place); - paddle::framework::Scope scope; - auto program = Load(&executor, &scope, model_dir); - return *program->Proto(); + std::string msg; + std::string net_file = FLAGS_inference_model_dir + "/__model__"; + std::ifstream fin(net_file, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); + fin.seekg(0, std::ios::end); + msg.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(msg.at(0)), msg.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(msg); + return program_desc; } static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; - pass.Initialize(desc); + Argument argument; + argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); + pass.Initialize(&argument); pass.Run(&graph); pass.Finalize(); return graph; @@ -49,9 +62,12 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: - void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); } + void SetUp() override { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); + } - framework::proto::ProgramDesc desc; + Argument argument; }; } // namespace analysis diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 65db7c7b5008dcb301e741ec17c3623715e10bb8..6b03ac7119b117e442e6af34c719c8a4f736bde9 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,16 +20,20 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); DEFINE_bool(init_p2p, false, "Whether to init p2p."); +DEFINE_int32(math_num_threads, 1, + "Number of threads used to run math functions."); namespace paddle { namespace inference { void Init(const std::vector argv) { framework::InitGflags(argv); + operators::math::SetNumThreads(FLAGS_math_num_threads); // init devices std::vector devices; std::string token; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 23ca8bfac84f35ebdca2e2a1a8538d366358ca8b..748f5a084e8c880df215a60fe51c835ba5cd3110 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,12 +1,15 @@ # Add TRT tests -nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine) -# This test is not stable -# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 -#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc -# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine -# SERIAL) +nv_library(tensorrt_converter + SRCS mul_op.cc conv2d_op.cc fc_op.cc + DEPS tensorrt_engine mul_op) + +nv_test(test_op_converter SRCS test_op_converter.cc DEPS + ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) + nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 79d01b640a214ed5eb86173a36d5e85a6626066f..e1cace9cc1b06f036f52e82b7b86c99a02d50f50 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { @@ -21,7 +22,8 @@ namespace tensorrt { class ReluOpConverter : public OpConverter { public: ReluOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); @@ -32,12 +34,17 @@ class ReluOpConverter : public OpConverter { nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); - engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0)); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } } }; -REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle + +REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 668d344f1bba1c012dcb42c71b996209b4703d78..8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -22,14 +22,14 @@ class Conv2dOpConverter : public OpConverter { public: Conv2dOpConverter() {} void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) override { + const framework::Scope& scope, bool test_mode) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; } }; -REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle + +REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 45b079559754a8f5c3fe39781b5700a75f425e99..bb603efaf30bb72d74b5583abc45d01a16c076a3 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -56,7 +56,7 @@ void ReorderCKtoKC(TensorRTEngine::Weight& iweights, class FcOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) override { + const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); @@ -106,14 +106,16 @@ class FcOpConverter : public OpConverter { n_output, weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); - engine_->DeclareOutput(layer, 0, output_name); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; -REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle +REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 6bb07709c7ee1c6b29c46425849a4f472d3df59d..3c342957360ad4192d838147bf37e84d233c2629 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -23,9 +23,8 @@ namespace tensorrt { */ class MulOpConverter : public OpConverter { public: - MulOpConverter() {} void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) override { + const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); @@ -37,12 +36,18 @@ class MulOpConverter : public OpConverter { engine_, MatrixMultiply, *const_cast(input1), false, *const_cast(input2), false); - engine_->DeclareOutput(layer, 0, op_desc.Output("Out")[0]); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } } }; -REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle + +USE_OP(mul); +REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 3beafeefd06f24ec50b0e61c1fabe13d7e53f242..6697952051c4b1997ca6b550da17a52e64cb3454 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -34,12 +35,15 @@ class OpConverter { // Converter logic for an op. virtual void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) {} + const framework::Scope& scope, + bool test_mode = false) {} - // Convert a single fluid operaotr and add the corresponding layer to TRT. + // Convert a single fluid operator and add the corresponding layer to TRT. + // test_mode: whether the instance executes in an unit test. void ConvertOp(const framework::proto::OpDesc& op, const std::unordered_set& parameters, - const framework::Scope& scope, TensorRTEngine* engine) { + const framework::Scope& scope, TensorRTEngine* engine, + bool test_mode = false) { framework::OpDesc op_desc(op, nullptr); OpConverter* it{nullptr}; @@ -57,10 +61,11 @@ class OpConverter { PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_desc.Type()); it->SetEngine(engine); - (*it)(op, scope); + (*it)(op, scope, test_mode); } - // convert fluid block to tensorrt network + // Convert a fluid block to tensorrt network, NOTE it just convert operators, + // the INetwork's inputs and outputs should specified in some other modules. void ConvertBlock(const framework::proto::BlockDesc& block, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { @@ -77,6 +82,9 @@ class OpConverter { // TensorRT engine TensorRTEngine* engine_{nullptr}; + protected: + bool test_mode_; + private: // registered op converter map, whose key is the fluid op type, and value is // the pointer position of corresponding OpConverter class. @@ -85,13 +93,24 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; -#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ - struct trt_##op_type__##_converter { \ - trt_##op_type__##_converter() { \ - Registry::Register(#op_type__); \ - } \ - }; \ - trt_##op_type__##_converter trt_##op_type__##_converter__; +#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ + struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ + trt_##op_type__##_converter() { \ + ::paddle::inference:: \ + Registry::Register< \ + ::paddle::inference::tensorrt::Converter__>(#op_type__); \ + } \ + }; \ + trt_##op_type__##_converter trt_##op_type__##_converter__; \ + int TouchConverterRegister_##op_type__() { \ + trt_##op_type__##_converter__.Touch(); \ + return 0; \ + } + +#define USE_TRT_CONVERTER(op_type__) \ + extern int TouchConverterRegister_##op_type__(); \ + static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ + TouchConverterRegister_##op_type__(); } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 86ca2ca08eb14265e1bfe7abd5eb6af5c83b8a5c..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -1,106 +1,47 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" - -USE_OP(relu); +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" namespace paddle { namespace inference { namespace tensorrt { -void Compare(const std::string op_type, float input, float expect) { +TEST(ReluOpConverter, main) { framework::Scope scope; - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - - // init fluid op and variable - auto x_var = scope.Var("X"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({1, 1}); - x_tensor->mutable_data(place); - std::vector init; - init.push_back(input); - framework::TensorFromVector(init, ctx, x_tensor); - - auto out_var = scope.Var("Out"); - auto out_tensor = out_var->GetMutable(); - out_tensor->Resize({1, 1}); - out_tensor->mutable_data(place); - - framework::OpDesc op_desc; - op_desc.SetType(op_type); - op_desc.SetInput("X", {"X"}); - op_desc.SetOutput("Out", {"Out"}); - - auto op = framework::OpRegistry::CreateOp(*op_desc.Proto()); - - // run fluid op - op->Run(scope, place); - // get fluid output - std::vector out1; - framework::TensorToVector(*out_tensor, ctx, &out1); - - // init tensorrt op - cudaStream_t stream; - ASSERT_EQ(0, cudaStreamCreate(&stream)); - TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream); - engine->InitNetwork(); - engine->DeclareInput("X", nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 1, 1}); - // convert op - OpConverter op_converter; - op_converter.ConvertOp(*op_desc.Proto(), engine); - - engine->DeclareOutput("Out"); - engine->FreezeNetwork(); - - // convert LoDTensor to ITensor - size_t size = x_tensor->memory_size(); - EngineIOConverter::ConvertInput(op_type, *x_tensor, - engine->buffer("X").buffer, size, &stream); - // run tensorrt Outp - engine->Execute(1); - // convert ITensor to LoDTensor - EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer, - out_tensor, size, &stream); - // get tensorrt output - std::vector out2; - framework::TensorToVector(*out_tensor, ctx, &out2); - - // compare - ASSERT_EQ(out1[0], out2[0]); - ASSERT_EQ(out1[0], expect); - - delete engine; - cudaStreamDestroy(stream); -} - -TEST(OpConverter, ConvertRelu) { - Compare("relu", 1, 1); // relu(1) = 1 - Compare("relu", -5, 0); // relu(-5) = 0 + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); + validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("relu"); + desc.SetInput("X", {"relu-X"}); + desc.SetOutput("Out", {"relu-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(10); } } // namespace tensorrt } // namespace inference } // namespace paddle -USE_OP(activation); +USE_OP(relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 1d3f5eabb2f839b2acfa9da6527589df1ec3767f..9b79f86b0edba983019bd932f52b08711ff36d41 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -36,3 +36,5 @@ TEST(OpConverter, ConvertBlock) { } // namespace tensorrt } // namespace inference } // namespace paddle + +USE_TRT_CONVERTER(conv2d) diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index d7e05dd5b5b235b7b166b22c5b094dc364e28dfc..3b1f531adc5d756259df1c350f7f44bf71ee1f93 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { @@ -63,7 +64,8 @@ class TRTConvertValidation { TRTConvertValidation(int batch_size, const std::unordered_set& parameters, - framework::Scope& scope, int workspace_size = 1 << 10) + framework::Scope& scope, // NOLINT + int workspace_size = 1 << 10) : parameters_(parameters), scope_(scope) { // create engine. engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); @@ -104,8 +106,8 @@ class TRTConvertValidation { void SetOp(const framework::proto::OpDesc& desc) { op_ = framework::OpRegistry::CreateOp(desc); - OpConverter op_converter; - op_converter.ConvertOp(desc, parameters_, scope_, engine_.get()); + Singleton::Global().ConvertOp( + desc, parameters_, scope_, engine_.get(), true /*test_mode*/); engine_->FreezeNetwork(); @@ -150,7 +152,8 @@ class TRTConvertValidation { // Compare two output ASSERT_FALSE(fluid_out.empty()); for (size_t i = 0; i < fluid_out.size(); i++) { - EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6); + // Loose the threshold for CI in different machine model. + EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } } } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 3d75fefc1a735168131a6c67ac073e80aba32945..596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -43,9 +43,10 @@ void TensorRTEngine::Execute(int batch_size) { } TensorRTEngine::~TensorRTEngine() { + cudaStreamSynchronize(*stream_); // clean buffer for (auto& buf : buffers_) { - if (buf.buffer != nullptr) { + if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); buf.buffer = nullptr; buf.max_size = 0; @@ -80,6 +81,8 @@ void TensorRTEngine::FreezeNetwork() { auto& buf = buffer(item.first); CHECK(buf.buffer == nullptr); // buffer should be allocated only once. PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); + VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " + << buf.buffer; buf.size = buf.max_size = item.second; buf.device = DeviceType::GPU; } @@ -96,6 +99,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE(input->isNetworkInput()); TensorRTEngine::SetITensor(name, input); return input; } @@ -109,7 +113,9 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); + PADDLE_ENFORCE(!output->isNetworkInput()); infer_network_->markOutput(*output); + PADDLE_ENFORCE(output->isNetworkOutput()); // output buffers' size can only be decided latter, set zero here to mark this // and will reset latter. buffer_sizes_[name] = 0; @@ -122,6 +128,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) { auto* output = TensorRTEngine::GetITensor(name); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); + PADDLE_ENFORCE(!output->isNetworkInput()); infer_network_->markOutput(*output); // output buffers' size can only be decided latter, set zero here to mark this // and will reset latter. diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index fabcfd9e80cc0ef2637201a1499ebbe2d6adfd8c..b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { @@ -50,11 +51,12 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream, + TensorRTEngine(int max_batch, int max_workspace, + cudaStream_t* stream = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream), + stream_(stream ? stream : &default_stream_), logger_(logger) {} virtual ~TensorRTEngine(); @@ -120,6 +122,8 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; cudaStream_t* stream_; + // If stream_ is not set from outside, hold its own stream. + cudaStream_t default_stream_; nvinfer1::ILogger& logger_; std::vector buffers_; @@ -131,7 +135,11 @@ class TensorRTEngine : public EngineBase { // TensorRT related internal members template struct Destroyer { - void operator()(T* x) { x->destroy(); } + void operator()(T* x) { + if (x) { + x->destroy(); + } + } }; template using infer_ptr = std::unique_ptr>; @@ -155,6 +163,38 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); +/* + * Helper to control the TensorRT engine's creation and deletion. + */ +class TRT_EngineManager { + public: + bool HasEngine(const std::string& name) const { + return engines_.count(name) != 0; + } + + // Get an engine called `name`. + TensorRTEngine* Get(const std::string& name) const { + return engines_.at(name).get(); + } + + // Create or get an engine called `name` + TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, + const std::string& name) { + auto* p = new TensorRTEngine(max_batch, max_workspace, stream); + engines_[name].reset(p); + return p; + } + + void DeleteALl() { + for (auto& item : engines_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> engines_; +}; + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 987da18116cc6f4902bd66ae317f2470a8bc5057..60c761c5281e2f535aab0200c93fb738addcdb87 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_bool(skip_cpu, false, "Skip the cpu test"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); TEST(inference, image_classification) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { @@ -59,10 +58,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "Batch size is " << FLAGS_batch_size; - LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn; TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined, - FLAGS_use_mkldnn); + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); LOG(INFO) << output1.dims(); } diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 70aa42ac4111c0524a55e26aaefa864338c1d6c1..03b0b6946339772ac535b3471d50fbd74554239d 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,17 +19,17 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLML -#include #include #endif DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); +DECLARE_bool(use_mkldnn); inline double GetCurrentMs() { struct timeval time; @@ -101,23 +101,22 @@ void SplitData( } void ThreadRunInfer( - const int tid, paddle::framework::Executor* executor, - paddle::framework::Scope* scope, - const std::unique_ptr& inference_program, + const int tid, paddle::framework::Scope* scope, const std::vector>& jobs) { - auto copy_program = std::unique_ptr( - new paddle::framework::ProgramDesc(*inference_program)); + // maybe framework:ProgramDesc is not thread-safe + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); auto& sub_scope = scope->NewScope(); + auto inference_program = + paddle::inference::Load(&executor, scope, FLAGS_model_path); - std::string feed_holder_name = "feed_" + paddle::string::to_string(tid); - std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid); - copy_program->SetFeedHolderName(feed_holder_name); - copy_program->SetFetchHolderName(fetch_holder_name); + auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0); + executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0); const std::vector& feed_target_names = - copy_program->GetFeedTargetNames(); + inference_program->GetFeedTargetNames(); const std::vector& fetch_target_names = - copy_program->GetFetchTargetNames(); + inference_program->GetFetchTargetNames(); PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); std::map fetch_targets; @@ -131,9 +130,8 @@ void ThreadRunInfer( auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; - executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets, - true /*create_local_scope*/, true /*create_vars*/, - feed_holder_name, fetch_holder_name); + executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets, + &fetch_targets, false /*create_local_scope*/); } auto stop_ms = GetCurrentMs(); scope->DeleteScope(&sub_scope); @@ -158,27 +156,15 @@ TEST(inference, nlp) { LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size(); LOG(INFO) << "Total number of words: " << num_total_words; - const bool model_combined = false; // 0. Call `paddle::framework::InitDevices()` initialize all the devices - // 1. Define place, executor, scope - auto place = paddle::platform::CPUPlace(); - auto executor = paddle::framework::Executor(place); std::unique_ptr scope( new paddle::framework::Scope()); - // 2. Initialize the inference_program and load parameters - std::unique_ptr inference_program; - inference_program = - InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined); - if (FLAGS_use_mkldnn) { - EnableMKLDNN(inference_program); - } - #ifdef PADDLE_WITH_MKLML // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); - mkl_set_num_threads(1); + paddle::operators::math::SetNumThreads(1); #endif double start_ms = 0, stop_ms = 0; @@ -189,21 +175,27 @@ TEST(inference, nlp) { start_ms = GetCurrentMs(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads.emplace_back( - new std::thread(ThreadRunInfer, i, &executor, scope.get(), - std::ref(inference_program), std::ref(jobs))); + new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs))); } for (int i = 0; i < FLAGS_num_threads; ++i) { threads[i]->join(); } stop_ms = GetCurrentMs(); } else { - if (FLAGS_prepare_vars) { - executor.CreateVariables(*inference_program, scope.get(), 0); - } + // 1. Define place, executor, scope + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, + /*model combined*/ false); // always prepare context std::unique_ptr ctx; ctx = executor.Prepare(*inference_program, 0); - + if (FLAGS_prepare_vars) { + executor.CreateVariables(*inference_program, scope.get(), 0); + } // preapre fetch const std::vector& fetch_target_names = inference_program->GetFetchTargetNames(); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be662da22fe15a79cd9abfe5fa92c9577..44c36b1683b037832a218df02184e7cd2ba143e9 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(use_mkldnn); + template void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, T lower, T upper) { @@ -133,24 +135,11 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } -void EnableMKLDNN( - const std::unique_ptr& program) { - for (size_t bid = 0; bid < program->Size(); ++bid) { - auto* block = program->MutableBlock(bid); - for (auto* op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -} - template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, - const int repeat = 1, const bool is_combined = false, - const bool use_mkldnn = false) { + const int repeat = 1, const bool is_combined = false) { // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); @@ -182,9 +171,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - if (use_mkldnn) { - EnableMKLDNN(inference_program); - } } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, @@ -210,7 +196,10 @@ void TestInference(const std::string& dirname, fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; } - // 6. Run the inference program + // 6. If export Flags_use_mkldnn=True, use mkldnn related ops. + if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program); + + // 7. Run the inference program { if (!CreateVars) { // If users don't want to create and destroy variables every time they diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index d5390529163491c2711e50ffad236534e88b73ee..9b1ab1e228dd758b52975abc4c4aa0bdeadbe2de 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -43,14 +43,16 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { *index = 0; // unlock memory - void* p; + void* p = nullptr; #ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment - PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); + PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!", + size); #else - PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0); + PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!", + size); #endif PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index de6ff29c6f8edbcf930546ff157a1c226e1311db..4c338c67d34fa229de17019ce97e8b8dc39ea737 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -166,8 +166,6 @@ function(op_library TARGET) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "reduce") - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") else() @@ -186,39 +184,39 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() -add_subdirectory(detail) if(WITH_DISTRIBUTE) + add_subdirectory(distributed) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + set(DISTRIBUTE_DEPS "") + if(WITH_GRPC) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + else() + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + endif() + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) - op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") + op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endforeach() + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op - listen_and_serv_op executor SERIAL) - op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) + if(WITH_GRPC) + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + else() + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc) + endif() set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) @@ -227,6 +225,8 @@ op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine) + nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc + DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 46ed99bcf2234f7621d9f00eb48c846d8a355795..137bca5e2b8e2754aed274970e08b03ee816a7f2 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -12,16 +12,20 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "mkldnn.hpp" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/mkldnn_activation_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::stream; +using platform::GetMKLDNNFormat; +using platform::MKLDNNDeviceContext; +using platform::to_void_cast; namespace { std::string gethash(const mkldnn::memory::dims &operand_dims, @@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims, }; return dim2str(operand_dims) + std::to_string(algorithm); } +} // namespace + +template +class MKLDNNActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for Input x tensor"); + + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(ctx); + } +}; -template -void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, - const T alpha = 0, const T beta = 0) { +template +class MKLDNNActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *diff_y = ctx.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && + diff_y->format() != memory::format::format_undef, + "Wrong layout/format set for Input OutGrad tensor"); + + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(ctx); + } +}; + +template +void eltwise_forward(const framework::ExecutionContext &ctx, + mkldnn::algorithm algorithm, const T alpha = 0, + const T beta = 0) { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - // get buffers - const auto *src = ctx.template Input("X"); - const auto *src_data = src->template data(); + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); - auto *dst = ctx.template Output("Out"); - T *dst_data = dst->template mutable_data(ctx.GetPlace()); + const T *x_data = x->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); - // get memory dim - PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4, + PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4, "Input dim must be with 2 or 4"); - std::vector src_tz = framework::vectorize2int(src->dims()); + + std::vector src_tz = framework::vectorize2int(x->dims()); + + auto src_format = + src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format(); const std::string key = gethash(src_tz, algorithm); const std::string key_src_data = key + ctx.op().Output("Out") + "@eltwise_fwd_src_data"; - const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; - const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem"; - const std::string key_fwd = key + "@eltwise_fwd"; + const std::string key_src_layout = + key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout"; + const std::string key_with_layout = key + std::to_string(src_format); + const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem"; + const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem"; + const std::string key_fwd = key_with_layout + "@eltwise_fwd"; + const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd"; + + // save input data and layout to be referred in backward path + auto p_src_data = std::make_shared(x_data); + dev_ctx.SetBlob(key_src_data, p_src_data); + auto p_src_layout = std::make_shared(src_format); + dev_ctx.SetBlob(key_src_layout, p_src_layout); auto p_fwd = std::static_pointer_cast( dev_ctx.GetBlob(key_fwd)); - // save input data to be referred in backward path - auto p_src_data = std::make_shared(src_data); - dev_ctx.SetBlob(key_src_data, p_src_data); + std::shared_ptr dst_memory; if (p_fwd == nullptr) { - // create memory description - auto data_md = src_tz.size() == 2 - ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nc) - : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - // create memory primitives - auto p_src_mem = std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(src_data))); - dev_ctx.SetBlob(key_src_mem, p_src_mem); - - auto p_dst_mem = std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(dst_data))); - dev_ctx.SetBlob(key_dst_mem, p_dst_mem); - - auto fwd_desc = mkldnn::eltwise_forward::desc( - mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); - auto p_fwd_pd = std::make_shared( - fwd_desc, mkldnn_engine); - const std::string key_fwd_pd = key + "eltwise_fwd_pd"; - dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd); - p_fwd = std::make_shared( - *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get())); + // create mkldnn memory for input X + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), src_format); + auto src_memory = std::shared_ptr( + new memory({src_md, mkldnn_engine}, to_void_cast(x_data))); + // save src_memory to be referred in backward path + dev_ctx.SetBlob(key_src_mem, src_memory); + + // create primitive descriptor for activation forward and save it + auto forward_desc = mkldnn::eltwise_forward::desc( + mkldnn::prop_kind::forward_training, algorithm, + src_memory->get_primitive_desc().desc(), alpha, beta); + auto forward_pd = std::make_shared( + forward_desc, mkldnn_engine); + + // save prim desc into global device context to be referred in backward path + dev_ctx.SetBlob(key_fwd_pd, forward_pd); + + // create mkldnn memory for output y + dst_memory = + std::make_shared(forward_pd->dst_primitive_desc(), y_data); + + dev_ctx.SetBlob(key_dst_mem, dst_memory); + + // create activation primitive + p_fwd = std::make_shared(*forward_pd, *src_memory, + *dst_memory); dev_ctx.SetBlob(key_fwd, p_fwd); } else { // primitives already exist - auto p_src_mem = + auto src_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - PADDLE_ENFORCE(p_src_mem != nullptr, - "Fail to find eltwise p_src_mem in device context."); - auto p_dst_mem = + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find eltwise src_memory in device context."); + dst_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); - PADDLE_ENFORCE(p_dst_mem != nullptr, - "Fail to find eltwise p_src_mem in device context."); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find eltwise dst_memory in device context."); - p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data)); - p_dst_mem->set_data_handle(dst_data); + src_memory->set_data_handle(platform::to_void_cast(x_data)); + dst_memory->set_data_handle(y_data); } // push primitive to stream and wait until it's executed - std::vector pipeline = {*(p_fwd.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + pipeline.push_back(*p_fwd); + stream(stream::kind::eager).submit(pipeline).wait(); + + y->set_layout(DataLayout::kMKLDNN); + y->set_format(GetMKLDNNFormat(*dst_memory)); } -template -void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, - const T alpha = 0, const T beta = 0) { +template +void eltwise_grad(const framework::ExecutionContext &ctx, + mkldnn::algorithm algorithm, const T alpha = 0, + const T beta = 0) { auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - // get buffers - const auto *out = ctx.template Input("Out"); - - auto *dout = ctx.template Input(framework::GradVarName("Out")); - const auto *diff_dst = dout->template data(); + const auto *diff_y = ctx.Input(framework::GradVarName("Out")); + auto *diff_x = ctx.Output(framework::GradVarName("X")); - auto *dx = - ctx.template Output(framework::GradVarName("X")); - const T *diff_src = dx->template mutable_data(ctx.GetPlace()); + const T *diff_y_data = diff_y->data(); + T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); - // get memory dim - std::vector src_tz = framework::vectorize2int(out->dims()); + std::vector diff_dst_tz = framework::vectorize2int(diff_y->dims()); - const std::string key = gethash(src_tz, algorithm); - const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem"; - const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem"; - const std::string key_grad = key + "@eltwise_grad"; + auto diff_y_format = + diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format(); + const std::string key = gethash(diff_dst_tz, algorithm); const std::string key_src_data = key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; + const std::string key_src_layout = + key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout"; + const auto p_src_layout = + std::static_pointer_cast(dev_ctx.GetBlob(key_src_layout)); + const std::string key_src_mem = + key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; + const std::string key_fwd_pd = + key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; + const std::string key_with_layouts = + key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format); + const std::string key_diff_src_mem = + key_with_layouts + "@eltwise_diff_src_mem"; + const std::string key_diff_dst_mem = + key_with_layouts + "@eltwise_diff_dst_mem"; + const std::string key_grad = key_with_layouts + "@eltwise_grad"; + const auto p_src_data = std::static_pointer_cast(dev_ctx.GetBlob(key_src_data)); - const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; - auto p_src_mem = + auto src_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - p_src_mem->set_data_handle(*p_src_data.get()); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); + src_memory->set_data_handle(*p_src_data.get()); + + std::shared_ptr diff_src_memory; - auto p_grad = std::static_pointer_cast( + auto p_grad = std::static_pointer_cast( dev_ctx.GetBlob(key_grad)); if (p_grad == nullptr) { - // create memory description - auto data_md = src_tz.size() == 2 - ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nc) - : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - // create memory primitives - std::shared_ptr p_diff_src_mem = - std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(diff_src))); - dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem); - std::shared_ptr p_diff_dst_mem = - std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst))); - dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem); - - auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, - alpha, beta); - - const std::string key_fwd_pd = key + "eltwise_fwd_pd"; - auto *p_fwd_pd = static_cast( - dev_ctx.GetBlob(key_fwd_pd).get()); - - auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc( - bwd_desc, mkldnn_engine, *p_fwd_pd); - + // create mkldnn memory for input diff_y + auto diff_dst_md = platform::MKLDNNMemDesc( + diff_dst_tz, platform::MKLDNNGetDataType(), diff_y_format); + auto diff_dst_memory = std::shared_ptr( + new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data))); + dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); + + // retrieve eltwise primitive desc from device context + auto forward_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_fwd_pd)); + PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_fwd_pd in device context"); + + // ceate primitive descriptor for activation backward + auto backward_desc = mkldnn::eltwise_backward::desc( + algorithm, diff_dst_memory->get_primitive_desc().desc(), + src_memory->get_primitive_desc().desc(), alpha, beta); + auto backward_pd = mkldnn::eltwise_backward::primitive_desc( + backward_desc, mkldnn_engine, *forward_pd); + + // create mkldnn memory for output diff_src + diff_src_memory = std::make_shared( + backward_pd.diff_src_primitive_desc(), diff_x_data); + dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory); + + // create activation backward primitive p_grad = std::make_shared( - eltwise_bwd_prim_desc, *static_cast(p_src_mem.get()), - *(static_cast(p_diff_dst_mem.get())), - *(static_cast(p_diff_src_mem.get()))); + backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory); + dev_ctx.SetBlob(key_grad, p_grad); } else { // primitives already exist - auto p_diff_src_mem = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_diff_src_mem)); - auto p_diff_dst_mem = std::static_pointer_cast( + auto diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_diff_dst_mem)); - p_diff_src_mem->set_data_handle( - platform::to_void_reinterpret_cast(diff_src)); - p_diff_dst_mem->set_data_handle( - platform::to_void_reinterpret_cast(diff_dst)); + diff_src_memory->set_data_handle( + platform::to_void_reinterpret_cast(diff_x_data)); + diff_dst_memory->set_data_handle( + platform::to_void_reinterpret_cast(diff_y_data)); } // push primitive to stream and wait until it's executed - std::vector pipeline = {*(p_grad.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + pipeline.push_back(*p_grad); + stream(stream::kind::eager).submit(pipeline).wait(); + + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format(GetMKLDNNFormat(*diff_src_memory)); } -} // anonymous namespace template struct MKLDNNActivationFunc : public BaseActivationFunctor { - template - void operator()(const ExecContext &ctx) const { + void operator()(const framework::ExecutionContext &ctx) const { eltwise_forward(ctx, algorithm); } }; template struct MKLDNNActivationGradFunc : public BaseActivationFunctor { - template - void operator()(const ExecContext &ctx) const { + void operator()(const framework::ExecutionContext &ctx) const { eltwise_grad(ctx, algorithm); } }; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index dd71c66a75a039429f6e4b1771bb31508bb6b56d..286b03d7b7d11a50f33f0190c1a5b9097ed0f4a2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -19,13 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { +using paddle::framework::Tensor; + #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ class OP_NAME##OpMaker \ : public ::paddle::framework::OpProtoAndCheckerMaker { \ public: \ void Make() override { \ - AddInput("X", "Input of " #OP_NAME "operator"); \ - AddOutput("Out", "Output of" #OP_NAME "operator"); \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \ AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ @@ -58,14 +60,15 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( framework::ToDataType(ctx.Input(name)->type()), ctx.GetPlace(), layout, library); @@ -80,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ "Out"); } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return GetKernelType(ctx, *this, "X"); @@ -94,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return GetKernelType(ctx, *this, "Out"); @@ -110,7 +115,7 @@ $$out = \frac{1}{1 + e^{-x}}$$ __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator -$$out = \log \frac{1}{1 + e^{-x}}$$ +$$out = \\log \\frac{1}{1 + e^{-x}}$$ )DOC"; @@ -131,14 +136,14 @@ $out = \max(x, 0)$ __attribute__((unused)) constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. -$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. -$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; @@ -194,7 +199,7 @@ $out = [x]$ __attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( Reciprocal Activation Operator. -$$out = \frac{1}{x}$$ +$$out = \\frac{1}{x}$$ )DOC"; @@ -250,15 +255,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "Output of Softshrink operator"); AddAttr("lambda", "non-negative offset").SetDefault(0.5f); AddComment(R"DOC( -Softshrink Activation Operator. +:strong:`Softshrink Activation Operator` -$$ -out = \begin{cases} - x - \lambda, \text{if } x > \lambda \\ - x + \lambda, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} -$$ +.. math:: + out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} )DOC"); } @@ -269,18 +273,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input of HardShrink operator"); AddOutput("Out", "Output of HardShrink operator"); - AddAttr("threshold", "The value of threshold for HardShrink") + AddAttr("threshold", + "The value of threshold for HardShrink. [default: 0.5]") .SetDefault(0.5f); AddComment(R"DOC( -HardShrink Activation Operator. +:strong:`HardShrink activation operator` -$$ -out = \begin{cases} - x, \text{if } x > \lambda \\ - x, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} -$$ +.. math:: + out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} )DOC"); } @@ -381,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( STanh Activation Operator. -$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ +$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ )DOC"); } @@ -392,18 +396,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input of ThresholdedRelu operator"); AddOutput("Out", "Output of ThresholdedRelu operator"); - AddAttr("threshold", "The threshold location of activation") + AddAttr("threshold", + "The threshold location of activation. [default 1.0].") .SetDefault(1.0f); AddComment(R"DOC( -ThresholdedRelu Activation Operator. +:strong:`ThresholdedRelu activation operator` -$$ -out = \begin{cases} - x, \text{if } x > threshold \\ - 0, \text{otherwise} - \end{cases} -$$ +.. math:: + out = \begin{cases} + x, \text{if } x > threshold \\ + 0, \text{otherwise} + \end{cases} )DOC"); } }; @@ -442,7 +446,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Swish Activation Operator. -$$out = \frac{x}{1 + e^{- \beta x}}$$ +$$out = \\frac{x}{1 + e^{- \beta x}}$$ )DOC"); } diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 99b0239855d6241b064a5883c2be3d58078b3b61..6ee73c3000fb45b4e1cd5bbb730da7d61b494b6f 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -89,9 +89,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("Moment1Out", "(Tensor) Output first moment"); - AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param"); + AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1"); + AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2"); AddAttr("beta1", "(float, default 0.9) " diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8174d3735859b1fac40cd4c07545f34874d31ab7 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMaxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a147d77a9e9c577984028e1a6ed9582dda622069 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h new file mode 100644 index 0000000000000000000000000000000000000000..6cbdaefeda099c36a864289ef8195c20d09c55e6 --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +enum ArgMinMaxType { kArgMin, kArgMax }; + +template +struct ArgMinMaxFunctor {}; + +#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ + template \ + struct ArgMinMaxFunctor { \ + void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ + framework::LoDTensor* out, int64_t axis) { \ + auto in_eigen = framework::EigenTensor::From(in); \ + auto out_eigen = framework::EigenTensor::From(*out); \ + out_eigen.device(*(ctx.eigen_device())) = \ + in_eigen.eigen_op_type(axis).template cast(); \ + } \ + } + +DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); +DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); + +template +class ArgMinMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMinMaxFunctor \ + functor##rank; \ + functor##rank(dev_ctx, x, &out, axis) + + switch (x.dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_THROW( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + } +}; + +template +using ArgMinKernel = + ArgMinMaxKernel; + +template +using ArgMaxKernel = + ArgMinMaxKernel; + +class ArgMinMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + int64_t axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(), + "'axis' must be inside [-Rank(X), Rank(X))"); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + + std::vector vec; + for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); + for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); + ctx->SetOutputDim("Out", framework::make_ddim(vec)); + } +}; + +class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + virtual const char* OpName() const = 0; + virtual const char* Name() const = 0; + + public: + void Make() override { + AddInput("X", "Input tensor."); + AddOutput("Out", "Output tensor."); + AddAttr("axis", "The axis in which to compute the arg indics."); + AddComment(string::Sprintf(R"DOC( + %s Operator. + + Computes the indices of the %s elements of the input tensor's element + along the provided axis. +)DOC", + OpName(), Name())); + } +}; + +class ArgMinOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMin"; } + const char* Name() const override { return "min"; } +}; + +class ArgMaxOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMax"; } + const char* Name() const override { return "max"; } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..41f188029f17dbe8717afc0ca0760a39edc24b54 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMinOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4d020508505a6ebac8be41ce1e4f99d436b67ab5 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index 4ad6f3443db33fd14b67091d14fd877b951730ff..a757916be7f6ece9b783d51d1051aac6a276795b 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -70,6 +70,7 @@ $$Out = values$$ namespace ops = paddle::operators; -REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, ops::AssignValueKernel); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 0e4a56d4a45a732cfcf43b09228bc0c44df5924c..cc158e57f7140c84f02bc7e091d8eac0d2b672e1 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -19,22 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using batch_norm_bwd = mkldnn::batch_normalization_backward; +using batch_norm_fwd = mkldnn::batch_normalization_forward; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; -using mkldnn::memory; - -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; +using platform::to_void_cast; namespace { template @@ -64,21 +57,12 @@ void run_batch_norm_op(Args &&... args) { mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } -template -inline void *cast_const_to_void(const T *t) { - return static_cast(const_cast(t)); -} } // namespace template class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -99,41 +83,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for Input x tensor"); + + const T *x_data = x->data(); + const T *mean_data = mean->data(); + const T *variance_data = variance->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + T *mean_out_data = mean_out->mutable_data(ctx.GetPlace()); + T *variance_out_data = variance_out->mutable_data(ctx.GetPlace()); + T *batch_mean_data = nullptr; + T *batch_variance_data = nullptr; if (!is_test) { - batch_mean->mutable_data(ctx.GetPlace()); - batch_variance->mutable_data(ctx.GetPlace()); + batch_mean_data = batch_mean->mutable_data(ctx.GetPlace()); + batch_variance_data = batch_variance->mutable_data(ctx.GetPlace()); } auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring : mkldnn::prop_kind::forward_training; - auto dims = paddle::framework::vectorize2int(x->dims()); - - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - - auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine}; - - auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data())}; - auto dst = mkldnn::memory{dst_pd, y->data()}; + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + const unsigned int ic = scale_tz[0]; unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); + + // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = - bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ + propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; + std::shared_ptr batch_norm_fwd_pd = + std::shared_ptr( + new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, + mkldnn_engine)); - const unsigned int ic = dims[1]; + // Save the pd to be used in backward pass + const std::string key = ctx.op().Output("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -143,73 +139,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale->data(), scale->data() + ic, shift->data(), shift->data() + ic, &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // crate mkldnn memory for weights(scale/shift) + auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), + scaleshift_data.data()); - if (is_test) { - auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(mean->data())}; + // create mkldnn memory for output y tensor + auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + if (is_test) { + // create mkldnn memory for stats (as input) + auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), + to_void_cast(mean_data)); auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(variance->data())}; + memory(batch_norm_fwd_pd->variance_primitive_desc(), + to_void_cast(variance_data)); run_batch_norm_op( - batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, + *batch_norm_fwd_pd, src_memory, + (const mkldnn::primitive::at &)mean_memory, (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst); + dst_memory); } else { + // create mkldnn memory for stats (as output) auto mean_memory = - mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; + memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); + auto variance_memory = memory( + batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; - - run_batch_norm_op(batch_norm_fwd_pd, src, - scaleshift_memory, dst, + run_batch_norm_op(*batch_norm_fwd_pd, src_memory, + scaleshift_memory, dst_memory, mean_memory, variance_memory); } if (!is_test) { - const unsigned int in = dims[0]; - const unsigned int sample_size = x->numel() / in / ic; - - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - batch_mean->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap saved_variance_e( - batch_variance->mutable_data(ctx.GetPlace()), ic); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - const unsigned int x_arr_size = in * ic; - ConstEigenArrayMap x_arr(x->data(), sample_size, x_arr_size); - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_mean_e(nc % ic) += x_arr.col(nc).sum(); - } - saved_mean_e /= in * sample_size; - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_variance_e(nc % ic) += - (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm(); - } - saved_variance_e /= in * sample_size; - - ConstEigenVectorArrayMap mean_arr{mean->data(), ic}; - ConstEigenVectorArrayMap variance_arr{variance->data(), ic}; - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), ic); + // mkldnn only compute stats for current batch + // so we need compute momentum stats via Eigen lib + EigenVectorArrayMap batch_mean_e(batch_mean_data, ic); + EigenVectorArrayMap batch_variance_e(batch_variance_data, ic); + ConstEigenVectorArrayMap mean_e(mean_data, ic); + ConstEigenVectorArrayMap variance_e{variance_data, ic}; + + EigenVectorArrayMap running_mean_e(mean_out_data, ic); + EigenVectorArrayMap running_variance_e(variance_out_data, ic); auto one_minus_momentum = 1. - momentum; - running_mean_arr = - mean_arr * momentum + saved_mean_e * one_minus_momentum; - running_var_arr = - variance_arr * momentum + saved_variance_e * one_minus_momentum; + running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum; + running_variance_e = + variance_e * momentum + batch_variance_e * one_minus_momentum; } + + y->set_layout(DataLayout::kMKLDNN); + y->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -217,11 +198,6 @@ template class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - auto &dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); @@ -238,88 +214,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); - diff_x->mutable_data(ctx.GetPlace()); - diff_scale->mutable_data(ctx.GetPlace()); - diff_shift->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && + diff_y->format() != memory::format::format_undef, + "Wrong layout/format set for Input diff_y tensor"); + + const T *x_data = x->data(); + const T *diff_y_data = diff_y->data(); + const T *batch_mean_data = batch_mean->data(); + const T *batch_variance_data = batch_variance->data(); + const T *scale_data = scale->data(); + const T *shift_data = shift->data(); + T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); + T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); + + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto diff_src_tz = src_tz; + auto dst_tz = src_tz; + auto diff_dst_tz = dst_tz; + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + + const unsigned int ic = scale_tz[0]; + + // Retrieve bn_fwd_pd from device context + const std::string key = ctx.op().Input("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - auto dims = paddle::framework::vectorize2int(x->dims()); - unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats; + using bn_bwd_types = bn_type_traits; - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + // create mkldnn memory from input diff_y tensor + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, + mkldnn_engine}, + to_void_cast(diff_y_data)); - using bn_bwd_types = bn_type_traits; - using bn_fwd_types = bn_type_traits; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + // create primitive descriptor for batch norm backward + unsigned flags = mkldnn::use_scale_shift; auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; + mkldnn::prop_kind::backward, diff_dst_md, + src_memory.get_primitive_desc().desc(), epsilon, flags}; auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; - - auto src = mkldnn::memory{{src_md, mkldnn_engine}, - cast_const_to_void(x->data())}; - - auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance = - mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; - - auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine}, - cast_const_to_void(diff_y->data())}; + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = user_diff_dst_memory; + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = memory(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); + is_diff_dst_reordered = true; + } - const unsigned int ic = dims[1]; + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; std::vector scaleshift_data; scaleshift_data.reserve(scaleshift_size); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, + &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // create mkldnn memory for input tensors (scale/shift) + auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), + scaleshift_data.data()); + // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - copy_to_weights(diff_scale->data(), diff_scale->data() + ic, - diff_shift->data(), diff_shift->data() + ic, - &diff_scaleshift_data); - auto diff_scaleshift_memory = - mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()}; - - auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, - static_cast(diff_x->data())}; - - run_batch_norm_op( - batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, - diff_src, diff_scaleshift_memory); - + memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward primitive + auto batch_norm_bwd_prim = + batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, + variance_memory, diff_dst_memory, scaleshift_memory, + diff_src_memory, diff_scaleshift_memory); + + // execute optional reorder and batch_norm backward primitive + std::vector pipeline; + if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); + pipeline.push_back(batch_norm_bwd_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // copy back diff sacle/shift to output tensors (diff scale/shift) + diff_scaleshift_data.resize(scaleshift_size); auto it = std::begin(diff_scaleshift_data); - std::copy(it, std::next(it, ic), diff_scale->data()); + std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), - diff_shift->data()); + diff_shift_data); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() + .desc() + .data.format); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNOpKernel); -REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 6ec8c9d18b466142acdb46b0f46826a2aca7a47e..52b0bf85c07fee380f9e7ba1c703b56367628644 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -22,22 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class BatchNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -110,17 +94,19 @@ class BatchNormOp : public framework::OperatorWithKernel { ctx.Input("Variance")->type()), "Variance input should be of float type"); - framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library_); + library); } }; @@ -149,13 +135,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Variance", "The global variance (for training) " "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization"); + AddOutput("Y", "result after normalization").Reuse("X"); AddOutput("MeanOut", "Share memory with Mean. " - "Store the global mean when training"); + "Store the global mean when training") + .Reuse("Mean"); AddOutput("VarianceOut", "Share memory with Variance. " - "Store the global Variance when training"); + "Store the global Variance when training") + .Reuse("Variance"); AddOutput("SavedMean", "Mean of the current mini batch, " "will apply to output when training") @@ -366,18 +354,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } - framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout, library_); + layout, library); } }; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 9e5fc41598f29336074335f3624a2300ad018d09..5e3d630d6889e445c5e84fa836d2d81bb7266779 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -19,6 +19,22 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + template class BatchNormKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index 483c9f8c2191fa4eb98b91112f9d6753e2fbddc3..fc15d56891cf7af10a91ca22a09c84fa2e52d465 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() final { - AddInput("Input", - "(Tensor) Tensor " - "whose input_dim_idx'th dimension specifies the batch_size"); + AddInput( + "Input", + "Tensor whose input_dim_idx'th dimension specifies the batch_size"); AddOutput("Out", - "(Tensor) Tensor of specified shape will be filled " + "Tensor of specified shape will be filled " "with the specified value"); - AddAttr>("shape", "(vector) The shape of the output"); + AddAttr>("shape", "The shape of the output"); AddAttr("input_dim_idx", - "(int, default 0) The index of input's batch size dimension") + "default 0. The index of input's batch size dimension") .SetDefault(0); AddAttr("output_dim_idx", - "(int, default 0) The index of output's batch size dimension") + "default 0. The index of output's batch size dimension") .SetDefault(0); Apply(); } diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index c3dd22119ddab8ecf9213ee274e4cbd4f05e78fd..10d678111f5325e495b24286e6ecf651230393fe 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_decode_op.h" +#include #include + +#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -22,8 +24,11 @@ namespace operators { struct BeamSearchDecodeFunctor { BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, - LoDTensor* id_tensor, LoDTensor* score_tensor) - : step_ids_origin_(step_ids), + LoDTensor* id_tensor, LoDTensor* score_tensor, + size_t beam_size, int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), step_scores_origin_(step_scores), id_tensor_(id_tensor), score_tensor_(score_tensor) { @@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_id : step_ids_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_id.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } out.set_lod(step_id.lod()); step_ids_.push_back(out); @@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_score : step_scores_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_score.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, + &out); + dev_ctx->Wait(); + } out.set_lod(step_score.lod()); step_scores_.push_back(out); @@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor { void operator()() const; bool tensor_on_gpu_; + size_t beam_size_; + int end_id_; const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); @@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::operator()() const { - BeamSearchDecoder beam_search_decoder; + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. If so, use the CPU copy instead if (tensor_on_gpu_) { - beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - score_tensor_); + beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, + score_tensor_); } else { - beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, - id_tensor_, score_tensor_); + beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); } } @@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase { "Level of LodTensor should be 2"); } + size_t beam_size = ctx.Attr("beam_size"); + int end_id = ctx.Attr("end_id"); + // prepare output LoDTensor* sentenceIds = ctx.Output("SentenceIds"); LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( framework::ToDataType(scores->at(0).type()), - BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, + beam_size, end_id)); } }; @@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Ids", "(LodTensorArray)" - "score of the candidate words in each step"); + "The LodTensorArray containing the selected ids of all steps"); AddInput("Scores", "(LodTensorArray)" - "score of the candidate words in each step"); - AddOutput("SentenceIds", - "(LodTensor)" - "All possible result sentences of word ids"); - AddOutput("SentenceScores", - "(LodTensor)" - "All possible result sentences of word scores"); + "The LodTensorArray containing the selected scores of all steps"); + AddOutput( + "SentenceIds", + "(LodTensor)" + "An LodTensor containing all generated id sequences for all source " + "sentences"); + AddOutput( + "SentenceScores", + "(LodTensor)" + "An LodTensor containing scores corresponding to Output(SentenceIds)"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); AddComment(R"DOC( -Pack the result of Beam search op into SentenceIds and SentenceScores. +Beam Search Decode Operator. This Operator constructs the full hypotheses for +each source sentence by walking back along the LoDTensorArray Input(ids) +whose lods can be used to restore the path in the beam search tree. + +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each +hypothesis has. )DOC"); } }; @@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_ids = block->FindRecursiveOrCreateVar(o); + sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_scores = block->FindRecursiveOrCreateVar(o); + sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 3c01f81c83555b985bb6b7a9e3330ab594a62863..6aefc5446f167eebb0da673b3fbdf7ed128daa98 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. -// The First is source level, the second is sentence level. -// source level describe how many candidate words for this source. -// sentence level describe these candidates belong to which prefix +// The first is source level, the second is sentence level. +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; const size_t kSentenceLevel = 1; -template -struct BeamNode { - BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} - - ~BeamNode() { - if (parent_) { - parent_->DropKid(this); - if (parent_->kids_.size() == 0UL) { - delete parent_; - } - } - VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; - } - - void AppendTo(BeamNode* parent) { - parent_ = parent; - parent->kids_.insert(this); - } - - void DropKid(BeamNode* kid) { kids_.erase(kid); } - - BeamNode* parent_ = nullptr; - std::unordered_set kids_; - int64_t word_id_; - T score_; -}; - -template -using BeamNodeVector = std::vector>>; - template struct Sentence { std::vector word_ids; @@ -72,24 +44,8 @@ using SentenceVector = std::vector>; template struct BeamSearchDecoder { - /** - * make a BeamNode and all it's related prefix BeanNode into a Sentence. - */ - Sentence MakeSentence(const BeamNode* node) const; - - /** - * Param: - * cur_ids: LoDTensor of One step for word ID - * cur_scores: LoDTensor of One Step for word score - * prefixes_list: prefixes for each source sentence. - * sentence_vector_list: result sentence_vector for each source sentence. - * Return: - * a new prefixes list for each source of current step - */ - std::vector> PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const; + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} /** * convert the result sentence_vector for each source sentence into two @@ -100,107 +56,30 @@ struct BeamSearchDecoder { * sentence_vector_list: sentence_vector for each source sentence. * id_tensor: result LoDTensor for sentences of id. * score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. */ void ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; + LoDTensor* score_tensor, bool reverse = true, + bool sort_by_score = true) const; /** - * Pack all steps of id/score LodTensor into sentence LoDTensor - * it's main logic is: - * ```python - * prefix - * result_sentence - * result_lod_tensor - * - * for (step in steps): - * prefix = PackTwoSteps(prefix, step, &result_sentence) - * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) - * ``` + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. */ - void PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; -}; - -template -Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { - Sentence sentence; - while (node != nullptr) { - sentence.word_ids.emplace_back(node->word_id_); - sentence.scores.emplace_back(node->score_); - node = node->parent_; - } - - std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); - std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); - - return sentence; -} - -template -std::vector> BeamSearchDecoder::PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const { - std::vector> result; + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; - for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; - ++src_idx) { - size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - - BeamNodeVector beam_nodes; - - // if prefixes size is 0, it means this is the first step. In this step, - // all candidate id is the start of candidate sentences. - if (prefixes_list->empty()) { - PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), - cur_ids.lod().at(kSentenceLevel).back(), - "in the first step"); - for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { - beam_nodes.push_back(std::unique_ptr>(new BeamNode( - cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); - } - } else { - BeamNodeVector& prefixes = prefixes_list->at(src_idx); - SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; - - PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), - "prefix and candidate set number should be the same"); - - auto candidate_offset = cur_ids.lod()[kSentenceLevel]; - for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { - std::unique_ptr>& prefix = prefixes[prefix_idx]; - size_t candidate_start = candidate_offset[src_start + prefix_idx]; - size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; - if (candidate_start == candidate_end) { - VLOG(3) << "this sentence has no more candidate, " - "add to result sentence and rm it from beam tree"; - sentence_vector.push_back(MakeSentence(prefix.get())); - prefix.reset(); - } else { - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; ++candidate_idx) { - auto* candidate = - new BeamNode(cur_ids.data()[candidate_idx], - cur_scores.data()[candidate_idx]); - candidate->AppendTo(prefix.get()); - beam_nodes.push_back(std::unique_ptr>(candidate)); - } - prefix.release(); - } - } - } - result.push_back(std::move(beam_nodes)); - } - return result; -} + size_t beam_size_; + int end_id_; +}; template void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const { + LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { size_t src_num = sentence_vector_list.size(); PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); @@ -211,11 +90,29 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } for (Sentence& sentence : sentence_vector_list[src_idx]) { - id_data.insert(id_data.end(), sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert(score_data.end(), sentence.scores.begin(), - sentence.scores.end()); + if (reverse) { + id_data.insert(id_data.end(), sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert(score_data.end(), sentence.scores.rbegin(), + sentence.scores.rend()); + } else { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + } + sentence_level_lod.push_back(sentence_level_lod.back() + sentence.word_ids.size()); } @@ -243,39 +140,75 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( } template -void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), "step_ids and step_scores should be the same"); const size_t step_num = step_ids.size(); const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list(src_num); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } - PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); - - // previous prefixes for each step, - // the init length is 0, means this is the first step. - std::vector> beamnode_vector_list(0); - std::vector> sentence_vector_list(src_num); - - // pack all steps for one batch first, then another batch - for (size_t step_id = 0; step_id < step_num; ++step_id) { - beamnode_vector_list = - PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), - &beamnode_vector_list, &sentence_vector_list); - } - // append last beam_node to result - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - for (auto& beam_node : beamnode_vector_list.at(src_idx)) { - sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); - beam_node.reset(); + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } } } ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, - score_tensor); + score_tensor, true, true); } } // namespace operators diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index 36f9594969c416c694928811012baf94332bbd91..88339e38d89db3f79abf232d6b0d035b759739a6 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD; using LoDTensor = paddle::framework::LoDTensor; using LoDTensorArray = paddle::framework::LoDTensorArray; -template -using BeamNode = paddle::operators::BeamNode; template using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; template using Sentence = paddle::operators::Sentence; template -using BeamNodeVector = paddle::operators::BeamNodeVector; -template using SentenceVector = paddle::operators::SentenceVector; namespace paddle { @@ -77,138 +73,50 @@ void GenerateExample(const std::vector& level_0, } // namespace test } // namespace paddle -TEST(BeamSearchDecodeOp, DeleteBeamNode) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* b2 = new BeamNode(2, 2); - auto* b3 = new BeamNode(3, 3); - - b1->AppendTo(root); - b2->AppendTo(root); - b3->AppendTo(b1); - - delete b3; - delete b2; -} - -TEST(BeamSearchDecodeOp, MakeSentence) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* end = new BeamNode(2, 2); - b1->AppendTo(root); - end->AppendTo(b1); - - BeamSearchDecoder helper; - Sentence sentence = helper.MakeSentence(end); - delete end; - - std::vector expect_ids = {0, 1, 2}; - ASSERT_EQ(sentence.word_ids, expect_ids); - - std::vector expect_scores = {0, 1, 2}; - ASSERT_EQ(sentence.scores, expect_scores); -} - -TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { - CPUPlace place; - - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample( - std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - BeamSearchDecoder helper; - beamnode_vector_list = helper.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - ASSERT_EQ(beamnode_vector_list.size(), 2UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); -} - -TEST(BeamSearchDecodeOp, PackTwoSteps) { - CPUPlace place; - - // first source has three prefix - BeamNodeVector source0_prefixes; - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(1, 1))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(0, 0))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(3, 3))); - - // second source has two prefix - BeamNodeVector source1_prefixes; - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(4, 4))); - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(5, 5))); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - beamnode_vector_list.push_back(std::move(source0_prefixes)); - beamnode_vector_list.push_back(std::move(source1_prefixes)); - - // generate data for one step - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample(std::vector{0, 3, 5}, - std::vector{0, 1, 1, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); - - BeamSearchDecoder helper1; - beamnode_vector_list = helper1.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - - ASSERT_EQ(sentence_vector_list[0].size(), 1UL); - ASSERT_EQ(sentence_vector_list[1].size(), 0UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); -} - -TEST(BeamSearchDecodeOp, PackAllSteps) { +TEST(BeamSearchDecodeOp, Backtrace) { CPUPlace place; - // we will constuct a sample data with 3 steps and 2 source sentences + // Construct sample data with 5 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 LoDTensorArray ids; LoDTensorArray scores; paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + std::vector{0, 1, 2}, std::vector{0, 1, 2}, + std::vector{0, 0}, &ids, &scores); // start with start_id + paddle::test::GenerateExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, - std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); - paddle::test::GenerateExample(std::vector{0, 3, 6}, - std::vector{0, 0, 1, 2, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, + 2}, // the branchs of the first source sentence + // are pruned since finished + std::vector{5, 1}, + &ids, &scores); - ASSERT_EQ(ids.size(), 3UL); - ASSERT_EQ(scores.size(), 3UL); + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); - BeamSearchDecoder helper; + BeamSearchDecoder helper(2, 1); // beam_size = 2, end_id = 1 LoDTensor id_tensor; LoDTensor score_tensor; - helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + helper.Backtrace(ids, scores, &id_tensor, &score_tensor); LoD lod = id_tensor.lod(); - std::vector expect_source_lod = {0, 4, 8}; + std::vector expect_source_lod = {0, 2, 4}; EXPECT_EQ(lod[0], expect_source_lod); - std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; EXPECT_EQ(lod[1], expect_sentence_lod); - // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 - std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, - 4, 3, 2, 4, 4, 3, 6, 5, 4}; + std::vector expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4, + 5, 3, 5, 0, 4, 5, 3, 1}; ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); for (size_t i = 0; i < expect_data.size(); ++i) { ASSERT_EQ(id_tensor.data()[i], diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index df0b50881f4e3ec6f57bdb2b63033931059c486e..62771d09f112785ca1ba741a0ba239b1f0234633 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" - #include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto &high_level = abs_lod[lod_level_]; - auto items = SelectTopBeamSizeItems(); + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = ToMap(items, high_level.back()); VLOG(3) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { @@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, VLOG(3) << ItemToString(item); } } - PruneEndidCandidates(pre_ids, &selected_items); + + PruneEndBeams(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( std::begin(selected_items), std::end(selected_items), 0, @@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, size_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); - sort(items.begin(), items.end(), [](const Item &a, const Item &b) { - if (a.offset < b.offset) { - return true; - } - return a.id < b.id; - }); for (auto &item : items) { ids_data[low_offset] = item.id; scores_data[low_offset] = item.score; @@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, selected_scores->set_lod(lod); } -int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, - std::vector> *items) { +void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, + std::vector> *items) { auto *pre_ids_data = pre_ids.data(); - - int res = 0; - for (size_t offset = 0; offset < items->size(); offset++) { - auto prefix_id = pre_ids_data[offset]; - if (prefix_id == end_id_) { - items->at(offset).clear(); - } else { - res++; + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id_) || + pre_ids_data[offset] != end_id_) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) + items->at(offset).clear(); } } - - return res; } std::vector> BeamSearch::ToMap( @@ -115,19 +121,17 @@ std::vector> BeamSearch::ToMap( return result; } -std::vector> -BeamSearch::SelectTopBeamSizeItems() { +std::vector> BeamSearch::SelectTopBeamSizeItems( + const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores) { std::vector> result; std::vector items; // for each source sentence, select the top beam_size items across all // candidate sets. - while (NextItemSet(&items)) { - std::nth_element(std::begin(items), std::begin(items) + beam_size_, - std::end(items), [](const Item &a, const Item &b) { - // TODO(superjom) make score's comparation customizable. - // partial sort in descending order - return a.score > b.score; - }); + while (NextItemSet(pre_ids, pre_scores, &items)) { + std::nth_element( + std::begin(items), std::begin(items) + beam_size_, std::end(items), + [](const Item &a, const Item &b) { return a.score > b.score; }); // prune the top beam_size items. if (items.size() > beam_size_) { items.resize(beam_size_); @@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() { } // the candidates of a source -bool BeamSearch::NextItemSet(std::vector *items) { +bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, + std::vector *items) { if (sent_offset_ >= ids_->NumElements(lod_level_)) { return false; } @@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector *items) { instance_dim *= ids.dims()[i]; } + auto *pre_ids_data = pre_ids.data(); + auto *pre_scores_data = pre_scores.data(); items->clear(); items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id_) { + // Allocate all probability mass to eos_id for finished branchs and the + // other candidate ids can be ignored. + items->emplace_back(offset, end_id_, pre_score); + } else { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } } } @@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // inputs and outputs stored in proto - AddInput("pre_ids", "ids in previous step"); - AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("pre_ids", + "(LoDTensor) The LoDTensor containing the selected ids at the " + "previous step. It should be a tensor with shape (batch_size, 1) " + "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " + "thefirst step."); + AddInput("pre_scores", + "(LoDTensor) The LoDTensor containing the accumulated " + "scores corresponding to the selected ids at the previous step."); + AddInput("ids", + "(LoDTensor) The LoDTensor containing the candidates ids. Its " + "shape should be (batch_size * beam_size, K), where K supposed to " + "be beam_size."); AddInput("scores", - "a LoDTensor that has the same shape and LoD with `ids`"); + "(LoDTensor) The LodTensor containing the accumulated scores " + "corresponding to Input(ids) and its shape is the same as the " + "shape of Input(ids)."); AddOutput("selected_ids", - "a LoDTensor that stores the IDs selected by beam search"); - AddOutput( - "selected_scores", - "a LoDTensor that has the same shape and LoD with `selected_ids`"); + "A LodTensor that stores the IDs selected by beam search."); + AddOutput("selected_scores", + "A LoDTensor containing the accumulated scores corresponding to " + "Output(selected_ids)."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); @@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("end_id", "the token id which indicates the end of a sequence"); - AddComment( - "This is a beam search operator that help to generate sequences."); + AddComment(R"DOC( +This operator does the search in beams for one time step. +Specifically, it selects the top-K candidate word ids of current step from +Input(ids) according to their Input(scores) for all source sentences, +where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results +from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores) +are the output of beam_search at previous step, they are needed for special use +to handle ended candidate translations. The paths linking prefixes and selected +candidates are organized and reserved in lod. + +Note that the Input(scores) passed in should be accumulated scores, and +length penalty should be done with extra operators before calculating the +accumulated scores if needed, also suggest finding top-K before it and +using the top-K candidates following. +)DOC"); } }; @@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("selected_ids")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_ids = block->FindRecursiveOrCreateVar(o); + selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto &o : op_desc.Output("selected_scores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_scores = block->FindRecursiveOrCreateVar(o); + selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 46bc4f6f936929050276e8b3b93f1eddd62ac638..b5e2ed05924cc8b7bc06058b9b1103ba10be486e 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -132,6 +132,7 @@ class BeamSearch { * that means no candidates is provided, and the task will stop running. */ void operator()(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); /* @@ -153,14 +154,16 @@ class BeamSearch { protected: /* - * Delete all the records that follows the end token. + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. */ - int PruneEndidCandidates(const framework::LoDTensor& pre_ids, - std::vector>* items); + void PruneEndBeams(const framework::LoDTensor& pre_ids, + std::vector>* items); /* * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance + * NOTE low performance. */ std::vector> ToMap( const std::vector>& inputs, size_t element_num); @@ -168,12 +171,16 @@ class BeamSearch { /* * For each source, select top beam_size records. */ - std::vector> SelectTopBeamSizeItems(); + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores); /* * Get the items of next source sequence, return false if no remaining items. */ - bool NextItemSet(std::vector* items); + bool NextItemSet(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, + std::vector* items); private: size_t beam_size_; @@ -192,24 +199,25 @@ template class BeamSearchOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* ids_var = context.Input("ids"); - auto* scores_var = context.Input("scores"); - auto* pre_ids_var = context.Input("pre_ids"); - PADDLE_ENFORCE_NOT_NULL(ids_var); - PADDLE_ENFORCE_NOT_NULL(scores_var); - PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + auto* ids = context.Input("ids"); + auto* scores = context.Input("scores"); + auto* pre_ids = context.Input("pre_ids"); + auto* pre_scores = context.Input("pre_scores"); + PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); + PADDLE_ENFORCE_NOT_NULL(pre_ids); + PADDLE_ENFORCE_NOT_NULL(pre_scores); size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); - auto selected_ids_var = - context.Output("selected_ids"); - auto selected_scores_var = + BeamSearch alg(*ids, *scores, level, beam_size, end_id); + auto selected_ids = context.Output("selected_ids"); + auto selected_scores = context.Output("selected_scores"); - PADDLE_ENFORCE_NOT_NULL(selected_ids_var); - PADDLE_ENFORCE_NOT_NULL(selected_scores_var); - alg(*pre_ids_var, selected_ids_var, selected_scores_var); + PADDLE_ENFORCE_NOT_NULL(selected_ids); + PADDLE_ENFORCE_NOT_NULL(selected_scores); + alg(*pre_ids, *pre_scores, selected_ids, selected_scores); } }; } // namespace operators diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index ec666359aa2bd81f1323b54f9a03235740c3a696..c4f4b478fbfc87e4178155132781214575c1e6b0 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,7 +30,7 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 1, 4}); + vector level0({0, 2, 4}); vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); @@ -64,17 +64,22 @@ TEST(beam_search_op, run) { for (int i = 0; i < 4; i++) { pre_ids.mutable_data(place)[i] = i + 1; } + LoDTensor pre_scores; + pre_scores.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); + } - BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); LoDTensor sids, sscores; - beamsearch(pre_ids, &sids, &sscores); + beamsearch(pre_ids, pre_scores, &sids, &sscores); LOG(INFO) << "score: " << sscores << endl; ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids({2, 4, 3, 8}); - vector tscores({0.3, 0.5, 0.9, 0.7}); + vector tids({4, 2, 3, 8}); + vector tscores({0.5, 0.6, 0.9, 0.7}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 3321adf2743c28f6eeca8b5cc91ef89beed6b97c..2dc3399da183fbcf7664066f6f7ce12db3dc6d5e 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor) The input tensor of bilinear interpolation, " + "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of (N x C x h x w)"); AddInput("OutSize", - "(Tensor) This is a 1-D tensor with two number. " + "This is a 1-D tensor with two number. " "The first number is height and the second number is width.") .AsDispensable(); - AddOutput("Out", - "(Tensor) The dimension of output is (N x C x out_h x out_w]"); + AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); - AddAttr("out_h", "(int) output height of bilinear interpolation op."); - AddAttr("out_w", "(int) output width of bilinear interpolation op."); + AddAttr("out_h", "output height of bilinear interpolation op."); + AddAttr("out_w", "output width of bilinear interpolation op."); AddComment(R"DOC( Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and @@ -111,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, ops::BilinearInterpOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); -REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel); +REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel, + ops::BilinearInterpKernel); REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::BilinearInterpGradKernel); diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h index 8b03cd5a0635584a45782fe5a4823c37fe4fa8e8..70847cb8c1abe2e94bc844ab8117d1f23fea533b 100644 --- a/paddle/fluid/operators/bilinear_interp_op.h +++ b/paddle/fluid/operators/bilinear_interp_op.h @@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel { int in_chw = channels * in_hw; int out_chw = channels * out_hw; - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; if (in_h == out_h && in_w == out_w) { memcpy(output, input, input_t->numel() * sizeof(T)); @@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel { for (int i = 0; i < out_h; ++i) { // loop for images int h = ratio_h * i; int hid = (h < in_h - 1) ? 1 : 0; - T h1lambda = ratio_h * i - h; - T h2lambda = 1 - h1lambda; + float h1lambda = ratio_h * i - h; + float h2lambda = 1.f - h1lambda; for (int j = 0; j < out_w; ++j) { int w = ratio_w * j; int wid = (w < in_w - 1) ? 1 : 0; - T w1lambda = ratio_w * j - w; - T w2lambda = 1 - w1lambda; + float w1lambda = ratio_w * j - w; + float w2lambda = 1.f - w1lambda; // calculate four position for bilinear interpolation const T* in_pos = &input[k * in_chw + h * in_w + w]; T* out_pos = &output[k * out_chw + i * out_w + j]; for (int c = 0; c < channels; ++c) { // loop for channels // bilinear interpolation - out_pos[0] = + out_pos[0] = static_cast( h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + h1lambda * (w2lambda * in_pos[hid * in_w] + - w1lambda * in_pos[hid * in_w + wid]); + w1lambda * in_pos[hid * in_w + wid])); in_pos += in_hw; out_pos += out_hw; } @@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel { int in_chw = channels * in_hw; int out_chw = channels * out_hw; - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; if (in_h == out_h && in_w == out_w) { memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); @@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel { for (int i = 0; i < out_h; ++i) { // loop for images int h = ratio_h * i; int hid = (h < in_h - 1) ? 1 : 0; - T h1lambda = ratio_h * i - h; - T h2lambda = 1 - h1lambda; + float h1lambda = ratio_h * i - h; + float h2lambda = 1 - h1lambda; for (int j = 0; j < out_w; ++j) { int w = ratio_w * j; int wid = (w < in_w - 1) ? 1 : 0; - T w1lambda = ratio_w * j - w; - T w2lambda = 1 - w1lambda; + float w1lambda = ratio_w * j - w; + float w2lambda = 1 - w1lambda; T* in_pos = &d_input[k * in_chw + h * in_w + w]; const T* out_pos = &d_output[k * out_chw + i * out_w + j]; for (int c = 0; c < channels; ++c) { // loop for channels - in_pos[0] += h2lambda * w2lambda * out_pos[0]; - in_pos[wid] += h2lambda * w1lambda * out_pos[0]; - in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0]; - in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0]; + in_pos[0] += static_cast(h2lambda * w2lambda * out_pos[0]); + in_pos[wid] += static_cast(h2lambda * w1lambda * out_pos[0]); + in_pos[hid * in_w] += + static_cast(h1lambda * w2lambda * out_pos[0]); + in_pos[hid * in_w + wid] += + static_cast(h1lambda * w1lambda * out_pos[0]); in_pos += in_hw; out_pos += out_hw; } diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4219a429a53eb4869426a2674109555fb784b85 --- /dev/null +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/send_recv_util.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +class CheckpointNotifyOp : public framework::OperatorBase { + public: + CheckpointNotifyOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + std::vector epmap = Attr>("epmap"); + std::string dir = Attr("dir"); + std::string lookup_table_name = Attr("lookup_table"); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); + for (size_t i = 0; i < epmap.size(); i++) { + auto lookup_table_save_dir = + string::Sprintf("%s/%s_%d", dir, lookup_table_name, i); + rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir); + VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name + << " and dir:" << dir << " to " << epmap[i]; + } + rpc_client->Wait(); + } +}; + +class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Parameter Server endpoints in the order") + .SetDefault({"127.0.0.1:6164"}); + AddAttr( + "dir", "(string, default '') indicate the folder checkpoint will use"); + AddAttr("lookup_table", + "(string, default '') the lookup table name"); + AddComment(R"DOC( +CheckpointNotify operator + +This operator will send lookup table and it's checkpoint direcoty to listen_and_serve op at +the parameter server. +)DOC"); + } +}; + +class CheckpointNotifyOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(checkpoint_notify, ops::CheckpointNotifyOp, + paddle::framework::EmptyGradOpMaker, + ops::CheckpointNotifyOpMaker, + ops::CheckpointNotifyOpShapeInference); diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc index 62636bb2f9078768180ab1e0016e3565617d24d2..dc43c69be0bcea2b82e1d61a9a5b2e03129d4f8e 100644 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -91,32 +91,31 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t). The number of chunks both in Inference and Label on the " "given mini-batch."); AddAttr("num_chunk_types", - "(int). The number of chunk type. See below for details."); - AddAttr( - "chunk_scheme", - "(string, default IOB). The labeling scheme indicating " - "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below " - "for details.") + "The number of chunk type. See the description for details."); + AddAttr("chunk_scheme", + "The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or " + "plain. See the description" + "for details.") .SetDefault("IOB"); AddAttr>("excluded_chunk_types", - "(list) A list including chunk type ids " + "A list including chunk type ids " "indicating chunk types that are not counted. " - "See below for details.") + "See the description for details.") .SetDefault(std::vector{}); AddComment(R"DOC( For some basics of chunking, please refer to -‘Chunking with Support Vector Machines ’. +'Chunking with Support Vector Machines '. - -CheckEvalOp computes the precision, recall, and F1-score of chunk detection, +ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. Here is a NER example of labeling for these tagging schemes: - - Li Ming works at Agricultural Bank of China in Beijing. - IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC - IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC - IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC - IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + + Li Ming works at Agricultural Bank of China in Beijing. + IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC There are three chunk types(named entity types) including PER(person), ORG(organization) and LOC(LOCATION), and we can see that the labels have the form -. @@ -124,31 +123,31 @@ and LOC(LOCATION), and we can see that the labels have the form -("force_cpu", - "(bool, default false) Force fill output variable to cpu " + "Force fill output variable to cpu " "memory. Otherwise, fill output variable to the running " - "device") - .SetDefault(false); - AddOutput("Out", string::Sprintf( - "(LoDTensor) n-dim bool tensor. Each element is %s", - comment.equation)); - AddComment(string::Sprintf(R"DOC(%s Operator - + "device [default true].") + .SetDefault(true); + AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC( It operates element-wise on X and Y, and returns the Out. Each of them is a N-dim tensor. X and Y could be any type. The each element of the Out tensor is -calculated by %s +calculated by $%s$ )DOC", - comment.type, comment.equation)); - AddAttr("axis", - "(int, default -1). The start dimension index " - "for broadcasting Y onto X.") + comment.equation)); + AddAttr( + "axis", + "The start dimension index for broadcasting Y onto X. [default -1]") .SetDefault(-1) .EqualGreaterThan(-1); } diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 38337f9aa52435c445420047957500d21069506a..c72405593788493e10a1293b0c722e2d11c6e312 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, false> /* set false to disable empty grad */); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad); REGISTER_OP_CPU_KERNEL( - concat, ops::ConcatKernel); + concat, ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CPU_KERNEL( concat_grad, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc index 590eca9d066ff7549939e62ddbfedc8ab76bb5e7..8e38e5231fbf6955ff8a9680a241a4a4ba1b924d 100644 --- a/paddle/fluid/operators/concat_op.cu.cc +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -15,7 +15,13 @@ limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - concat, ops::ConcatKernel); + concat, ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CUDA_KERNEL( concat_grad, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 1b1b8bf5ed959dd9c2ce8c9f5c905a75b81865fd..a496301526f58875ff51aeaa5b2094c3c656531c 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -60,34 +60,45 @@ template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input(framework::GradVarName("Out")); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.Outputs(framework::GradVarName("X")); auto outs = ctx.MultiOutput(framework::GradVarName("X")); int64_t axis = static_cast(ctx.Attr("axis")); + // get output tensor that the name is not kEmptyVarName + std::vector outputs; + for (size_t j = 0; j < outs.size(); ++j) { + if (out_var_names[j] != framework::kEmptyVarName) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs.push_back(outs[j]); + } else { + outputs.push_back(nullptr); + } + } + // Sometimes direct copies will be faster, this maybe need deeply analysis. if (axis == 0 && outs.size() < 10) { size_t input_offset = 0; - auto in_stride = framework::stride_numel(in->dims()); + const auto in_stride = framework::stride_numel(out_grad->dims()); - for (auto& out : outs) { - out->mutable_data(ctx.GetPlace()); - auto out_stride = framework::stride_numel(out->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), - out_stride, in->data() + input_offset, - in_stride, out_stride[axis]); + for (size_t i = 0; i < outs.size(); ++i) { + auto out_stride = framework::stride_numel(ins[i]->dims()); + auto* out = outputs[i]; + if (out != nullptr) { + StridedNumelCopyWithAxis( + ctx.device_context(), axis, out->data(), out_stride, + out_grad->data() + input_offset, in_stride, out_stride[axis]); + } input_offset += out_stride[axis]; } } else { - std::vector outputs(outs.size()); - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - outputs[j] = *outs[j]; - } - auto& dev_ctx = ctx.template device_context(); paddle::operators::math::ConcatGradFunctor concat_grad_functor; - concat_grad_functor(dev_ctx, *in, static_cast(axis), &outputs); + concat_grad_functor(dev_ctx, *out_grad, ins, static_cast(axis), + &outputs); } } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 7a7b8b76e43b1f91a3ba2767c217993cc39f26b6..1828be57b5a54005a0066b18ebebdb740726f67a 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -DEFINE_bool(cudnn_algo_use_autotune, true, +DEFINE_bool(cudnn_deterministic, true, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " "false, the algorithm is deterministic."); @@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - if (FLAGS_cudnn_algo_use_autotune) { + if (FLAGS_cudnn_deterministic) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (FLAGS_cudnn_algo_use_autotune) { + if (FLAGS_cudnn_deterministic) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 63d371310d2a26a1460e527fc51923dfd6e0b8bc..6b06913d1c83f4534238ac3dd22ac4035c0f0fbf 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -18,6 +18,17 @@ namespace paddle { namespace operators { +using conv_bwd_data = mkldnn::convolution_backward_data; +using conv_bwd_weights = mkldnn::convolution_backward_weights; +using conv_fwd = mkldnn::convolution_forward; +using framework::DataLayout; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; +using platform::GetMKLDNNFormat; + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + // Get unique name for index + const std::string key = ctx.op().Output("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); - // Get an unique name from "argument" name of "Output" variable - // This name will be used as key when saving info into device context - const std::string key = ctx.op().Output("Output"); - const std::string key_conv_pd = key + "@conv_pd"; + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); - auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); - - std::shared_ptr conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine); - - // save conv_pd into global device context to be referred in backward path - dev_ctx.SetBlob(key_conv_pd, conv_pd); + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + + // create a conv primitive descriptor and save it for usage in backward + std::shared_ptr conv_pd = ConvFwdPrimitiveDesc( + src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_pd->src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_pd->src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_pd->weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_pd->weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + // create memory primitive for conv dst + auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); // create convolution op primitive - auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, - weights_memory, dst_memory); + auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory); // push primitive to stream and wait until it's executed - std::vector pipeline{conv_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_weights_reordered) pipeline.push_back(reorder_weights); + pipeline.push_back(conv_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); } private: - std::unique_ptr - ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, - const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& dst, - const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine) const { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto conv_desc = mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, - dst, stride_dims, padding_dims, padding_dims, - mkldnn::padding_kind::zero); - - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); - - return std::unique_ptr( - p_conv_pd); + std::unique_ptr ConvFwdPrimitiveDesc( + const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = + conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, + src, weights, dst, stride_dims, padding_dims, + padding_dims, mkldnn::padding_kind::zero); + + auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); + + return std::unique_ptr(p_conv_pd); } }; @@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN && + output->format() != memory::format::format_undef, + "Wrong layout/format set for Output tensor"); + PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN && + output_grad->format() != memory::format::format_undef, + "Wrong layout/format set for output_grad tensor"); + if (!input_grad && !filter_grad) return; // Get an unique name from "argument" name of "Output" variable @@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto diff_src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - // create memory - auto diff_dst_memory = mkldnn::memory( - {diff_weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(output_grad_data))); + // create mkldnn memory from input tensors (input/weights/output_grad) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + auto user_diff_dst_memory = + memory({{{dst_tz}, memory::data_type::f32, output_grad->format()}, + mkldnn_engine}, + to_void_cast(output_grad_data)); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + // Retrieve conv_pd from device context - auto conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); + auto conv_pd = std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); // create backward conv primitive for weights if (filter_grad) { - // create primitive descriptor - mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = - ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, - strides, paddings, *conv_pd, - mkldnn_engine); - - // create memory + // create backward convolution primitive descriptor + auto conv_bwd_weights_desc = conv_bwd_weights::desc( + mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc( + conv_bwd_weights_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_bwd_weights_pd.src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + + auto diff_dst_memory_4filter = user_diff_dst_memory; + primitive reorder_diff_dst_4filter; + bool is_diff_dst_reordered_4filter = false; + if (memory::primitive_desc( + conv_bwd_weights_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4filter = + memory(conv_bwd_weights_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4filter = + reorder(user_diff_dst_memory, diff_dst_memory_4filter); + is_diff_dst_reordered_4filter = true; + } + + // create mkldnn memory for output (i.e. diff weights) auto diff_weights_memory = - mkldnn::memory({diff_weights_md, mkldnn_engine}, - reinterpret_cast(filter_grad_data)); - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); + memory(conv_bwd_weights_pd.diff_weights_primitive_desc(), + reinterpret_cast(filter_grad_data)); // create backward conv primitive for weights - auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( - conv_bwd_weights_pd, src_memory, diff_dst_memory, - diff_weights_memory); + auto conv_bwd_weights_prim = + conv_bwd_weights(conv_bwd_weights_pd, src_memory, + diff_dst_memory_4filter, diff_weights_memory); // push primitive and execute it - std::vector pipeline{conv_bwd_weights_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_diff_dst_reordered_4filter) + pipeline.push_back(reorder_diff_dst_4filter); + pipeline.push_back(conv_bwd_weights_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory)); } if (input_grad) { - // create primitive descriptor - mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = - ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, - strides, paddings, *conv_pd, mkldnn_engine); - - // create memory - auto diff_src_memory = mkldnn::memory( - {diff_src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_grad_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); + // create backward convolution primitive descriptor + auto conv_bwd_data_desc = conv_bwd_data::desc( + mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_data_pd = conv_bwd_data::primitive_desc( + conv_bwd_data_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + auto diff_dst_memory_4data = user_diff_dst_memory; + primitive reorder_diff_dst_4data; + bool is_diff_dst_reordered_4data = false; + if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4data = + memory(conv_bwd_data_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4data = + reorder(user_diff_dst_memory, diff_dst_memory_4data); + is_diff_dst_reordered_4data = true; + } + + // create mkldnn memory for output (i.e. diff src) + auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(), + reinterpret_cast(input_grad_data)); // create backward conv primitive for data - auto conv_bwd_data_prim = mkldnn::convolution_backward_data( - conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); + auto conv_bwd_data_prim = + conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory, + diff_src_memory); - // push primitive to stream and wait until it's executed - std::vector pipeline{conv_bwd_data_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + // push primitive and execute it + std::vector pipeline; + if (is_weights_reordered) pipeline.push_back(reorder_weights); + if (is_diff_dst_reordered_4data) + pipeline.push_back(reorder_diff_dst_4data); + pipeline.push_back(conv_bwd_data_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + input_grad->set_layout(DataLayout::kMKLDNN); + input_grad->set_format(GetMKLDNNFormat(diff_src_memory)); } } // Compute() - - private: - mkldnn::convolution_backward_weights::primitive_desc - ConvBwdWeightsPrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( - mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc( - conv_bwd_weights_desc, engine, conv_pd); - } - - mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( - mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, - engine, conv_pd); - } }; } // namespace operators diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 697d91484257984b104a13b0572cf19b16f8d37e..37153d58439a90190eb2ad82d5dcc145e22dfa48 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -75,6 +75,10 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; @@ -84,6 +88,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif @@ -99,9 +104,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library); } @@ -122,7 +124,8 @@ void Conv2DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); + "The format of output tensor is also NCHW.") + .Reuse("Input"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -217,7 +220,8 @@ void Conv3DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator." - "The format of output tensor is also NCDHW."); + "The format of output tensor is also NCDHW.") + .Reuse("Input"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " @@ -309,6 +313,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -318,12 +326,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 0b363f5c43f9fc191790e5cca629ffc46eb9388c..2e9e957ebdc2a5cb7663b968c5da631aebe60b1c 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -156,7 +156,7 @@ Parameters(strides, paddings) are two elements. These two elements represent hei and width, respectively. The input(X) size and output(Out) size may be different. -Example: +For an example: Input: Input shape: $(N, C_{in}, H_{in}, W_{in})$ Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index 046dd11910bb0ff46b567c3b89883582782205d3..8f3644039f9950a8a70e2fd66c20837a5f52bd7f 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -76,9 +76,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddComment(R"DOC( -Cosine Similarity Operator. +**Cosine Similarity Operator** -$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$ +$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$ The input X and Y must have the same shape, except that the 1st dimension of input Y could be just 1 (different from input X), which will be diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 40f43936db662f2b18ffa540da4794755b5d6fc7..c27befe1143baa68add4b56f3572eab75272c3a5 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -53,21 +53,18 @@ sequence of observed tags. The output of this operator changes according to whether Input(Label) is given: 1. Input(Label) is given: - -This happens in training. This operator is used to co-work with the chunk_eval -operator. - -When Input(Label) is given, the crf_decoding operator returns a row vector -with shape [N x 1] whose values are fixed to be 0, indicating an incorrect -prediction, or 1 indicating a tag is correctly predicted. Such an output is the -input to chunk_eval operator. + This happens in training. This operator is used to co-work with the chunk_eval + operator. + When Input(Label) is given, the crf_decoding operator returns a row vector + with shape [N x 1] whose values are fixed to be 0, indicating an incorrect + prediction, or 1 indicating a tag is correctly predicted. Such an output is the + input to chunk_eval operator. 2. Input(Label) is not given: - -This is the standard decoding process. + This is the standard decoding process. The crf_decoding operator returns a row vector with shape [N x 1] whose values -range from 0 to maximum tag number - 1. Each element indicates an index of a +range from 0 to maximum tag number - 1, Each element indicates an index of a predicted tag. )DOC"); } diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 669b3bbe9df4cae1aa381184092dfa51157ab6a3..5b5a220cf90e7813f914ae35733e7a4103391b2d 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", y_dim); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; class CropOpMaker : public framework::OpProtoAndCheckerMaker { @@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker { "The input used as reference for cropping, " "which is of the same dimensions as X.") .AsDispensable(); + AddInput("Offsets", + "The input used to describe offsets in runtime, which is a " + "1-D vector whose size equals to the rank of input 'X'. The " + "elements data type must be int.") + .AsDispensable(); AddOutput("Out", "The output of crop op, " "which is of the same dimensions as X."); AddAttr>("offsets", "A list describing offsets to be cropped. " "The size of offsets list should be the same as " - "the dimension size of input X."); + "the dimension size of input X.") + .SetDefault(std::vector()); AddAttr>("shape", "A list describing the shape of output. " "The size of shape list should be the same as " @@ -77,6 +90,17 @@ Crop Operator. Crop input into output, as specified by offsets and shape. +There are two ways to set the offsets: +1. In runtime: Using the input 'Offsets', which is a Vairbale and can be + output of other operators. This way is suitable for + dynamic offsets. +2. In network configuration: Using the attribute 'offsets', which will be + set in Python configure script. This way is + suitable for fixed offsets. +You CANNOT use these two ways at the same time. An exception will be raised +if input 'Offset' is configured and meanwhile the attribute 'offsets' is +not empty. + There are two ways to set shape: 1. reference input: crop input X into the same shape as reference input. The dimension of reference input should @@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index f05c2e23284e3a24cf48442996f671ec6084c391..772e80bbea4f2db654cefd0dcb404bc33803bd7a 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -27,6 +27,37 @@ template ; using framework::Tensor; +static std::vector GetOffsets(const framework::ExecutionContext& ctx) { + std::vector res; + int rank = ctx.Input("X")->dims().size(); + if (ctx.HasInput("Offsets")) { + PADDLE_ENFORCE(ctx.Attr>("offsets").empty(), + "Input 'Offsets' and attribute 'offsets' should not be used " + "at the same time."); + const auto* offsets_tensor = ctx.Input("Offsets"); + PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1); + PADDLE_ENFORCE_EQ( + rank, offsets_tensor->dims()[0], + "Offsets size should be equal to dimension size of input tensor."); + const int* offsets_data; + framework::Tensor cpu_tmp_tensor; + if (platform::is_cpu_place(offsets_tensor->place())) { + offsets_data = offsets_tensor->data(); + } else { + framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(), + &cpu_tmp_tensor); + offsets_data = cpu_tmp_tensor.data(); + } + res = std::vector(offsets_data, offsets_data + rank); + } else { + res = ctx.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + rank, static_cast(res.size()), + "Offsets size should be equal to dimension size of input tensor."); + } + return res; +} + template class CropKernel : public framework::OpKernel { public: @@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); auto x_stride = framework::stride(x->dims()); auto out_stride = framework::stride(out->dims()); - auto offsets = context.Attr>("offsets"); - PADDLE_ENFORCE_EQ( - x->dims().size(), static_cast(offsets.size()), - "Offsets size should be equal to dimension size of input tensor."); + auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { offset += (x_stride[i] * offsets[i]); @@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) { if (d_x != nullptr) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(context.GetPlace()); - auto offsets = context.Attr>("offsets"); + auto offsets = GetOffsets(context); Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a3bec3da45136bca5cb2763e7ffd6b67703a1813..d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -124,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss."); + "[N x 1]. The cross entropy loss.") + .Reuse("X"); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 92bb835e8f18e17ae1355fdec29f43b8ffb70460..5302b822d6b9f232e9ccd0d03cc549d7d5044ebf 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel { class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input of Cumsum operator"); - AddOutput("Out", "Output of Cumsum operator"); + AddInput("X", "Input of cumsum operator"); + AddOutput("Out", "Output of cumsum operator"); AddAttr("axis", - "(int, default -1). The dimenstion to accumulate along. " - "-1 means the last dimenstion") + "The dimenstion to accumulate along. -1 means the last " + "dimenstion [default -1].") .SetDefault(-1) .EqualGreaterThan(-1); AddAttr("exclusive", - "bool, default false). Whether to perform exclusive cumsum") + "Whether to perform exclusive cumsum. [default false].") .SetDefault(false); AddAttr("reverse", - "bool, default false). If true, the cumsum is performed in " - "the reversed direction") + "If true, the cumsum is performed in the reversed direction. " + "[default false].") .SetDefault(false); AddComment(R"DOC( The cumulative sum of the elements along a given axis. diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt deleted file mode 100644 index cf20530513cf6cd420e56b2f6378225f73c2bc8b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(WITH_DISTRIBUTE) - grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor - selected_rows memory) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr - cares zlib protobuf sendrecvop_grpc SERIAL) - cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc - grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - proto_desc lookup_table_op SERIAL) -endif() diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..b9e385994efcea0388756e8bd780ebfc719ed08d --- /dev/null +++ b/paddle/fluid/operators/detail/macros.h @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/distributed/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc_server.h" +#define RPCSERVER_T distributed::AsyncGRPCServer +#define RPCCLIENT_T distributed::GRPCClient +#else +#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc_server.h" +#define RPCSERVER_T distributed::AsyncBRPCServer +#define RPCCLIENT_T distributed::BRPCClient +#endif diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 76ef08cb9ad385681375eada7e58721022032db4..d0f95f727fdbc82777147e3e8ada6ad4f7a35e60 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -22,21 +22,21 @@ class BoxCoderOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("PriorBox"), "Input(PriorBox) of BoxCoderOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), - "Input(PriorBoxVar) of BoxCoderOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("TargetBox"), "Input(TargetBox) of BoxCoderOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), "Output(OutputBox) of BoxCoderOp should not be null."); auto prior_box_dims = ctx->GetInputDim("PriorBox"); - auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); auto target_box_dims = ctx->GetInputDim("TargetBox"); PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, "The rank of Input of PriorBoxVar must be 2"); PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); - PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + if (ctx->HasInput("PriorBoxVar")) { + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + } auto code_type = GetBoxCodeType(ctx->Attrs().Get("code_type")); if (code_type == BoxCodeType::kEncodeCenterSize) { @@ -71,9 +71,11 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "of the coordinate system. [xmax, ymax] is the right bottom " "coordinate of the anchor box."); AddInput("PriorBoxVar", - "(Tensor, default Tensor) " + "(Tensor, default Tensor, optional) " "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " - "of variance."); + "of variance. PriorBoxVar will set all elements to 1 by " + "default.") + .AsDispensable(); AddInput( "TargetBox", "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape " @@ -104,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "and M represents the number of deocded boxes."); AddComment(R"DOC( -Bounding Box Coder Operator. + +Bounding Box Coder. + Encode/Decode the target bounding box with the priorbox information. + The Encoding schema described below: -ox = (tx - px) / pw / pxv -oy = (ty - py) / ph / pyv -ow = log(abs(tw / pw)) / pwv -oh = log(abs(th / ph)) / phv + + ox = (tx - px) / pw / pxv + + oy = (ty - py) / ph / pyv + + ow = log(abs(tw / pw)) / pwv + + oh = log(abs(th / ph)) / phv + The Decoding schema described below: -ox = (pw * pxv * tx * + px) - tw / 2 -oy = (ph * pyv * ty * + py) - th / 2 -ow = exp(pwv * tw) * pw + tw / 2 -oh = exp(phv * th) * ph + th / 2 -where tx, ty, tw, th denote the target box's center coordinates, width and -height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) -center coordinates, width and height. pxv, pyv, pwv, phv denote the variance -of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, -width and height. + + ox = (pw * pxv * tx * + px) - tw / 2 + + oy = (ph * pyv * ty * + py) - th / 2 + + ow = exp(pwv * tw) * pw + tw / 2 + + oh = exp(phv * th) * ph + th / 2 + +where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the +encoded/decoded coordinates, width and height. )DOC"); } }; @@ -131,5 +146,6 @@ width and height. namespace ops = paddle::operators; REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, - ops::BoxCoderKernel); +REGISTER_OP_CPU_KERNEL( + box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index fc7eb5d1ed71c19630e96ea0ff0e6fe0962744a8..a7af111f63d654319dd1d90d2032956951dfe49e 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -48,15 +48,18 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, target_box_data[row_idx * len + 1] + (normalized == false); - output[idx * len] = (target_box_center_x - prior_box_center_x) / - prior_box_width / prior_box_var_data[col_idx * len]; - output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / - prior_box_height / - prior_box_var_data[col_idx * len + 1]; - output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / - prior_box_var_data[col_idx * len + 2]; - output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / - prior_box_var_data[col_idx * len + 3]; + output[idx * len] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[idx * len + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); + if (prior_box_var_data) { + output[idx * len] /= prior_box_var_data[col_idx * len]; + output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3]; + } } } @@ -79,20 +82,31 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data, T prior_box_center_y = (prior_box_data[col_idx * len + 3] + prior_box_data[col_idx * len + 1]) / 2; - - T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + T target_box_width, target_box_height; + T target_box_center_x, target_box_center_y; + if (prior_box_var_data) { + target_box_width = exp(prior_box_var_data[col_idx * len + 2] * target_box_data[idx * len + 2]) * prior_box_width; - T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_height = exp(prior_box_var_data[col_idx * len + 3] * target_box_data[idx * len + 3]) * prior_box_height; - T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_center_x = prior_box_var_data[col_idx * len] * target_box_data[idx * len] * prior_box_width + prior_box_center_x; - T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_center_y = prior_box_var_data[col_idx * len + 1] * target_box_data[idx * len + 1] * prior_box_height + prior_box_center_y; + } else { + target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width; + target_box_height = + exp(target_box_data[idx * len + 3]) * prior_box_height; + target_box_center_x = + target_box_data[idx * len] * prior_box_width + prior_box_center_x; + target_box_center_y = target_box_data[idx * len + 1] * prior_box_height + + prior_box_center_y; + } output[idx * len] = target_box_center_x - target_box_width / 2; output[idx * len + 1] = target_box_center_y - target_box_height / 2; @@ -103,7 +117,7 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data, } } -template +template class BoxCoderCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -114,6 +128,11 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto* target_box = context.Input("TargetBox"); auto* output_box = context.Output("OutputBox"); + const T* prior_box_data = prior_box->data(); + const T* target_box_data = target_box->data(); + const T* prior_box_var_data = nullptr; + if (prior_box_var) prior_box_var_data = prior_box_var->data(); + if (target_box->lod().size()) { PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, "Only support 1 level of LoD."); @@ -125,10 +144,6 @@ class BoxCoderCUDAKernel : public framework::OpKernel { int grid = (row * col + block - 1) / block; auto& device_ctx = context.cuda_device_context(); - const T* prior_box_data = prior_box->data(); - const T* prior_box_var_data = prior_box_var->data(); - const T* target_box_data = target_box->data(); - output_box->mutable_data({row, col, len}, context.GetPlace()); T* output = output_box->data(); @@ -150,5 +165,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, - ops::BoxCoderCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + box_coder, + ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 3dc68935ac1ea0d3e6ddf2a56bc3aba822c49230..5ed8520acddfa8fe2105a7c1615bcb3243cb130f 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -28,19 +28,20 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) { PADDLE_THROW("Not support type %s.", type); } -template +template class BoxCoderKernel : public framework::OpKernel { public: - void EncodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, + void EncodeCenterSize(const framework::Tensor* target_box, + const framework::Tensor* prior_box, + const framework::Tensor* prior_box_var, const bool normalized, T* output) const { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); + int64_t row = target_box->dims()[0]; + int64_t col = prior_box->dims()[0]; + int64_t len = prior_box->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + const T* prior_box_var_data = nullptr; + if (prior_box_var) prior_box_var_data = prior_box_var->data(); for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { @@ -65,30 +66,35 @@ class BoxCoderKernel : public framework::OpKernel { (normalized == false); size_t offset = i * col * len + j * len; - output[offset] = (target_box_center_x - prior_box_center_x) / - prior_box_width / prior_box_var_data[j * len]; - output[offset + 1] = (target_box_center_y - prior_box_center_y) / - prior_box_height / prior_box_var_data[j * len + 1]; + output[offset] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[offset + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; output[offset + 2] = - std::log(std::fabs(target_box_width / prior_box_width)) / - prior_box_var_data[j * len + 2]; + std::log(std::fabs(target_box_width / prior_box_width)); output[offset + 3] = - std::log(std::fabs(target_box_height / prior_box_height)) / - prior_box_var_data[j * len + 3]; + std::log(std::fabs(target_box_height / prior_box_height)); + if (prior_box_var) { + output[offset] /= prior_box_var_data[j * len]; + output[offset + 1] /= prior_box_var_data[j * len + 1]; + output[offset + 2] /= prior_box_var_data[j * len + 2]; + output[offset + 3] /= prior_box_var_data[j * len + 3]; + } } } } - void DecodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, + void DecodeCenterSize(const framework::Tensor* target_box, + const framework::Tensor* prior_box, + const framework::Tensor* prior_box_var, const bool normalized, T* output) const { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; + int64_t row = target_box->dims()[0]; + int64_t col = prior_box->dims()[0]; + int64_t len = prior_box->dims()[1]; - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + const T* prior_box_var_data = nullptr; + if (prior_box_var) prior_box_var_data = prior_box_var->data(); for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { @@ -103,19 +109,32 @@ class BoxCoderKernel : public framework::OpKernel { T prior_box_center_y = (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; - T target_box_center_x = prior_box_var_data[j * len] * + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + if (prior_box_var) { + target_box_center_x = prior_box_var_data[j * len] * target_box_data[offset] * prior_box_width + prior_box_center_x; - T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_center_y = prior_box_var_data[j * len + 1] * target_box_data[offset + 1] * prior_box_height + prior_box_center_y; - T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_width = std::exp(prior_box_var_data[j * len + 2] * target_box_data[offset + 2]) * prior_box_width; - T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_height = std::exp(prior_box_var_data[j * len + 3] * target_box_data[offset + 3]) * prior_box_height; + } else { + target_box_center_x = + target_box_data[offset] * prior_box_width + prior_box_center_x; + target_box_center_y = target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(target_box_data[offset + 2]) * prior_box_width; + target_box_height = + std::exp(target_box_data[offset + 3]) * prior_box_height; + } output[offset] = target_box_center_x - target_box_width / 2; output[offset + 1] = target_box_center_y - target_box_height / 2; @@ -147,10 +166,10 @@ class BoxCoderKernel : public framework::OpKernel { bool normalized = context.Attr("box_normalized"); T* output = output_box->data(); if (code_type == BoxCodeType::kEncodeCenterSize) { - EncodeCenterSize(*target_box, *prior_box, *prior_box_var, normalized, + EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(*target_box, *prior_box, *prior_box_var, normalized, + DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, output); } } diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index 8e58605fcea04f9ffa97ce8cca53c073e7068aaf..9c89b7ca9af1b235659554afc805600d31ef8ea6 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -68,15 +68,16 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { "representing pairwise iou scores."); AddComment(R"DOC( -IOU Similarity Operator. +**IOU Similarity Operator** + Computes intersection-over-union (IOU) between two box lists. - Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, - boxes in 'Y' are shared by all instance of the batched inputs of X. - Given two boxes A and B, the calculation of IOU is as follows: +Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, +boxes in 'Y' are shared by all instance of the batched inputs of X. +Given two boxes A and B, the calculation of IOU is as follows: $$ IOU(A, B) = -\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)} +\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} $$ )DOC"); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index 335e8dd470f851d8c5f6bdbc94cfc343da269034..568d50d457d838d5f11605710c0d3b987af01d10 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -83,11 +83,13 @@ class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( PolygonBoxTransform Operator. + +PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate. + The input is the final geometry output in detection network. We use 2*n numbers to denote the coordinate shift from n corner vertices of the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi), the geometry output contains 2*n channels. -PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate. )DOC"); } }; diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 716c8625d35308f98582e6802e90d99d643e188b..d7f49a9590e4ef4ca4d2ad5a92572c70e6bfb6ac 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Detection mAP evaluate operator. The general steps are as follows. First, calculate the true positive and - false positive according to the input of detection and labels, then - calculate the mAP evaluate value. - Supporting '11 point' and 'integral' mAP algorithm. Please get more information - from the following articles: - https://sanchom.wordpress.com/tag/average-precision/ - https://arxiv.org/abs/1512.02325 +false positive according to the input of detection and labels, then +calculate the mAP evaluate value. +Supporting '11 point' and 'integral' mAP algorithm. Please get more information +from the following articles: +https://sanchom.wordpress.com/tag/average-precision/ +https://arxiv.org/abs/1512.02325 )DOC"); } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..312f80e09077f21a47985c1c936c2ac41c292ead --- /dev/null +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -0,0 +1,33 @@ +if(WITH_GRPC) + grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor + selected_rows memory) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + cares zlib protobuf sendrecvop_grpc SERIAL) + cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc + grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + proto_desc lookup_table_op SERIAL) + return() +endif() + + +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) + +find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) + + +find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) + +cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc + brpc protobuf leveldb gflags glog + protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..b394c678fb6503eb73a1e11e6feb814251e9e940 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace operators { +namespace distributed { + +DEFINE_int32(brpc_channel_num, 24, + "Number of channels to send requests connected to one server"); +DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); + +BRPCClient::~BRPCClient() { Wait(); } + +void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; +} + +bool BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + google::protobuf::Closure* done = + brpc::NewCallback(&HandleSendResponse, cntl, response); + + sendrecv::VariableMessage request; + ch_ctx->stub->SendVariable(cntl, &request, response, done); + }); + req_count_++; + + return true; +} + +void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; + + // framework::Variable* outvar = nullptr; + // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); +} + +bool BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + + req_count_++; + + return true; +} + +bool BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] {}); + + req_count_++; + return true; +} + +void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + } + + ChannelQueuePtr q(new framework::BlockingQueue()); + + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "pooled"; + options.connect_timeout_ms = 100; + options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; + options.max_retry = FLAGS_max_retry; + for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + std::shared_ptr c(new ChannelContext()); + if (c->channel.Init(ep.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return nullptr; + } + + c->stub.reset(new sendrecv::SendRecvService_Stub( + static_cast(&c->channel))); + q->Push(c); + } + + { + std::lock_guard guard(chan_mutex_); + channels_[ep] = q; + } + + return q; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..8ff1f0a6076b3574c42065edcbac50eb75b3b483 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace distributed { + +struct ChannelContext { + brpc::Channel channel; + std::shared_ptr stub; +}; + +typedef std::shared_ptr ChannelContextPtr; +typedef std::shared_ptr> + ChannelQueuePtr; + +class BRPCClient : public RPCClient { + public: + BRPCClient() {} + virtual ~BRPCClient(); + + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void Wait() override; + + private: + void Proceed(); + ChannelQueuePtr GetChannel(const std::string& ep); + + private: + std::unordered_map channels_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(BRPCClient); +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..862167f02084cfe81db1c0936bbfb0415fa85721 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +namespace sendrecv { + +typedef std::unordered_map + HandlerMap; + +class BRPCServiceImpl : public SendRecvService { + public: + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) + : request_send_h_(nullptr), + request_get_h_(nullptr), + request_prefetch_h_(nullptr) { + auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + if (it != rpc_call_map.end()) { + request_send_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + if (it != rpc_call_map.end()) { + request_get_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); + if (it != rpc_call_map.end()) { + request_prefetch_h_ = it->second; + } + } + + virtual ~BRPCServiceImpl() {} + + void SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_send_h_ != nullptr, + "RequestSend handler should be registed first!"); + brpc::ClosureGuard done_guard(done); + + paddle::framework::Scope* local_scope = request_send_h_->scope(); + paddle::framework::Variable* outvar = nullptr; + paddle::framework::Variable* invar = nullptr; + + std::string varname = request->varname(); + + if (!request_send_h_->sync_mode()) { + local_scope = &request_send_h_->scope()->NewScope(); + invar = local_scope->Var(varname); + } else { + invar = local_scope->FindVar(varname); + } + + request_send_h_->Handle(varname, local_scope, invar, &outvar); + + if (!request_send_h_->sync_mode()) { + request_send_h_->scope()->DeleteScope(local_scope); + } + } + + void GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_get_h_ != nullptr, + "RequestGet handler should be registed first!"); + } + + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_prefetch_h_ != nullptr, + "kRequestPrefetch handler should be registed first!"); + } + + private: + paddle::operators::distributed::RequestHandler* request_send_h_; + paddle::operators::distributed::RequestHandler* request_get_h_; + paddle::operators::distributed::RequestHandler* request_prefetch_h_; +}; +} // namespace sendrecv + +namespace paddle { +namespace operators { +namespace distributed { + +void AsyncBRPCServer::StartServer() { + // Instance of your service. + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + + // Add the service into server. Notice the second parameter, because the + // service is put on stack, we don't want server to delete it, otherwise + // use brpc::SERVER_OWNS_SERVICE. + if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(FATAL) << "Fail to add service"; + return; + } + + brpc::ServerOptions options; + options.idle_timeout_sec = idle_timeout_s_; + options.max_concurrency = max_concurrency_; + if (server_.Start(bind_address_.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to start EchoServer" << bind_address_; + return; + } + + butil::EndPoint ep = server_.listen_address(); + selected_port_ = ep.port; + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + server_.Join(); +} + +void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } + +void AsyncBRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..85a7ad0dfe843dad483d43631b69a79d75211ce9 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_server.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include // NOLINT +#include + +#include "brpc/server.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class AsyncBRPCServer final : public RPCServer { + public: + explicit AsyncBRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncBRPCServer() {} + void StartServer() override; + void WaitServerReady() override; + + private: + void ShutDownImpl() override; + + brpc::Server server_; + + static constexpr int idle_timeout_s_ = -1; + static constexpr int max_concurrency_ = 0; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/bytebuffer_stream.cc similarity index 94% rename from paddle/fluid/operators/detail/bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/bytebuffer_stream.cc index a14171563edb0ac9a22b7ae493c965de3efb7823..6e91b447db838c9095432eda22e9e1171e938d31 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc @@ -17,11 +17,11 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { GrpcByteBufferSource::GrpcByteBufferSource() {} @@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { return byte_count_; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/distributed/bytebuffer_stream.h similarity index 99% rename from paddle/fluid/operators/detail/bytebuffer_stream.h rename to paddle/fluid/operators/distributed/bytebuffer_stream.h index 054dd4ff294414cca55d7e033f2c5403bbb85526..e7de172c79c30761483b5d96f5bad19860208832 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h @@ -106,7 +106,7 @@ class GrpcBufferReader final namespace paddle { namespace operators { -namespace detail { +namespace distributed { // Source provides a way for a particular RPC implementation to provide // received data to ParseFrom. class Source { @@ -183,6 +183,6 @@ class GrpcByteSource : public Source { char space_[sizeof(Reader)]; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc similarity index 60% rename from paddle/fluid/operators/detail/grpc_client.cc rename to paddle/fluid/operators/distributed/grpc_client.cc index da9ca1a0c1d55018141f0e4285fe35d7c437fd55..8228a8c5a3eae73fe82551c8aad55290b0d54ef0 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,39 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc_client.h" #include #include +#include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { -std::once_flag RPCClient::init_flag_; +void GRPCClient::InitImpl() { InitEventLoop(); } -std::unique_ptr RPCClient::rpc_client_(nullptr); +void GRPCClient::InitEventLoop() { + // start the client process thread + // TODO(wuyi): can make this in a threadpool + client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); +} -RPCClient* RPCClient::GetInstance() { - std::call_once(init_flag_, &RPCClient::Init); - return rpc_client_.get(); +void GRPCClient::SendComplete() { + for (auto& it : channels_) { + this->AsyncSendComplete(it.first); + } } -void RPCClient::Init() { - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new RPCClient()); +GRPCClient::~GRPCClient() { + Wait(); + cq_.Shutdown(); + { + std::lock_guard guard(chan_mutex_); + for (auto& it : channels_) { + it.second.reset(); + } } + client_thread_->join(); } -bool RPCClient::AsyncSendVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { +bool GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; @@ -64,6 +76,9 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, var_h.scope = p_scope; var_h.name = var_name_val; var_h.ctx = p_ctx; + var_h.method = "Send"; + + VLOG(3) << var_h.String() << " begin"; // stub context SendProcessor* s = new SendProcessor(ch); @@ -94,11 +109,10 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { result->Swap(&tmp); } -bool RPCClient::AsyncGetVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { +bool GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; @@ -119,6 +133,9 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, var_h.scope = p_scope; var_h.name = var_name_val; var_h.ctx = p_ctx; + var_h.method = "Get"; + + VLOG(3) << var_h.String() << " begin"; // stub context GetProcessor* s = new GetProcessor(ch); @@ -136,12 +153,12 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, return true; } -bool RPCClient::AsyncPrefetchVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +bool GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; @@ -162,6 +179,9 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, var_h.scope = p_scope; var_h.name = out_var_name_val; var_h.ctx = p_ctx; + var_h.method = "Prefetch"; + + VLOG(3) << var_h.String() << " begin"; // stub context GetProcessor* s = new GetProcessor(ch); @@ -179,7 +199,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, return true; } -void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { +void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); @@ -192,7 +213,8 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { req_count_++; } -void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { +void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); s->Prepare(time_out); @@ -204,76 +226,75 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { req_count_++; } -bool RPCClient::Wait() { - VLOG(3) << "RPCClient begin Wait()" - << " req_count_:" << req_count_; - if (req_count_ <= 0) { - return true; - } - const size_t kReqCnt = req_count_; - bool a[kReqCnt]; - std::vector> waits(req_count_); - std::mutex mu; - - for (int i = 0; i < req_count_; i++) { - waits[i] = framework::AsyncIO([i, &a, &mu, this] { - bool ret = Proceed(); - std::lock_guard l(mu); - a[i] = ret; - }); - } +void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); - for (int i = 0; i < req_count_; i++) { - waits[i].wait(); - } + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); - int last_req_count = req_count_; - req_count_ = 0; + sendrecv::VariableMessage req; + req.set_varname(COMPLETE_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} - for (int i = 0; i < last_req_count; i++) { - if (!a[i]) { - return false; - } - } +void GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + const auto ch = GetChannel(ep); - return true; -} + CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); + s->Prepare(time_out); -bool RPCClient::Proceed() { - void* tag = NULL; - bool ok = false; + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); - // request counts. - if (!cq_.Next(&tag, &ok)) { - LOG(ERROR) << "Get meets CompletionQueue error"; - return false; - } + auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} - GPR_ASSERT(ok); - PADDLE_ENFORCE(tag); +void GRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +void GRPCClient::Proceed() { + void* tag = nullptr; + bool ok = false; - // TODO(gongwb): add more retries. - BaseProcessor* c = static_cast(tag); - if (!c->status_.ok()) { - LOG(ERROR) << "proc param error:" << c->var_h_.String() - << " grpc error:" << c->status_.error_message(); + while (cq_.Next(&tag, &ok)) { + BaseProcessor* c = static_cast(tag); + GPR_ASSERT(ok); + PADDLE_ENFORCE(c); + if (c->status_.ok()) { + VLOG(3) << c->var_h_.String() << " process"; + c->Process(); + } else { + LOG(FATAL) << c->var_h_.String() + << " meets grpc error:" << c->status_.error_message(); + } delete c; - return false; + { + std::lock_guard lk(sync_mutex_); + req_count_--; + } + sync_cond_.notify_all(); } - - c->Process(); - delete c; - return true; } -std::shared_ptr RPCClient::GetChannel(const std::string& ep) { - // TODO(Yancey1989): make grpc client completely thread-safe - std::unique_lock lock(mutex_); + +std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { + std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { return it->second; } + // Channel configurations: grpc::ChannelArguments args; + args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); @@ -284,6 +305,6 @@ std::shared_ptr RPCClient::GetChannel(const std::string& ep) { return ch; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h similarity index 63% rename from paddle/fluid/operators/detail/grpc_client.h rename to paddle/fluid/operators/distributed/grpc_client.h index 449d5105afb8c02294a0ef57610e7de1b1631b35..7a08f2d3a4a28a4323723e6b887c50588eed2bce 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -16,15 +16,18 @@ limitations under the License. */ #include -#include // NOLINT +#include // NOLINT +#include // NOLINT #include #include #include #include #include // NOLINT #include +#include // NOLINT #include +#include "grpc++/channel.h" #include "grpc++/generic/generic_stub.h" #include "grpc++/grpc++.h" #include "grpc++/support/byte_buffer.h" @@ -35,22 +38,27 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { namespace operators { -namespace detail { +namespace distributed { struct VarHandle { + // RPC endpoint. std::string ep; const platform::DeviceContext* ctx; const framework::Scope* scope; + // Variable name. std::string name; + // RPC method name. + std::string method; std::string String() const { std::ostringstream s; - s << "name:[" << name << "] ep:[" << ep << "]"; + s << method << " name:[" << name << "], ep:[" << ep << "]"; return s.str(); } }; @@ -68,6 +76,7 @@ class BaseProcessor { virtual void Prepare(const VarHandle& var_info, int64_t time_out) { context_.reset(new grpc::ClientContext()); var_h_ = var_info; + context_->set_wait_for_ready(true); std::chrono::system_clock::time_point deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); @@ -77,6 +86,7 @@ class BaseProcessor { virtual void Prepare(int64_t time_out) { context_.reset(new grpc::ClientContext()); + context_->set_wait_for_ready(true); std::chrono::system_clock::time_point deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); @@ -161,55 +171,82 @@ class FetchBarrierProcessor : public BaseProcessor { std::unique_ptr stub_; }; -class RPCClient { +class CheckpointNotifyProcessor : public BaseProcessor { public: - RPCClient() {} + explicit CheckpointNotifyProcessor(std::shared_ptr ch) + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~CheckpointNotifyProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; + std::unique_ptr stub_; +}; + +class GRPCClient : public RPCClient { + public: + GRPCClient() {} + virtual ~GRPCClient(); - static RPCClient* GetInstance(); + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncSendVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = 600 * 1000); + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncGetVariable(const std::string& ep, + bool AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = 600 * 1000); - - bool AsyncPrefetchVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = 600 * 1000); + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = 600 * 1000); + int64_t time_out = FLAGS_rpc_deadline) override; void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = 600 * 1000); + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + void Wait() override; + + void SendComplete() override; - bool Wait(); + protected: + void InitImpl() override; private: - bool Proceed(); + // InitEventLoop should only be called by Init() + void InitEventLoop(); + + void Proceed(); + + void AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + std::shared_ptr GetChannel(const std::string& ep); - // Init is called by GetInstance. - static void Init(); private: grpc::CompletionQueue cq_; - std::map> channels_; + std::unordered_map> channels_; + std::unique_ptr client_thread_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; std::atomic req_count_{0}; - std::mutex mutex_; - static std::unique_ptr rpc_client_; - static std::once_flag init_flag_; - DISABLE_COPY_AND_ASSIGN(RPCClient); + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(GRPCClient); }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc similarity index 93% rename from paddle/fluid/operators/detail/serde_test.cc rename to paddle/fluid/operators/distributed/grpc_serde_test.cc index 15892295e6901fe649788c9e34604008fc8cbdfa..3d107b533bcb7bfef3f9b13ec99afbd579a62e52 100644 --- a/paddle/fluid/operators/detail/serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" @@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { for (int i = 0; i < 564; ++i) rows->push_back(i); ::grpc::ByteBuffer msg; - operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); EXPECT_GT(msg.Length(), static_cast(0)); // deserialize @@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { // deserialize zero-copy // framework::Variable var2; - // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::detail::VariableResponse resp(&scope, &ctx); + operators::distributed::VariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { math::set_constant(ctx, tensor, 31.9); ::grpc::ByteBuffer msg; - operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); EXPECT_GT(msg.Length(), static_cast(0)); // deserialize @@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::detail::VariableResponse resp(&scope, &ctx); + operators::distributed::VariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc similarity index 68% rename from paddle/fluid/operators/detail/grpc_server.cc rename to paddle/fluid/operators/distributed/grpc_server.cc index e73756d89004bc48339c0aa31dd0857c2ca6722d..f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -15,13 +15,13 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; namespace paddle { namespace operators { -namespace detail { +namespace distributed { enum CallStatus { PROCESS = 0, FINISH }; // reference: @@ -41,11 +41,35 @@ class RequestBase { virtual ~RequestBase() {} virtual void Process() = 0; - CallStatus Status() { return status_; } - void SetStatus(CallStatus status) { status_ = status; } + std::string Status2String(const std::string& method) { + std::string status = "Process"; + if (status_ == FINISH) { + status = "Finish"; + } + + std::ostringstream s; + s << method << " name:[" << GetReqName() << "]" + << ", ep:[" << ctx_.peer() << "]" + << " " << status << " using req_id:" << req_id_; + return s.str(); + } + + CallStatus Status() const { + std::lock_guard l(status_mu_); + return status_; + } + + template + void Finish(const T& reply, ServerAsyncResponseWriter* responder) { + std::lock_guard l(status_mu_); + status_ = FINISH; + responder->Finish(reply, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); + } virtual std::string GetReqName() = 0; protected: + mutable std::mutex status_mu_; ::grpc::ServerContext ctx_; GrpcService::AsyncService* service_; ::grpc::ServerCompletionQueue* cq_; @@ -63,29 +87,24 @@ class RequestSend final : public RequestBase { request_.reset(new VariableResponse(request_handler->scope(), request_handler->dev_ctx(), !request_handler->sync_mode())); - int method_id = static_cast(detail::GrpcMethod::kSendVariable); + int method_id = static_cast(distributed::GrpcMethod::kSendVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } void Process() override { std::string varname = GetReqName(); - VLOG(3) << "RequestSend var_name:" << varname; + VLOG(4) << "RequestSend var_name:" << varname; auto scope = request_->GetMutableLocalScope(); auto invar = request_->GetVar(); framework::Variable* outvar = nullptr; request_handler_->Handle(varname, scope, invar, &outvar); - - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -100,7 +119,7 @@ class RequestGet final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(detail::GrpcMethod::kGetVariable); + auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); service_->RequestAsyncUnary( method_id, &ctx_, &request_, &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); @@ -113,7 +132,7 @@ class RequestGet final : public RequestBase { void Process() override { // proc request. std::string varname = request_.varname(); - VLOG(3) << "RequestGet " << varname; + VLOG(4) << "RequestGet " << varname; auto scope = request_handler_->scope(); auto invar = scope->FindVar(varname); @@ -125,10 +144,7 @@ class RequestGet final : public RequestBase { SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), &reply_); } - - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -147,7 +163,8 @@ class RequestPrefetch final : public RequestBase { local_scope_(nullptr) { request_.reset(new VariableResponse(request_handler->scope(), request_handler->dev_ctx(), true)); - int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); + int method_id = + static_cast(distributed::GrpcMethod::kPrefetchVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); @@ -159,21 +176,21 @@ class RequestPrefetch final : public RequestBase { void Process() override { // prefetch process... - std::string varname = request_->OutVarname(); - VLOG(3) << "RequestPrefetch " << varname; + std::string in_var_name = request_->Varname(); + std::string out_var_name = request_->OutVarname(); + VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; + auto invar = scope->FindVar(in_var_name); + // out var must be created in local scope! + framework::Variable* outvar = scope->Var(out_var_name); - request_handler_->Handle(varname, scope, invar, &outvar); + request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), &reply_); - - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -183,11 +200,50 @@ class RequestPrefetch final : public RequestBase { framework::Scope* local_scope_; }; +class RequestCheckpointNotify final : public RequestBase { + public: + explicit RequestCheckpointNotify(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx())); + int method_id = + static_cast(distributed::GrpcMethod::kCheckpointNotify); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestCheckpointNotify() {} + + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + auto scope = request_->GetMutableLocalScope(); + + std::string checkpoint_notify = request_->Varname(); + std::string checkpoint_dir = request_->OutVarname(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir; + + request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, + checkpoint_dir); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; +}; + void AsyncGRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; + VLOG(4) << "AsyncGRPCServer is wait server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; + VLOG(4) << "AsyncGRPCServer WaitSeverReady"; } void AsyncGRPCServer::StartServer() { @@ -220,13 +276,14 @@ void AsyncGRPCServer::StartServer() { reqs.reserve(kRequestBufSize); for (int i = 0; i < kRequestBufSize; i++) { + VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i; TryToRegisterNewOne(rpc_name, i); } for (int i = 0; i < threadnum; i++) { rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(3) << t.first << " creates threads!"; + VLOG(4) << t.first << " creates threads!"; } } @@ -243,7 +300,7 @@ void AsyncGRPCServer::StartServer() { auto& threads = t.second; for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); - VLOG(3) << t.first << " threads ends!"; + VLOG(4) << t.first << " threads ends!"; } } } @@ -251,7 +308,7 @@ void AsyncGRPCServer::StartServer() { void AsyncGRPCServer::ShutdownQueue() { for (auto& t : rpc_cq_) { t.second->Shutdown(); - VLOG(3) << t.first << " shutdown!"; + VLOG(4) << t.first << " queue shutdown!"; } } @@ -260,7 +317,7 @@ void AsyncGRPCServer::ShutDownImpl() { is_shut_down_ = true; ShutdownQueue(); - VLOG(3) << "server_ shutdown!"; + VLOG(4) << "server_ shutdown!"; server_->Shutdown(); } @@ -268,12 +325,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, int req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { - VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; + VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; return; } - VLOG(4) << "register send rpc_name:" << rpc_name - << ", handler:" << rpc_call_map_[kRequestSend]; + VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " REQ ID: " << req_id; auto& reqs = rpc_reqs_[rpc_name]; auto& handler = rpc_call_map_[rpc_name]; @@ -286,8 +343,10 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestGet(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestCheckpoint) { + b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id); } else { - PADDLE_ENFORCE(false, "not surpported rpc"); + PADDLE_ENFORCE(false, "not supported rpc"); } reqs[req_id] = b; @@ -302,14 +361,14 @@ void AsyncGRPCServer::HandleRequest( bool ok = false; while (true) { - VLOG(3) << "HandleRequest " << rpc_name << " wait next"; + VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; + VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; break; } int req_id = static_cast(reinterpret_cast(tag)); - VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id + VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id << " get next"; auto& reqs = rpc_reqs_[rpc_name]; @@ -320,22 +379,21 @@ void AsyncGRPCServer::HandleRequest( base = reqs[req_id]; } + VLOG(3) << base->Status2String(rpc_name); + // reference: // https://github.com/tensorflow/tensorflow/issues/5596 // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event:argument name[" - << base->GetReqName() << "]"; + << " recv no regular event" + << " context:" << base->Status2String(rpc_name); TryToRegisterNewOne(rpc_name, req_id); delete base; continue; } - VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id - << ", status:" << base->Status(); - switch (base->Status()) { case PROCESS: { base->Process(); @@ -351,6 +409,6 @@ void AsyncGRPCServer::HandleRequest( } } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/distributed/grpc_server.h similarity index 83% rename from paddle/fluid/operators/detail/grpc_server.h rename to paddle/fluid/operators/distributed/grpc_server.h index e6ffc7066f24d5088a95801ed1c0670b24d5771f..d2524f5e65db6dedab78f45e17380359b58a3d11 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/distributed/grpc_server.h @@ -29,17 +29,17 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/grpc_service.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/grpc_service.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RequestBase; @@ -53,6 +53,7 @@ class AsyncGRPCServer final : public RPCServer { void StartServer() override; private: + // HandleRequest needs to be thread-safe. void HandleRequest( ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, std::function TryToRegisterNewOne); @@ -83,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer { std::map> rpc_reqs_; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h similarity index 83% rename from paddle/fluid/operators/detail/grpc_service.h rename to paddle/fluid/operators/distributed/grpc_service.h index e0505c2b9d0903837713d7e0032b01ab091c2e04..cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -23,7 +23,7 @@ #include #include #include -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/profiler.h" @@ -42,24 +42,25 @@ class ServerContext; // Support parsing/unparsing of tensorflow::VariableResponse. // Wire-format is identical to RecvVariableResponse. template <> -class SerializationTraits { +class SerializationTraits { public: static Status Serialize( - const paddle::operators::detail::VariableResponse& msg, + const paddle::operators::distributed::VariableResponse& msg, grpc_byte_buffer** bp, bool* own_buffer) { PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); return Status(); } - static Status Deserialize(grpc_byte_buffer* buffer, - paddle::operators::detail::VariableResponse* msg, - int max_message_size = INT_MAX) { + static Status Deserialize( + grpc_byte_buffer* buffer, + paddle::operators::distributed::VariableResponse* msg, + int max_message_size = INT_MAX) { if (buffer == nullptr) { return Status(StatusCode::INTERNAL, "No payload"); } Status result = g_core_codegen_interface->ok(); if (result.ok()) { - paddle::operators::detail::GrpcByteSource source(buffer); + paddle::operators::distributed::GrpcByteSource source(buffer); int ret = msg->Parse(&source); if (ret != 0) { result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); @@ -73,16 +74,17 @@ class SerializationTraits { namespace paddle { namespace operators { -namespace detail { +namespace distributed { enum class GrpcMethod { kSendVariable, kGetVariable, kPrefetchVariable, + kCheckpointNotify, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kPrefetchVariable) + 1; + static_cast(GrpcMethod::kCheckpointNotify) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -92,6 +94,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/GetVariable"; case GrpcMethod::kPrefetchVariable: return "/sendrecv.SendRecvService/PrefetchVariable"; + case GrpcMethod::kCheckpointNotify: + return "/sendrecv.SendRecvService/CheckpointNotify"; } // Shouldn't be reached. @@ -118,6 +122,6 @@ class GrpcService final { }; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h similarity index 98% rename from paddle/fluid/operators/detail/proto_encoder_helper.h rename to paddle/fluid/operators/distributed/proto_encoder_helper.h index d91d054b2507f32d1e948dde33da06a70cabe775..2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c 100644 --- a/paddle/fluid/operators/detail/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace detail { +namespace distributed { char* EncodeVarint32(char* dst, uint32_t v) { // Operate on characters as unsigneds @@ -144,6 +144,6 @@ class ProtoEncodeHelper { char* limit_; // Just for CHECKs }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h similarity index 74% rename from paddle/fluid/operators/detail/request_handler.h rename to paddle/fluid/operators/distributed/request_handler.h index 4bc5e7f10ee2a8939d230fe96517bd9f56c13933..90742a201ad46447d6fbbe2137aa40fabc2f9983 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -28,15 +28,23 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; +constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" +#define COMPLETE_MESSAGE "COMPLETE@RECV" + +#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" +#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" class RPCServer; @@ -57,9 +65,17 @@ class RequestHandler { void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } void SetProgram(framework::ProgramDesc* program) { program_ = program; } void SetExecutor(framework::Executor* executor) { executor_ = executor; } + + // Used for dist lookup table prefetch void SetPrefetchPreparedCtx( - std::unique_ptr prepared) { - prefetch_ctx_.reset(prepared.release()); + std::unordered_map< + std::string, std::shared_ptr>* g) { + prefetch_var_name_to_prepared_ctx_ = g; + } + + void SetCheckpointNotifyPreparedCtx( + std::shared_ptr g) { + checkpoint_prepared_ctx_ = g; } // Used for async. @@ -75,12 +91,8 @@ class RequestHandler { bool sync_mode() { return sync_mode_; } framework::Scope* scope() { return scope_; } const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ExecutorPrepareContext* prefetch_ctx() { - return prefetch_ctx_.get(); - } framework::ProgramDesc* program() { return program_; } framework::Executor* executor() { return executor_; } - std::vector& sparse_vars() { return sparse_vars_; } // This function processes user's rpc request. // The implemention is in request_handler_impl. @@ -97,8 +109,8 @@ class RequestHandler { // *request_handler_->dev_ctx(), &reply_); // } virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, - framework::Variable** outvar) = 0; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") = 0; protected: const bool sync_mode_; @@ -107,21 +119,22 @@ class RequestHandler { framework::Executor* executor_; framework::Scope* scope_; framework::ProgramDesc* program_; - std::unique_ptr prefetch_ctx_; + + // used for distribute lookup table prefetch + std::unordered_map>* + prefetch_var_name_to_prepared_ctx_; + // used for checkpoint notify + std::shared_ptr checkpoint_prepared_ctx_; // Used for async. std::unordered_map>* grad_to_prepared_ctx_; - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars_; RPCServer* rpc_server_; - - std::mutex sparse_var_mutex_; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc similarity index 59% rename from paddle/fluid/operators/detail/request_handler_impl.cc rename to paddle/fluid/operators/distributed/request_handler_impl.cc index f16c06d52f4fb86d51083a8b3b98d05a64c1af74..163154c678f65b08981041d647b11f4b2b5860ba 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -16,24 +16,27 @@ #include #include -#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" -#include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { + +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; bool RequestSendHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestSendHandler:" << varname; // Async @@ -52,6 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname, if (varname == BATCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv batch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestSend); + } else if (varname == COMPLETE_MESSAGE) { + VLOG(3) << "sync: recv complete message"; + rpc_server_->DecreaseClientNum(); } else { VLOG(3) << "sync: received var_name: " << varname; if (sync_mode_) { @@ -63,20 +69,27 @@ bool RequestSendHandler::Handle(const std::string& varname, PADDLE_THROW("sync: Can not find server side var"); return false; } - if (invar->IsType()) { - std::unique_lock lock(sparse_var_mutex_); + std::unique_lock lock(mutex_sparse_vars_); sparse_vars_.push_back(invar); } } - return true; } +void RequestSendHandler::ResetSparseVarRecorder() { + std::unique_lock lock(mutex_sparse_vars_); + for (auto* var : sparse_vars_) { + var->GetMutable()->mutable_rows()->clear(); + } + sparse_vars_.clear(); +} + bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestGetHandler:" << varname; if (varname != FETCH_BARRIER_MESSAGE) { @@ -99,17 +112,36 @@ bool RequestGetHandler::Handle(const std::string& varname, bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestPrefetchHandler " << varname; - auto var_desc = program_->Block(0).FindVar(varname); - *outvar = scope->FindVar(varname); + auto var_desc = program_->Block(0).FindVar(out_var_name); InitializeVariable(*outvar, var_desc->GetType()); - executor_->RunPreparedContext(prefetch_ctx_.get(), scope); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + + return true; +} +bool RequestCheckpointHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const std::string& out_var_name) { + PADDLE_ENFORCE( + checkpoint_notify_id != -1, + "when checkpoint_notify_id = -1, there should be no RPC invoke."); + + auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + lt_var->clear(); + lt_var->append(out_var_name); + VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " + << out_var_name; + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope); return true; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h similarity index 68% rename from paddle/fluid/operators/detail/request_handler_impl.h rename to paddle/fluid/operators/distributed/request_handler_impl.h index 8d0c62232b68ad6c05e751c25103802ee12db57e..87185500f2ffc3a8578eea339cc7a1e2b0e46631 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -28,19 +28,24 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/request_handler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RequestSendHandler final : public RequestHandler { public: explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; + void ResetSparseVarRecorder(); + + private: + std::mutex mutex_sparse_vars_; + std::vector sparse_vars_; }; class RequestGetHandler final : public RequestHandler { @@ -48,7 +53,8 @@ class RequestGetHandler final : public RequestHandler { explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestGetHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; }; class RequestPrefetchHandler final : public RequestHandler { @@ -56,9 +62,25 @@ class RequestPrefetchHandler final : public RequestHandler { explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestPrefetchHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; +}; + +class RequestCheckpointHandler final : public RequestHandler { + public: + explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id) + : RequestHandler(sync_mode) { + this->checkpoint_notify_id = checkpoint_notify_id; + } + virtual ~RequestCheckpointHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; + + private: + int checkpoint_notify_id; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5ec9fe5367beb97b3cc7298102deff1e8ca4ec9 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "gflags/gflags.h" + +// default to 3min to avoid temprary network failures. +DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag RPCClient::init_flag_; +std::unique_ptr RPCClient::rpc_client_(nullptr); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..37783b78ecc5c58aab3e358066bd7f2fba861799 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -0,0 +1,94 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +class RPCClient { + public: + RPCClient() {} + virtual ~RPCClient() {} + virtual bool AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual bool AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + // SendComplete tells all the server that current trainer have no more data + // to train, so that the pserver can reduce it's barrier count, and continue + // to train with other trainers. + virtual void SendComplete() = 0; + + virtual void Wait() = 0; + + template + static RPCClient* GetInstance() { + std::call_once(init_flag_, &RPCClient::Init); + return rpc_client_.get(); + } + + // Init is called by GetInstance. + template + static void Init() { + if (rpc_client_.get() == nullptr) { + rpc_client_.reset(new T()); + rpc_client_->InitImpl(); + } + } + + protected: + virtual void InitImpl() {} + + private: + static std::once_flag init_flag_; + static std::unique_ptr rpc_client_; +}; +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc similarity index 82% rename from paddle/fluid/operators/detail/rpc_server.cc rename to paddle/fluid/operators/distributed/rpc_server.cc index 448763372a8c224cc68319a4a444915896b68234..c0520e248d49f4f390af9075fc6f87ec4bd74c39 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -17,11 +17,11 @@ #include #include -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { void RPCServer::ShutDown() { LOG(INFO) << "RPCServer ShutDown "; @@ -43,29 +43,34 @@ void RPCServer::SavePort() const { void RPCServer::WaitBarrier(const std::string& rpc_name) { std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [=] { + barrier_cond_.wait(lock, [this, &rpc_name] { return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name]; + VLOG(3) << "batch_barrier_: " << rpc_name << " " + << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; - { - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - } - - VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name - << ", barrier_count:" << b << ", fan_in" << client_num_; - + std::unique_lock lock(mutex_); + b = ++barrier_counter_[rpc_name]; if (b >= client_num_) { + lock.unlock(); barrier_cond_.notify_all(); + lock.lock(); } } +void RPCServer::DecreaseClientNum() { + { + std::unique_lock lock(mutex_); + client_num_--; + } + barrier_cond_.notify_all(); +} + void RPCServer::ResetBarrierCounter() { VLOG(3) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); @@ -96,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond " << rpc_name; + VLOG(4) << "RPCServer WaitCond " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -108,6 +113,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) { lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h similarity index 93% rename from paddle/fluid/operators/detail/rpc_server.h rename to paddle/fluid/operators/distributed/rpc_server.h index c2e7ae706c9dc6776e09b25e424b30f110c3855d..cf25e78435bb470b25a46db647ca818571cc83a5 100644 --- a/paddle/fluid/operators/detail/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -19,11 +19,11 @@ #include // NOLINT #include #include -#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RPCServer { public: @@ -60,6 +60,7 @@ class RPCServer { void SetCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); + void DecreaseClientNum(); void ResetBarrierCounter(); protected: @@ -78,14 +79,13 @@ class RPCServer { std::string bind_address_; std::atomic exit_flag_; int selected_port_; - - const int client_num_; + int client_num_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; friend class RequestHandler; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc similarity index 76% rename from paddle/fluid/operators/detail/grpc_server_test.cc rename to paddle/fluid/operators/distributed/rpc_server_test.cc index f97f638701cfb263f28dddbdc3bc80fb16468744..a0693cffabcc561b0adfafc2c49027a890dd5efc 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -17,23 +17,23 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace framework = paddle::framework; namespace platform = paddle::platform; -namespace detail = paddle::operators::detail; +namespace distributed = paddle::operators::distributed; USE_OP(lookup_table); -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); @@ -98,36 +98,40 @@ void StartServer() { framework::Executor exe(place); platform::CPUDeviceContext ctx(place); auto* block = AppendPrefetchBlcok(&program); - auto prepared = exe.Prepare(program, block->ID()); + std::string in_var_name("ids"); + std::vector prefetch_block_ids{block->ID()}; + auto prepared = exe.Prepare(program, prefetch_block_ids); InitTensorsOnServer(&scope, &place, 10); + std::unordered_map> + prefetch_var_name_to_prepared; + prefetch_var_name_to_prepared[in_var_name] = prepared[0]; g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(std::move(prepared)); + g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetDevCtx(&ctx); g_req_handler->SetScope(&scope); g_req_handler->SetExecutor(&exe); - g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); + g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, + g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - // FIXME(gongwb): don't use hard time. - sleep(10); - LOG(INFO) << "got nccl id and stop server..."; - g_rpc_service->ShutDown(); server_thread.join(); } TEST(PREFETCH, CPU) { - g_req_handler.reset(new detail::RequestPrefetchHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_req_handler.reset(new distributed::RequestPrefetchHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); - detail::RPCClient client; int port = g_rpc_service->GetSelectedPort(); std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); @@ -141,8 +145,8 @@ TEST(PREFETCH, CPU) { std::string in_var_name("ids"); std::string out_var_name("out"); - client.AsyncPrefetchVariable(ep, ctx, scope, in_var_name, out_var_name); - client.Wait(); + client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); + client->Wait(); auto var = scope.Var(out_var_name); auto value = var->GetMutable()->value(); auto ptr = value.mutable_data(place); @@ -152,6 +156,7 @@ TEST(PREFETCH, CPU) { } } + g_rpc_service->ShutDown(); server_thread.join(); LOG(INFO) << "begin reset"; g_rpc_service.reset(nullptr); diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto similarity index 95% rename from paddle/fluid/operators/detail/send_recv.proto rename to paddle/fluid/operators/distributed/send_recv.proto index a244afc46f3247c7e6e8481b09b5c729a2a569f7..e0902320cff003797b12ed0204f7f99c44554b62 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/distributed/send_recv.proto @@ -14,6 +14,8 @@ limitations under the License. */ syntax = "proto3"; package sendrecv; +// option cc_generic_services = true; + service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor @@ -23,6 +25,8 @@ service SendRecvService { rpc GetVariable(VariableMessage) returns (VariableMessage) {} // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} + + rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc similarity index 95% rename from paddle/fluid/operators/detail/sendrecvop_utils.cc rename to paddle/fluid/operators/distributed/sendrecvop_utils.cc index 507b465435609a91ebca97dd70b176c3b79bee02..98129d9f1014c39347e3409533f2bc10092611d2 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #ifdef PADDLE_WITH_CUDA #include @@ -23,14 +23,14 @@ limitations under the License. */ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" -#include "paddle/fluid/operators/detail/proto_encoder_helper.h" -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { using VarMsg = sendrecv::VariableMessage; @@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, const framework::Scope* scope, framework::Variable** var) { - operators::detail::VariableResponse resp(scope, &ctx); + operators::distributed::VariableResponse resp(scope, &ctx); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); *var = resp.GetVar(); } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h similarity index 83% rename from paddle/fluid/operators/detail/sendrecvop_utils.h rename to paddle/fluid/operators/distributed/sendrecvop_utils.h index c72e1bd076f670458f3915072154847db6205092..fe25e73fa608727ba0bb912a82776b330ec8d83a 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -25,22 +25,12 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { -namespace detail { - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" - -static int64_t GetTimestamp() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000 + tp.tv_usec / 1000; -} +namespace distributed { typedef void (*DestroyCallback)(void*); @@ -71,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { } } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc similarity index 94% rename from paddle/fluid/operators/detail/variable_response.cc rename to paddle/fluid/operators/distributed/variable_response.cc index 24cb91a3bb820a0e5d51aaa49154434919080f69..45832c60bf9172497afabac927ba39a7cbfb9a52 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include #include @@ -22,12 +22,12 @@ #endif #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { enum WireType { WIRETYPE_VARINT = 0, @@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, if (total_written + size_to_write > length) { size_to_write = length - total_written; } + // This log is useful to see how long a internal block size is of rpc. + VLOG(7) << "copy " << size_to_write << " data to CUDAPlace"; memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); @@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, } // TODO(gongwb): can we avoid copy? platform::CPUPlace cpu; + // This log is useful to see how long a internal block size is of rpc. + VLOG(7) << "copy " << size_to_write << " data to CPUPlace"; memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); p += size_to_write; @@ -158,13 +162,13 @@ bool VariableResponse::CopySelectRowsTensorData( slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); - PADDLE_ENFORCE_EQ( - static_cast(tensor->numel()), - length / framework::SizeOfType( - paddle::operators::detail::ToTypeIndex(meta_.data_type()))); + PADDLE_ENFORCE_EQ(static_cast(tensor->numel()), + length / framework::SizeOfType( + paddle::operators::distributed::ToTypeIndex( + meta_.data_type()))); void* tensor_data = tensor->mutable_data( ctx.GetPlace(), - paddle::operators::detail::ToTypeIndex(meta_.data_type())); + paddle::operators::distributed::ToTypeIndex(meta_.data_type())); if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { return false; @@ -480,6 +484,6 @@ int VariableResponse::Parse(Source* source) { return 0; } -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h similarity index 92% rename from paddle/fluid/operators/detail/variable_response.h rename to paddle/fluid/operators/distributed/variable_response.h index 69cfd784f8dd4f129f50c6882061e53e8535b949..1db4a0a522654ff2497b8bd9ee1381b5ab64067a 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -22,17 +22,17 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class VariableResponse { public: @@ -99,6 +99,6 @@ class VariableResponse { sendrecv::VariableMessage meta_; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f612256840825a75f49944ab97ff957d572a863 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise_op_function.h" + +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::reorder; +using mkldnn::primitive; +using mkldnn::stream; +using mkldnn::sum; + +template +class EltwiseAddMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + const T* x_data = x->data(); + const T* y_data = y->data(); + T* z_data = z->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + auto z_dims = z->dims(); + + // Execute default elementwise_add operator when + // broadcast operations need to performed. + if (x_dims != y_dims) { + auto sum_func = [](T a, T b) -> T { return a + b; }; + + TransformFunctor + functor( + x, y, z, + ctx.template device_context(), + sum_func); + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + trim_trailing_singular_dims(&y_dims); + axis = (y_dims.size() == 0) ? x_dims.size() : axis; + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + + if (post == 1) { + functor.RunRowWise(n, pre); + } else { + functor.RunMidWise(n, pre, post); + } + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } else { + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for X tensor"); + PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN && + y->format() != memory::format::format_undef, + "Wrong layout/format set for X tensor"); + + std::vector src_x_tz = framework::vectorize2int(x_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector dst_tz = framework::vectorize2int(z_dims); + + std::vector srcs_pd; + std::vector srcs; + std::vector scales = {1.0f, 1.0f}; + + auto src_x_pd = memory::primitive_desc( + {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine); + auto src_y_pd = memory::primitive_desc( + {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine); + auto src_x_memory = + memory(src_x_pd, paddle::platform::to_void_cast(x_data)); + auto src_y_memory = + memory(src_y_pd, paddle::platform::to_void_cast(y_data)); + + srcs_pd.push_back(src_x_pd); + srcs_pd.push_back(src_y_pd); + srcs.push_back(src_x_memory); + srcs.push_back(src_y_memory); + + auto dst_md = + memory::desc({dst_tz}, memory::data_type::f32, memory::format::any); + + // create primitive descriptor for sum + auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); + + // create mkldnn memory for dst + memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); + + std::vector inputs; + inputs.push_back(srcs[0]); + inputs.push_back(srcs[1]); + + // create sum primitive + auto sum_prim = sum(sum_pd, inputs, dst_memory); + + std::vector pipeline; + pipeline.push_back(sum_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + z->set_layout(DataLayout::kMKLDNN); + z->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); + } + } +}; + +template +class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { + in->set_layout(DataLayout::kMKLDNN); + in->set_format(out->format()); + }; + + if (x->dims() == y->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } + } else { + // Execute default kernel when broadcast is needed + ElemwiseGradCompute, IdentityGrad>( + ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), + IdentityGrad()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseAddMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseAddMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc index ba343909bb87b4f2efa56c0a4ff664b278e90c60..7cd67e74de6b9c4fbc718f60b4f671ccab2f9956 100644 --- a/paddle/fluid/operators/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y"); +REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y"); REGISTER_OP_CPU_KERNEL( elementwise_mul, ops::ElementwiseMulKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index f4cec8ad971abebe8d6dff1a384c8414269148a5..bb88970e42c194d9437609b62435f1a89e2b446b 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -14,8 +14,12 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ElementwiseOpInferVarType : public framework::VarTypeInference { @@ -59,47 +78,50 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op."); + AddOutput("Out", "The output of elementwise op.").Reuse("X"); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") .SetDefault(-1) .EqualGreaterThan(-1); + AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( -Limited Elementwise %s Operator. +Limited Elementwise %s Operator The equation is: $$%s$$ -$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be -smaller than or equal to the dimensions of $X$. +- $X$: a tensor of any dimension. +- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: -1. The shape of $Y$ is same with $X$; -2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions - of size 1 for $Y$ will be ignored for the consideration of subsequence. +1. The shape of $Y$ is the same with $X$. +2. The shape of $Y$ is a continuous subsequence of $X$. For case 2: -$Y$ will be broadcasted to match the shape of $X$ and axis should be -set to index of the start dimension to broadcast $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. +2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of + subsequence, such as shape(Y) = (2, 1) => (2). -If axis is -1, it is treated as axis=rank(X)-rank(Y). +For example: -For example .. code-block:: python shape(X) = (2, 3, 4, 5), shape(Y) = (,) shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details) -information. However, the output only shares the LoD information with input $X$. +The inputs $X$ and $Y$ can carry the different LoD information. +But the output only shares the LoD information with the input $X$. )DOC", GetName(), GetEquation())); @@ -137,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 8843a1c44b7004ba5d7935f75d3c99d9c30fc6c0..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), @@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 79ec02f52094121d01c6bda2a5d99d2211893e89..02beb80fc8a9f451393dcdd54492c4f88f908497 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -43,15 +42,16 @@ class FetchBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); for (auto& ep : eps) { VLOG(3) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } }; diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 1ae78675a0cac8a72aeaef1227b631a41e4a10b2..453a1b32a0171a2ca88879ab3287e89c4d3c7759 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp { class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") + AddAttr( + "dtype", + "It could be numpy.dtype. Output data type. Default is float32") .SetDefault(framework::proto::VarType::FP32); - AddAttr("value", "(float, default 0) The value to be filled") + AddAttr("value", "default 0. The value to be filled") .SetDefault(0.0f); AddComment(R"DOC( -FillConstantBatchSizeLike Operator. - -Fill up a variable with specified constant value. +This function creates a tensor of specified *shape*, *dtype* and batch size, +and initializes this with a constant supplied in *value*. The batch size is +obtained from the `input` tensor. )DOC"); } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 9c0561b016fdbfa8e48535eaa673a3f85bc936e5..f6b156eb30dae154395b34dcfc26319cd89edbca 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -43,7 +43,8 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::operators::CPUGather(ctx, *src, *index, output); - + delete cpu_place; + cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 8050f61d4546f3351645f23ddcc63b2c49f17929..4a974281481c8bc02589b428098475d73b8a0ba5 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { void Apply() override { AddAttr("mean", "(float, default 0.0) " - "mean of random tensor.") + "The mean (or center) of the gaussian distribution.") .SetDefault(.0f); AddAttr("std", "(float, default 1.0) " - "std of random tensor.") + "The standard deviation (std, or spread) of the " + "gaussian distribution.") .SetDefault(1.0f); AddAttr("seed", "(int, default 0) " @@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { .SetDefault(framework::proto::VarType::FP32); AddComment(R"DOC( -GaussianRandom Operator. Used to initialize tensors with gaussian random generator. +The defalut mean of the distribution is 0. and defalut standard +deviation (std) of the distribution is 1.. Uers can set mean and std +by input arguments. )DOC"); } }; diff --git a/paddle/fluid/operators/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/gaussian_random_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b00b396c1349eff5db1059268e7cf280a8fc64 --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_mkldnn_op.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/mean_op.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +template +class GaussianMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::normal_distribution dist(mean, std); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + + // The format of output is set as the mkldnn's format + // TODO(@mozga-intel) The format of matrix sets inside the another layers. + tensor->set_layout(DataLayout::kMKLDNN); + tensor->set_format(mkldnn::memory::format::oihw); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace, + ops::GaussianMKLDNNKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 815c1bb50988be49ca9996e368a59344c6583d58..1488aab1926b5b4ba7bceed582700f5a11fc6c93 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,6 +15,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType( static_cast(ctx.Attr("dtype")), - ctx.device_context()); + ctx.device_context(), layout, library); } }; @@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { "(int, default 5(FP32)) " "Output data type.") .SetDefault(framework::proto::VarType::FP32); - + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( GaussianRandom Operator. diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 4bce2d322d825110a446c9bc5eccdacf0ba3c943..697c239e59d158428ae9ba9f7feded19637dff28 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -21,9 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" namespace paddle { @@ -61,12 +60,18 @@ class GenNCCLIdOp : public framework::OperatorBase { std::vector endpoint_list = Attr>("endpoint_list"); - detail::RPCClient client; + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); + for (auto& ep : endpoint_list) { VLOG(3) << "sending nccl id to " << ep; - client.AsyncSendVariable(ep, dev_ctx, *scope, NCCL_ID_VARNAME); + client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); + } + client->Wait(); + for (auto& ep : endpoint_list) { + client->AsyncSendBatchBarrier(ep); } - client.Wait(); + client->Wait(); VLOG(3) << "sending completed..."; } @@ -76,10 +81,12 @@ class GenNCCLIdOp : public framework::OperatorBase { // NOTE: Can not use unique_ptr here because the default // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. - detail::RequestSendHandler rpc_h(true); - detail::AsyncGRPCServer rpc_service(endpoint, 1); - rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h); - rpc_h.SetRPCServer(&rpc_service); + distributed::RequestSendHandler rpc_h(true); + std::unique_ptr rpc_service( + new RPCSERVER_T(endpoint, 1)); + + rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h); + rpc_h.SetRPCServer(rpc_service.get()); framework::ProgramDesc empty_program; framework::Executor executor(dev_ctx.GetPlace()); @@ -89,12 +96,13 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service)); - rpc_service.SetCond(detail::kRequestSend); + std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); + + rpc_service->SetCond(distributed::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - rpc_service.WaitBarrier(detail::kRequestSend); + rpc_service->WaitBarrier(distributed::kRequestSend); VLOG(3) << "got nccl id and stop server..."; - rpc_service.ShutDown(); + rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; server_thread.join(); } diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc index eafc364a15fa17cc5107bba737b0b44e712b0bef..db6ff7825690176ded0ab957764ed8411d3cd804 100644 --- a/paddle/fluid/operators/get_places_op.cc +++ b/paddle/fluid/operators/get_places_op.cc @@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { .InEnum({"CUDA", "CPU", "AUTO"}) .SetDefault("AUTO"); AddComment(R"DOC( -Returns a list of places based on flags. The list will be used for parallel +Returns a list of places based on arguments. The list will be used for parallel execution. )DOC"); } diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index ab097d31e9ab5eafa788539170e7e405df697625..14ce1da2e97186a50ed8bd52223a500c4c57b328 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel { class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(LoDTensor) The input tensor."); + AddInput("X", "The input tensor."); AddInput("Scale", - "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "(optional) Scale is a 1-dimensional tensor of size " "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." "It is applied to the output.") .AsDispensable(); AddInput("Bias", - "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "(optional) Bias is a 1-dimensional tensor of size " "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." "It is applied to the output.") .AsDispensable(); - AddOutput("Y", "(LoDTensor) Result after normalization."); - AddOutput("Mean", "(Tensor) Mean of the current mini batch.") - .AsIntermediate(); - AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + AddOutput("Y", "Result after normalization."); + AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("Variance", "Variance of the current mini batch.") .AsIntermediate(); AddAttr("epsilon", - "(float, default 1e-5) Constant for " - "numerical stability") + "Constant for numerical stability [default 1e-5].") .SetDefault(1e-5) .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, "'epsilon' should be between 0.0 and 0.001."); }); AddAttr("begin_norm_axis", - "(int default:1), the " - "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "the axis of `begin_norm_axis ... Rank(X) - 1` will be " "normalized. `begin_norm_axis` splits the tensor(`X`) to a " - "matrix [N,H].") + "matrix [N,H]. [default 1].") .SetDefault(1) .AddCustomChecker([](const int &begin_norm_axis) { PADDLE_ENFORCE_GT(begin_norm_axis, 0, @@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { }); AddComment(R"DOC( -Layer Normalization. -Layer Norm has been implemented as discussed in the paper: -https://arxiv.org/abs/1607.06450 -... +Assume feature vectors exist on dimensions +:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics +along these dimensions for each feature vector :math:`a` with size +:math:`H`, then normalize each feature vector using the corresponding +statistics. After that, apply learnable gain and bias on the normalized +tensor to scale and shift if :attr:`scale` and :attr:`shift` are set. + +Refer to `Layer Normalization `_ )DOC"); } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e38525cd7f44de020f364ffd16e71a439048347f..ea1ca7f59db22bee973a8827a88e2fb80265fa51 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "mini-batch. Note: S is equal to the sequence number in a mini-batch. " "The output is no longer a LoDTensor."); AddComment(R"DOC( -LinearChainCRF Operator. - Conditional Random Field defines an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. CRF learns the conditional probability $P(Y|X)$, where @@ -86,6 +84,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. Equation: + 1. Denote Input(Emission) to this operator as $x$ here. 2. The first D values of Input(Transition) to this operator are for starting weights, denoted as $a$ here. @@ -108,6 +107,7 @@ Finally, the linear chain CRF operator outputs the logarithm of the conditional likelihood of each training sample in a mini-batch. NOTE: + 1. The feature function for a CRF is made up of the emission features and the transition features. The emission feature weights are NOT computed in this operator. They MUST be computed first before this operator is called. diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 66a0f87b46c6447bac7e42f0f61e3170cb1f2fdb..56e39649b409f7eed108027f6df58c19dd3c8ab8 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,15 +19,16 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/macros.h" + +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -void RunServer(std::shared_ptr service) { +void RunServer(std::shared_ptr service) { service->StartServer(); VLOG(4) << "RunServer thread end"; } @@ -89,37 +90,43 @@ void ListenAndServOp::SavePort() const { rpc_service_->SavePort(); } -void ListenAndServOp::RunSyncLoop(framework::Executor *executor, - framework::ProgramDesc *program, - framework::Scope *recv_scope, - framework::BlockDesc *prefetch_block) const { +static int64_t GetTimestamp() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + +void ListenAndServOp::RunSyncLoop( + framework::Executor *executor, framework::ProgramDesc *program, + framework::Scope *recv_scope, + const std::vector &prefetch_block_id_list, + const int checkpoint_point_block_id) const { size_t num_blocks = program->Size(); + auto optimize_blocks = + Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector block_list; - for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - block_list.push_back(blkid); + std::vector optimize_blocks_idx; + for (auto blk : optimize_blocks) { + optimize_blocks_idx.push_back(blk->ID()); } - auto optimize_prepared = executor->Prepare(*program, block_list); + auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars; while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. - rpc_service_->SetCond(detail::kRequestSend); - rpc_service_->WaitBarrier(detail::kRequestSend); + rpc_service_->SetCond(distributed::kRequestSend); + rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { LOG(WARNING) << "get exit!rpc_processor break!"; - rpc_service_->SetCond(detail::kRequestGet); + rpc_service_->SetCond(distributed::kRequestGet); break; } @@ -127,43 +134,38 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, // and this will still work. // The optimize blocks which have the same parent ID would run parallel // TODO(Yancey1989): need to use ParallelExecutor for future - int32_t last_parent_blkid = program->Block(1).Parent(); + int32_t last_parent_blkid = optimize_blocks[0]->Parent(); std::vector parallel_blkids; - parallel_blkids.push_back(1); - double ts = detail::GetTimestamp(); - for (size_t blkid = 2; blkid < num_blocks; ++blkid) { - if (blkid != static_cast(prefetch_block->ID())) { - if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, - program, recv_scope); - parallel_blkids.clear(); - last_parent_blkid = program->Block(blkid).Parent(); - } - parallel_blkids.push_back(blkid); + parallel_blkids.push_back(optimize_blocks[0]->ID()); + double ts = GetTimestamp(); + for (size_t i = 1; i < optimize_blocks.size(); ++i) { + // skip the first optimize block because it is already in the + // parallel_blkids. + int blkid = optimize_blocks[i]->ID(); + if (program->Block(blkid).Parent() != last_parent_blkid) { + ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, + program, recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); } + parallel_blkids.push_back(blkid); } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; - - // Reset the received sparse variables, the sum operator would not - // sum the input sparse variables which rows is empty at the next - // mini-batch. - // TODO(Yancey1989): move the reset action into an operator, we couldn't - // have any hide logic in the operator. - for (framework::Variable *var : sparse_vars) { - var->GetMutable()->mutable_rows()->clear(); - } + VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - rpc_service_->SetCond(detail::kRequestGet); - rpc_service_->WaitBarrier(detail::kRequestGet); + rpc_service_->SetCond(distributed::kRequestGet); + rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); + // reset received sparse vars to avoid reuse it in the next mini-batch + dynamic_cast(request_send_handler_.get()) + ->ResetSparseVarRecorder(); } // while(true) } void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, - framework::ProgramDesc *program) const { - VLOG(3) << "RunAsyncLoop in"; + framework::ProgramDesc *program, + framework::Scope *recv_scope) const { // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; @@ -190,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, block_list.push_back(blkid); } auto optimize_prepared = executor->Prepare(*program, block_list); + // execute global block if needed + if (block_list[0] == 1 && id_to_grad.count(1) == 0) { + executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope); + } std::unordered_map> grad_to_prepared_ctx; @@ -201,10 +207,9 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); - VLOG(3) << "RunAsyncLoop into while"; while (true) { if (rpc_service_->IsExit()) { - LOG(INFO) << "get exit!rpc_processor break!"; + VLOG(4) << "get exit!rpc_processor break!"; break; } @@ -212,19 +217,22 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } // while(true) } -static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope, - platform::DeviceContext *dev_ctx, - framework::Executor *executor, - framework::ProgramDesc *program, - framework::ExecutorPrepareContext *prefetch_ctx, - detail::RPCServer *rpc_server) { +static void FillRequestCtx( + distributed::RequestHandler *h, framework::Scope *scope, + platform::DeviceContext *dev_ctx, framework::Executor *executor, + framework::ProgramDesc *program, + std::unordered_map> + *prefetch_ctx, + std::shared_ptr checkpoint_ctx, + distributed::RPCServer *rpc_server) { h->SetScope(scope); h->SetDevCtx(dev_ctx); h->SetExecutor(executor); h->SetProgram(program); - h->SetPrefetchPreparedCtx( - std::unique_ptr(prefetch_ctx)); + h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetRPCServer(rpc_server); + h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } void ListenAndServOp::RunImpl(const framework::Scope &scope, @@ -240,38 +248,83 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); + int checkpoint_block_id = Attr(kCheckpointBlockId); - LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in - << ", end_point:" << endpoint; + VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in + << ", end_point:" << endpoint + << ", checkpoint_block_id: " << checkpoint_block_id; - // request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode)); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in)); - request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); - request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); - request_prefetch_handler_.reset( - new detail::RequestPrefetchHandler(sync_mode)); + rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); - rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); - rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); - rpc_service_->RegisterRPC(detail::kRequestPrefetch, + request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode)); + request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode)); + request_prefetch_handler_.reset( + new distributed::RequestPrefetchHandler(sync_mode)); + request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( + sync_mode, checkpoint_block_id)); + + rpc_service_->RegisterRPC(distributed::kRequestSend, + request_send_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGet, + request_get_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestPrefetch, request_prefetch_handler_.get()); - - auto *optimize_block = Attr(kOptimizeBlock); - auto *prefetch_block = Attr(kPrefetchBlock); - auto *program = optimize_block->Program(); + rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, + request_checkpoint_handler_.get()); + + auto optimize_blocks = + Attr>(kOptimizeBlocks); + PADDLE_ENFORCE(optimize_blocks.size() >= 1, + "optimize blocks should be 1 at least on the pserver side."); + auto *program = optimize_blocks[0]->Program(); framework::Executor executor(dev_place); + std::shared_ptr ckpt_pre_context = nullptr; + if (checkpoint_block_id != -1) { + auto ctx = executor.Prepare(*program, checkpoint_block_id); + // see: https://stackoverflow.com/a/14856553 + ckpt_pre_context = std::move(ctx); + } + // prepare for prefetch - VLOG(3) << "prefetch block id is " << prefetch_block->ID(); - auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + std::vector prefetch_block_id_list; + std::unordered_map block_id_to_prefetch_var_name; - auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, - &dev_ctx, &executor, program, prefetch_prepared.release(), - rpc_service_.get()); + auto prefetch_var_name_to_block_id_str = + Attr>(kPrefetchVarNameToBlockId); + for (const auto &prefetch_var_name_and_id : + prefetch_var_name_to_block_id_str) { + std::vector pieces; + split(prefetch_var_name_and_id, ':', &pieces); + VLOG(3) << "after split, prefetch_var = " << pieces[0] + << ", id=" << pieces[1]; + PADDLE_ENFORCE_EQ(pieces.size(), 2); + + int block_id = std::stoi(pieces[1]); + prefetch_block_id_list.push_back(block_id); + block_id_to_prefetch_var_name[block_id] = pieces[0]; + } + + auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list); + + std::unordered_map> + prefetch_var_name_to_prepared_ctx; + for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) { + auto block_id = prefetch_block_id_list[i]; + auto prefetch_var_name = block_id_to_prefetch_var_name[block_id]; + prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; + } + + auto f = + std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, + &executor, program, &prefetch_var_name_to_prepared_ctx, + ckpt_pre_context, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); f(request_prefetch_handler_.get()); + f(request_checkpoint_handler_.get()); // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -285,9 +338,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block); + RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, + checkpoint_block_id); } else { - RunAsyncLoop(&executor, program); + RunAsyncLoop(&executor, program, &recv_scope); } } @@ -309,17 +363,23 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { "a map from grad name to it's optimize block id") .SetDefault({}); AddAttr("sync_mode", "if works at sync_mode or not").SetDefault(true); - AddAttr(kOptimizeBlock, - "BlockID to run on server side."); - AddAttr(kPrefetchBlock, - "prefetch block to run on server side."); + AddAttr>( + kOptimizeBlocks, "Optimize blocks to run on server side.") + .SetDefault({}); + AddAttr>(kPrefetchVarNameToBlockId, + "prefetch blocks to run on server side.") + .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); + AddAttr(kCheckpointBlockId, + "BolckID to run save checkpoint on pserer.") + .SetDefault(-1); } }; void SignalHandler::StopAndExit(int signal_num) { - VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit"; + // Do not use VLOG here for the device for printing maybe already released. + // exit will release interal allocated resoureces. exit(0); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 87952cb0e683596b2b0395890b6e25b15f74d7e2..978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -18,21 +18,23 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace paddle { namespace operators { -constexpr char kOptimizeBlock[] = "OptimizeBlock"; -constexpr char kPrefetchBlock[] = "PrefetchBlock"; +constexpr char kOptimizeBlocks[] = "optimize_blocks"; +constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; +constexpr char kCheckpointBlockId[] = "checkpint_block_id"; -void RunServer(std::shared_ptr service); +void RunServer(std::shared_ptr service); class ListenAndServOp : public framework::OperatorBase { public: @@ -46,10 +48,12 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, - framework::BlockDesc* prefetch_block) const; + const std::vector& prefetch_block_id_list, + const int checkpoint_point_block_id) const; void RunAsyncLoop(framework::Executor* executor, - framework::ProgramDesc* program) const; + framework::ProgramDesc* program, + framework::Scope* recv_scope) const; void SavePort() const; @@ -61,10 +65,13 @@ class ListenAndServOp : public framework::OperatorBase { const platform::Place& dev_place) const override; protected: - mutable std::shared_ptr rpc_service_; - mutable std::shared_ptr request_send_handler_; - mutable std::shared_ptr request_get_handler_; - mutable std::shared_ptr request_prefetch_handler_; + mutable std::shared_ptr rpc_service_; + mutable std::shared_ptr request_send_handler_; + mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_prefetch_handler_; + mutable std::shared_ptr + request_checkpoint_handler_; mutable std::shared_ptr server_thread_; }; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 93f45cff8a26201b1fbb1c44141e125a67c44037..ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -34,6 +34,8 @@ class LoadOp : public framework::OperatorBase { auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); platform::RecordEvent record_event(Type(), dev_ctx); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. auto filename = Attr("file_path"); std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", @@ -44,9 +46,25 @@ class LoadOp : public framework::OperatorBase { PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", out_var_name); - auto *tensor = out_var->GetMutable(); + if (out_var->IsType()) { + LoadLodTensor(fin, place, out_var); + } else if (out_var->IsType()) { + LoadSelectedRows(fin, place, out_var); + } else { + PADDLE_ENFORCE( + false, + "Load only support LoDTensor and SelectedRows, %s has wrong type", + out_var_name); + } + } - DeserializeFromStream(fin, tensor, *dev_ctx); + void LoadLodTensor(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + auto *tensor = var->GetMutable(); + DeserializeFromStream(fin, tensor, dev_ctx); auto load_as_fp16 = Attr("load_as_fp16"); auto in_dtype = framework::ToDataType(tensor->type()); @@ -63,36 +81,40 @@ class LoadOp : public framework::OperatorBase { &fp16_tensor); // reset output tensor - out_var->Clear(); - tensor = out_var->GetMutable(); + var->Clear(); + tensor = var->GetMutable(); tensor->set_lod(fp16_tensor.lod()); tensor->ShareDataWith(fp16_tensor); } } + + void LoadSelectedRows(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + auto *selectedRows = var->GetMutable(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + } }; class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddOutput("Out", "The LoDTensor / SelectedRows need to be loaded"); AddAttr( "load_as_fp16", - "(boolean, default false)" "If true, the tensor will be first loaded and then " "converted to float16 data type. Otherwise, the tensor will be " - "directly loaded without data type conversion.") + "directly loaded without data type conversion. Default is false.") .SetDefault(false); AddAttr("file_path", - "(string) " - "Variable will be loaded from \"file_path\".") + R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment(R"DOC( -Load Operator. - -Load operator will load a tensor variable from disk file. - -)DOC"); + AddComment( + "Load operator will load a LoDTensor / SelectedRows variable from disk " + "file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc index db109f5cd053d84718ac85bd4693ecece12ce172..26970db8d2af62bb06fce4eb1a1f21fd41617bd1 100644 --- a/paddle/fluid/operators/logical_op.cc +++ b/paddle/fluid/operators/logical_op.cc @@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, - "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); + "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b9cd7fb7019b738098a8649f23277afd40e938..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -124,16 +124,17 @@ namespace { framework::OpKernelType GetExpectedLRNKernel( const framework::ExecutionContext& ctx) { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 4751e3e8025e51a687f8fcfd25e603b61e762f6d..3225bf9bb63d57969ce9ae0e4a74e8f466c8c2d0 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -184,34 +184,32 @@ Long-Short Term Memory (LSTM) Operator. The defalut implementation is diagonal/peephole connection (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: -$$ -i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ +$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$ -f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ +$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$ -\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ +$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$ -o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ +$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$ -c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ -h_t = o_t \odot act_h(c_t) -$$ +$$ h_t = o_t \\odot act_h(c_t) $$ -where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix -of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ -are diagonal weight matrices for peephole connections. In our implementation, -we use vectors to reprenset these diagonal weight matrices. The b terms -denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ -is the non-line activations, such as logistic sigmoid function, and -$i, f, o$ and $c$ are the input gate, forget gate, output gate, -and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector $h$. - -The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ -are the cell input and cell output activation functions and `tanh` is usually -used for them. $\tilde{c_t}$ is also called candidate hidden state, -which is computed based on the current input and the previous hidden state. +- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix + of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ + are diagonal weight matrices for peephole connections. In our implementation, + we use vectors to reprenset these diagonal weight matrices. +- The b terms denote bias vectors ($b_i$ is the input gate bias vector). +- $\sigma$ is the non-line activations, such as logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- $act_g$ and $act_h$ are the cell input and cell output activation functions + and `tanh` is usually used for them. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. Set `use_peepholes` False to disable peephole connection. The formula is omitted here, please refer to the paper diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 1a37cb39d56066b8380338b9710a441e41518c39..a907d6a71b7a16983e601073b039b48406853a0b 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -18,15 +18,15 @@ #include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_MKLML -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { @@ -46,6 +46,18 @@ namespace paddle { namespace operators { namespace math { +static void SetNumThreads(int num_threads) { +#ifdef PADDLE_USE_OPENBLAS + int real_num_threads = num_threads > 1 ? num_threads : 1; + openblas_set_num_threads(real_num_threads); +#elif defined(PADDLE_WITH_MKLML) + int real_num_threads = num_threads > 1 ? num_threads : 1; + platform::dynload::MKL_Set_Num_Threads(real_num_threads); +#else + PADDLE_ENFORCE(false, "To be implemented."); +#endif +} + /** * Matrix Descriptor of a memory buffer. * diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index ae20406bc21d5e08359be8295cd98495dda7813b..2ce94cfc93823aa891114ef8fd1e851727ebc623 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,61 +22,109 @@ namespace math { template struct CBlas; +#ifdef PADDLE_WITH_MKLML template <> struct CBlas { template static void GEMM(ARGS... args) { - cblas_sgemm(args...); + platform::dynload::cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_saxpy(args...); + platform::dynload::cblas_saxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + platform::dynload::cblas_sgemv(args...); + } + + template + static void GEMM_BATCH(ARGS... args) { + platform::dynload::cblas_sgemm_batch(args...); } -#ifdef PADDLE_WITH_MKLML template static void VADD(ARGS... args) { - vsAdd(args...); + platform::dynload::vsAdd(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + platform::dynload::cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + platform::dynload::cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { - cblas_scopy(args...); + platform::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - cblas_sgemv(args...); + platform::dynload::cblas_dgemv(args...); } -#ifdef PADDLE_WITH_MKLML template static void GEMM_BATCH(ARGS... args) { - cblas_sgemm_batch(args...); + platform::dynload::cblas_dgemm_batch(args...); + } + + template + static void VADD(ARGS... args) { + platform::dynload::vdAdd(args...); } -#endif }; +#else + template <> -struct CBlas { +struct CBlas { template static void GEMM(ARGS... args) { - cblas_dgemm(args...); + cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_daxpy(args...); + cblas_saxpy(args...); } -#ifdef PADDLE_WITH_MKLML template - static void VADD(ARGS... args) { - vdAdd(args...); + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_sgemv(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { @@ -87,15 +135,8 @@ struct CBlas { static void GEMV(ARGS... args) { cblas_dgemv(args...); } - -#ifdef PADDLE_WITH_MKLML - template - static void GEMM_BATCH(ARGS... args) { - cblas_dgemm_batch(args...); - } -#endif }; - +#endif template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index cc69212466b72f3fa82e8f5f58b4f3229dab28ec..55c8a472aca7fe700ef6a3f96bed1496d7b12b80 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -70,21 +70,23 @@ template class ConcatGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const int axis, - std::vector* outputs) { + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking - int num = outputs->size(); + size_t num = outputs->size(); int input_rows = 1; - auto dim_0 = outputs->at(0).dims(); + auto dim_0 = ref_inputs[0]->dims(); for (int i = 0; i < axis; ++i) { input_rows *= dim_0[i]; } + int input_cols = 0; std::vector output_cols(outputs->size()); - for (int i = 0; i < num; ++i) { - int t_cols = outputs->at(i).numel() / input_rows; + for (size_t i = 0; i < num; ++i) { + int t_cols = ref_inputs[i]->numel() / input_rows; input_cols += t_cols; output_cols[i] = t_cols; } @@ -94,11 +96,14 @@ class ConcatGradFunctor { for (int k = 0; k < input_rows; ++k) { const T* src_ptr = input.data() + k * input_cols; int col_idx = 0; - for (int j = 0; j < num; ++j) { + for (size_t j = 0; j < num; ++j) { int col_len = output_cols[j]; - T* dst_ptr = outputs->at(j).data() + k * col_len; - memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, - sizeof(T) * col_len); + auto* out_tensor = outputs->at(j); + if (out_tensor != nullptr) { + T* dst_ptr = out_tensor->data() + k * col_len; + memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, + sizeof(T) * col_len); + } col_idx += col_len; } } diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 4285d38dcd6a4124543cdd2246c82a8203f5a281..5863d74fca21de8b77bc208fb95d8fd52562f7a7 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -22,43 +22,24 @@ namespace paddle { namespace operators { namespace math { -template -__device__ T upper_bound(const T* first, T count, T val) { - const T* orig = first; - const T* it = nullptr; - T step = 0; - while (count > 0) { - it = first; - step = count / 2; - it += step; - if (!(val < *it)) { - first = ++it; - count -= step + 1; - } else { - count = step; - } - } - return first - orig; -} - template __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, const int output_rows, const int output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(input_cols, col_size, tid_x) - 1; - - int curr_offset = input_cols[segment]; - int curr_segment = segment; + int curr_segment = 0; + int curr_offset = input_cols[0]; for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { + int curr_col_offset = input_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { curr_offset = curr_col_offset; ++curr_segment; + curr_col_offset = input_cols[curr_segment + 1]; } int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; + T* input_ptr = inputs[curr_segment]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) @@ -89,23 +70,25 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, const int in_col, const int* out_cols, int out_cols_size, T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(out_cols, out_cols_size, tid_x) - 1; - int curr_offset = out_cols[segment]; - int curr_segment = segment; + int curr_segment = 0; + int curr_offset = out_cols[0]; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { curr_offset = curr_col_offset; ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; } int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; T* output_ptr = outputs_data[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input_data[tid_y * in_col + tid_x]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } } } @@ -118,10 +101,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, int split = tid_x / fixed_out_col; int in_offset = tid_x - split * fixed_out_col; T* output_ptr = outputs_data[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * fixed_out_col + in_offset] = - input_data[tid_y * in_col + tid_x]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } } } @@ -203,17 +188,18 @@ template class ConcatGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const int axis, - std::vector* outputs) { + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking int o_num = outputs->size(); int out_row = 1; - auto dim_0 = outputs->at(0).dims(); + auto dim_0 = ref_inputs[0]->dims(); for (int i = 0; i < axis; ++i) { out_row *= dim_0[i]; } - int out_col = outputs->at(0).numel() / out_row; + int out0_col = ref_inputs[0]->numel() / out_row; int in_col = 0, in_row = out_row; bool sameShape = true; @@ -223,13 +209,17 @@ class ConcatGradFunctor { outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { - int t_col = outputs->at(i).numel() / out_row; + int t_col = ref_inputs.at(i)->numel() / out_row; if (sameShape) { - if (t_col != out_col) sameShape = false; + if (t_col != out0_col) sameShape = false; } in_col += t_col; outputs_cols[i + 1] = in_col; - outputs_ptr[i] = outputs->at(i).data(); + if (outputs->at(i) != nullptr) { + outputs_ptr[i] = outputs->at(i)->data(); + } else { + outputs_ptr[i] = nullptr; + } } T** dev_out_gpu_data = @@ -255,7 +245,7 @@ class ConcatGradFunctor { if (sameShape) { KernelConcatGrad<<>>( - input.data(), in_row, in_col, out_col, dev_out_gpu_data); + input.data(), in_row, in_col, out0_col, dev_out_gpu_data); } else { const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); KernelConcatGrad<<>>( diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index 041ce8bf8a2e9528a004c076ead4471a3837c1a6..9e080f2e8be23768dcea47b577043beef37b2eaf 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -57,7 +57,8 @@ template class ConcatGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - const int axis, std::vector* outputs); + const std::vector& ref_inputs, + const int axis, std::vector* outputs); }; } // namespace math diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index d39154c6f88d6d17c1719eb9a5b048211f4bb52b..c3387be6daa3bd34a6e3410ced23fce5d65f2cf7 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -30,6 +30,7 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 3719a264e90ea7d1a99eb9589ce4fd0d8e074781..b545671b43d3a453ab03e4774427179617f62db0 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) { paddle::platform::CPUDeviceContext context(*cpu_place); GetBlas(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); + delete cpu_place; + cpu_place = NULL; EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index 8e508b68eeab69a4595904dcc3ea0a541d9ab6e6..b1e69f375d3274aade3184af02f7f914dba5db71 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("RankTable", "The lod_rank_table."); - AddOutput("Out", "The max sequence length."); - AddComment( - R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + AddInput("RankTable", "Input variable which is a LoDRankTable object"); + AddOutput("Out", "The max sequence length"); + AddComment(R"DOC( + Given a LoDRankTable object, this layer returns the max length of + a batch of sequences. In fact, a LoDRankTable object contains a list of + tuples() and the list is already sorted by + sequence length in descending order, so the operator just returns the + sequence length of the first tuple element +)DOC"); } }; diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a60f245f53e342fd9c1382fdda33a011a7fb06d6 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mean_iou_op.h" + +namespace paddle { +namespace operators { + +class MeanIoUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predictions"), + "Input (Predictions) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input (labels) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"), + "Output (OutMeanIou) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutWrong"), + "Output (OutWrong) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"), + "Output (OutWrong) of MeanIoU op should not be null."); + + int64_t num_classes = + static_cast(ctx->Attrs().Get("num_classes")); + + ctx->SetOutputDim("OutMeanIou", {1}); + ctx->SetOutputDim("OutWrong", {num_classes}); + ctx->SetOutputDim("OutCorrect", {num_classes}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Predictions")->type()), + ctx.GetPlace()); + } +}; + +class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Predictions", + "(Tensor), A Tensor of prediction results for semantic labels" + " with type int32 or int64. The rank should be greater than 1."); + AddInput( + "Labels", + "(Tensor), A Tensor of ground truth labels with type int32 or int64." + "Its shape should be the same as Input(Predictions)."); + AddInput("InWrongs", + "(vector), A list of Tensor with shape " + "[num_classes]. They are used to collect wrong number among " + "batches. Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddInput( + "InCorrects", + "(vector), A list of Tensor with shape " + "[num_classes]. They are used to collect correct number among batches. " + "Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddInput("InMeanIou", + "(vector), A list of Tensor that Output(mean_iou) should " + "be added to. Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddOutput("OutMeanIou", + "(vector), A Tensor representing the" + " mean intersection-over-union with shape [1]."); + AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. "); + AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. "); + AddAttr("num_classes", "(int), The possible number of labels."); + + AddComment(R"DOC( +mean-IOU Operator. +Mean Intersection-Over-Union is a common evaluation metric for +semantic image segmentation, which first computes the IOU for each +semantic class and then computes the average over classes. +IOU is defined as follows: + IOU = true_positive / (true_positive + false_positive + false_negative). +It is based on pixel level area while "IOU Similarity Operator" +is based on area of rectangle. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel, + ops::MeanIoUKernel, + ops::MeanIoUKernel); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..83bb4dde46fa241affad3788e3381b6ecd8aa098 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/mean_iou_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void CountCUDAKernel(const int num_classes, const int count, + const T* predictions, const T* labels, + int* wrong, int* correct) { + extern __shared__ int blcok_cache[]; + int* wrong_c = blcok_cache; + int* correct_c = blcok_cache + num_classes; + // init cache + for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) { + blcok_cache[i] = 0; + } + __syncthreads(); + + T pred; + T label; + CUDA_1D_KERNEL_LOOP(i, count) { + pred = predictions[i]; + label = labels[i]; + if (pred == label) { + atomicAdd(correct_c + pred, 1); + } else { + atomicAdd(wrong_c + pred, 1); + atomicAdd(wrong_c + label, 1); + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < num_classes; i += blockDim.x) { + atomicAdd(wrong + i, wrong_c[i]); + atomicAdd(correct + i, correct_c[i]); + } +} + +__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong, + int* correct, float* ious, float* iou) { + __shared__ int valid_count_c; + if (threadIdx.x == 0) { + valid_count_c = 0; + } + __syncthreads(); + CUDA_1D_KERNEL_LOOP(i, num_classes) { + int wrong_n = wrong[i]; + int correct_n = correct[i]; + int denominator = wrong_n + correct_n; + if (denominator > 0) { + atomicAdd(&valid_count_c, 1); + ious[i] = static_cast(correct_n) / denominator; + } else { + ious[i] = 0; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + float iou_sum = 0; + for (int i = 0; i < num_classes; ++i) { + iou_sum += ious[i]; + } + iou[0] += iou_sum / valid_count_c; + } +} + +template +class MeanIoUCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context() + .eigen_device(); + // get input and output tensor + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); + int num_classes = static_cast(ctx.Attr("num_classes")); + + // Get data ptr + const T* predictions_data = predictions->data(); + const T* labels_data = labels->data(); + int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); + int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); + float* out_mean_iou_data = + out_mean_iou->mutable_data(ctx.GetPlace()); + + // Get Eigen tensor + auto out_mean_iou_t = EigenTensor::From(*out_mean_iou); + auto out_wrong_t = EigenTensor::From(*out_wrong); + auto out_correct_t = EigenTensor::From(*out_correct); + + // Temporary tensor + Tensor ious; + float* ious_data = ious.mutable_data( + {static_cast(num_classes)}, ctx.GetPlace()); + auto ious_t = EigenTensor::From(ious); + + // Init out_wrong, out_correct and out_mean_iou + out_wrong_t.device(place) = out_wrong_t.constant(0); + out_correct_t.device(place) = out_correct_t.constant(0); + out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f); + + // collect pre wrong, correct and mean_iou + auto in_mean_ious = ctx.MultiInput("InMeanIou"); + for (int i = 0; i < in_mean_ious.size(); ++i) { + out_mean_iou_t.device(place) += + EigenTensor::From(*in_mean_ious[i]); + } + auto in_wrongs = ctx.MultiInput("InWrongs"); + for (int i = 0; i < in_wrongs.size(); ++i) { + out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); + } + auto in_corrects = ctx.MultiInput("InCorrects"); + for (int i = 0; i < in_corrects.size(); ++i) { + out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); + } + // compute + auto stream = ctx.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + int grid = (predictions->numel() + block - 1) / block; + int cache_size = (num_classes * 2 + 1) * sizeof(int); + CountCUDAKernel<<>>( + num_classes, predictions->numel(), predictions_data, labels_data, + out_wrong_data, out_correct_data); + ctx.device_context().Wait(); + ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data, + out_correct_data, ious_data, + out_mean_iou_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel, + ops::MeanIoUCUDAOpKernel, + ops::MeanIoUCUDAOpKernel); diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9fa00e60e05504e0bb8658c6908e4d4ac46b2ca4 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +using EigenTensor = framework::EigenTensor; + +template +class MeanIoUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context() + .eigen_device(); + // get input and output tensor + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); + int num_classes = static_cast(ctx.Attr("num_classes")); + + // get data ptr + const T* predictions_data = predictions->data(); + const T* labels_data = labels->data(); + float* out_mean_iou_data = + out_mean_iou->mutable_data(ctx.GetPlace()); + int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); + int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); + + // get eigen tensor + auto out_mean_iou_t = EigenTensor::From(*out_mean_iou); + auto out_wrong_t = EigenTensor::From(*out_wrong); + auto out_correct_t = EigenTensor::From(*out_correct); + + // Tmp tensor + Tensor denominator; + Tensor valid_count; + Tensor iou_sum; + + // get data ptr of tmp tensor + int* denominator_data = denominator.mutable_data( + {static_cast(num_classes)}, ctx.GetPlace()); + int* valid_count_data = valid_count.mutable_data({1}, ctx.GetPlace()); + float* iou_sum_data = iou_sum.mutable_data({1}, ctx.GetPlace()); + + // get eigen tensor of tmp tensor + auto denominator_t = EigenTensor::From(denominator); + auto valid_count_t = EigenTensor::From(valid_count); + auto iou_sum_t = EigenTensor::From(iou_sum); + + // init out_wrong, out_correct and out_mean_iou + out_wrong_t = out_wrong_t.constant(0); + out_correct_t = out_correct_t.constant(0); + out_mean_iou_t = out_mean_iou_t.constant(0); + + // collect pre wrong, correct and mean_iou + auto in_mean_ious = ctx.MultiInput("InMeanIou"); + for (size_t i = 0; i < in_mean_ious.size(); ++i) { + out_mean_iou_t.device(place) += + EigenTensor::From(*in_mean_ious[i]); + } + auto in_wrongs = ctx.MultiInput("InWrongs"); + for (size_t i = 0; i < in_wrongs.size(); ++i) { + out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); + } + auto in_corrects = ctx.MultiInput("InCorrects"); + for (size_t i = 0; i < in_corrects.size(); ++i) { + out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); + } + + // compute + for (int64_t i = 0; i < predictions->numel(); ++i) { + if (predictions_data[i] == labels_data[i]) { + out_correct_data[predictions_data[i]] += 1; + } else { + out_wrong_data[labels_data[i]] += 1; + out_wrong_data[predictions_data[i]] += 1; + } + } + + denominator_t = out_wrong_t + out_correct_t; + valid_count_t = + (denominator_t > denominator_t.constant(0.0f)).cast().sum(); + + for (int i = 0; i < num_classes; ++i) { + if (denominator_data[i] == 0) { + denominator_data[i] = 1; + } + } + + iou_sum_t = + (out_correct_t.cast() / denominator_t.cast()).sum(); + out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 74477eb439dc202c3f5f17fdf3e1647bc5c23512..9e0bebd17c02a3ce010b77142757b8789cfbcdd9 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel { class MeanOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "The input of mean op"); - AddOutput("Out", "The output of mean op"); + AddInput("X", "(Tensor) The input of mean op"); + AddOutput("Out", "(Tensor) The output of mean op").Reuse("X"); AddComment(R"DOC( -Mean Operator. - -Out is a scalar which is the mean of all elements in X. +Mean Operator calculates the mean of all elements in X. )DOC"); } diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6ec4ab047d5e91625e646fd26108d2e477cdce5 --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_ids_op.h" + +namespace paddle { +namespace operators { + +class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}"); + AddInput( + "X", + "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the " + "size of embedding table") + .AsDuplicable(); + AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors."); + + AddComment(R"DOC( +Merge multi LoDTensor's into one according to Ids's shard num. + + +split_ids_op -> prefetch_op -> merge_ids_op + + +merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op + will split input Ids into multiple tensors according to Id's shard number. +prefetch_op will send them to parameter server to prefetch embedding value +back. During split, the order of ids is disordered. In merge_ids_op we use +the original Ids to restore the order of the fetched embedding value and + also pass the lod information to the merged output. + + +Example: + + Ids = [1,2,3,4,5,6] # 3 shared + +split_ids_op -> + + Id0 = [3, 6] # id % 3 == 0 + Id1 = [1, 4] # id % 3 == 1 + Id2 = [2, 5] # id % 3 == 2 + +prefetch_op -> + + X0 = [[0.3 0.3] # 3 + [0.6 0.6]] # 6 + X1 = [[0.1 0.1] # 1 + [0.4 0.4]] # 4 + X2 = [[0.2 0.2] # 2 + [0.5 0.5]] # 5 + +merge_ids_op -> + + Out = [[0.1 0.1] # 1 + [0.2 0.2] # 2 + [0.3 0.3] # 3 + [0.4 0.4] # 4 + [0.5 0.5] # 5 + [0.6 0.6]] # 6 +)DOC"); + } +}; + +class MergeIdsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids."); + PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out."); + + auto ids_var_type = ctx->GetInputsVarType("Ids").front(); + auto ids_dims = ctx->GetInputDim("Ids"); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } + auto x_var_type = ctx->GetInputsVarType("X"); + for (auto &var_type : x_var_type) { + PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR, + "input X only support lod tensors"); + } + ctx->ShareLoD("Ids", "Out"); + } + + private: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.MultiInput("X").front()->type()), + ctx.GetPlace()); + } +}; + +class MergeIdsOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(input_var->GetType()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker, + ops::MergeIdsOpInferVarType); +REGISTER_OP_CPU_KERNEL( + merge_ids, ops::MergeIdsOpKernel); diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h new file mode 100644 index 0000000000000000000000000000000000000000..83712a8519c6817151e1922c606c0fdd4682a2db --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeIdsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + if (!platform::is_cpu_place(place)) { + PADDLE_THROW("MergeIds do not support GPU kernel"); + } + VLOG(3) << "run in MergeIdsOpKernel"; + + const auto *ids_var = ctx.InputVar("Ids"); + PADDLE_ENFORCE(ids_var->IsType(), + "only support to merge Ids of LoDTensor"); + + const auto &ids_tensor = ids_var->Get(); + const auto &ids_dims = ids_tensor.dims(); + const int64_t *ids = ids_tensor.data(); + + auto x_tensors = ctx.MultiInput("X"); + + auto *out = ctx.Output("Out"); + + int batch_size = 0; + int embedding_size = 0; + for (auto &input : x_tensors) { + if (framework::product(input->dims()) != 0) { + if (embedding_size == 0) { + embedding_size = input->dims()[1]; + } + PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1], + "embedding size of all input should be the same"); + batch_size += input->dims()[0]; + } + } + PADDLE_ENFORCE_EQ( + batch_size, ids_dims[0], + "the batch size of ids and merged embedding value should be the same"); + + const size_t shard_num = x_tensors.size(); + + if (shard_num == 1) { + VLOG(3) << "only one shard, we can copy the data directly"; + TensorCopy(*x_tensors[0], place, out); + } else { + std::vector in_indexs(shard_num, 0); + auto *out_data = out->mutable_data( + framework::make_ddim({batch_size, embedding_size}), place); + // copy data from ins[shard_num] to out. + for (int i = 0; i < ids_dims[0]; ++i) { + int64_t id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + int index = in_indexs[shard_id]; + memcpy(out_data + embedding_size * i, + x_tensors[shard_id]->data() + index * embedding_size, + sizeof(T) * embedding_size); + in_indexs[shard_id] += 1; + } + + for (size_t i = 0; i < shard_num; ++i) { + PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0], + "after merge, all data in x_tensor should be used"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index a4363fd25d57edb5c2509904a1f55634832613be..18ad46cb5eeeab2169136e40cebdaa53c0bfd587 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel { class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Ids", "The index tensor of multiplex operator."); - AddInput("X", "The candidate tensors of multiplex operator.") + AddInput("Ids", + "Tensor, index variable which is a 2-D tensor with shape " + "[M, 1] where M is the batch size."); + AddInput("X", + "A list of variables to gather from. All variables have the same " + "shape and the rank is at least 2.") .AsDuplicable(); AddOutput("Out", "The output tensor of multiplex operator."); AddComment(R"DOC( -Multiplex Operator. - -Multiplex multiple tensors according to the index provided by the index tensor. - -Ids: the index tensor. -X[0 : N - 1]: the candidate tensors for output (N >= 2). -For each index i from 0 to batchSize - 1, the output is the i-th row of the +Referring to the given index variable, this layer selects rows from the +input variables to construct a multiplex variable. Assuming that there are +:math:`m` input variables and :math:`I_i` represents the i-th input +variable and :math:`i` is in [0, :math:`m`). All input variables are +tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. +Please note that rank of the input tensor should be at least 2. Each input +variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`] +where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2` +* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input +variable. The given index variable should be a 2-D tensor with shape +[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable. +Then the output variable will be a tensor with shape [:math:`d_0`, +:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D +matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th +row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`. + +* Ids: the index tensor. + +* X[0 : N - 1]: the candidate tensors for output (N >= 2). + +* For each index i from 0 to batchSize - 1, the output is the i-th row of the the (Ids[i])-th tensor. For i-th row of the output tensor: -$$y[i] = x_{k}[i]$$ +$$ +y[i] = x_{k}[i] +$$ -where `y` is the output tensor, `x_{k}` is the k-th input tensor, -and `k = Ids[i]`. +where $y$ is the output tensor, $x_{k}$ is the k-th input tensor, +and $k = Ids[i]$. )DOC"); } diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 06092e680a1efbef379ccf40fdf476769f820429..e471f04662a1fa3e8e77a2db37f0da4521682018 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -128,8 +128,10 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "user should avoid setting this attribute.") .SetDefault({}); AddComment(R"DOC( -Compute and return the noise-contrastive estimation training loss. -See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +Compute and return the noise-contrastive estimation training loss. See +`Noise-contrastive estimation: A new estimation principle for unnormalized +statistical models + `_. By default this operator uses a uniform distribution for sampling. )DOC"); } diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index cdbc975c02214721ceae3a338741101ef32d7ee9..aa19c62c83648814e86b1e7062424be3693e4b98 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -16,40 +16,34 @@ limitations under the License. */ namespace paddle { namespace operators { -template class NormOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput( - "X", - "(Tensor) The input tensor of norm operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddInput("Scale", - "(Tensor) The input tensor of norm operator. " - "The format of input tensor is C * 1."); - AddAttr("epsilon", - "(float, default 1e-10) Constant " - "for numerical stability.") + AddInput("X", "(Tensor) A tensor of rank >= axis."); + AddAttr("axis", + "The axis on which to apply normalization. If axis < 0, " + "the dimension to normalization is rank(X) + axis. -1 is " + "the last dimension."); + AddAttr("epsilon", + "(float, default 1e-10) The epsilon value is used " + "to avoid division by zero.") .SetDefault(1.0e-10f); - AddOutput("Out", - "(Tensor) The output tensor of norm operator." - "N * M." - "M = C * H * W"); + AddOutput("Norm", + "(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will " + "be used in backward kernel.") + .AsIntermediate(); + AddOutput("Out", "(Tensor) A tensor of the same shape as X."); AddComment(R"DOC( - "Input shape: $(N, C, H, W)$ - Scale shape: $(C, 1)$ - Output shape: $(N, C, H, W)$ - Where - forward - $$ - [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}] - $$ - backward - $$ - \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}} - $$ - )DOC"); + +Given a tensor, apply 2-normalization along the provided axis. + +$$ +y = \frac{x}{ \sqrt{\sum {x^2} + epsion }} +$$ + +where, $\sum {x^2}$ is calculated along the `axis` dimension. + +)DOC"); } }; @@ -58,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of NormOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Scale"), - "Input(Scale) of NormOp" - "should not be null."); + "Input(X) of NormOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of NormOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", in_x_dims); + auto xdim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", xdim); + int axis = ctx->Attrs().Get("axis"); + if (axis < 0) axis = xdim.size() + axis; + xdim[axis] = 1; + ctx->SetOutputDim("Norm", xdim); } }; @@ -84,12 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); -REGISTER_OP_CPU_KERNEL( - norm, ops::NormKernel, - ops::NormKernel); -REGISTER_OP_CPU_KERNEL( - norm_grad, ops::NormGradKernel, - ops::NormGradKernel); +REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu index d1d9be50742b54a3b6f068fd43ec4b16696183bf..1d0021d33ff9ee65c3366183466b94266e6c2999 100644 --- a/paddle/fluid/operators/norm_op.cu +++ b/paddle/fluid/operators/norm_op.cu @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/fluid/operators/norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - norm, ops::NormKernel, - ops::NormKernel); -REGISTER_OP_CUDA_KERNEL( - norm_grad, ops::NormGradKernel, - ops::NormGradKernel); +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 0ad29e8a0385c46a07842930378ed7a040564437..3167bdc8ac718b23435690577e4163826d14a332 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -19,156 +19,110 @@ limitations under the License. */ namespace paddle { namespace operators { -template +inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template class NormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* scale = context.Input("Scale"); - auto* out = context.Output("Out"); - auto epsilon = static_cast(context.Attr("epsilon")); - out->mutable_data(context.GetPlace()); - int batch_size = in_x->dims()[0]; - int channels = in_x->dims()[1]; - int height = in_x->dims()[2]; - int width = in_x->dims()[3]; - int fea_len = height * width; - auto* place = - context.template device_context().eigen_device(); - auto x = - framework::EigenMatrix::From( - *in_x, framework::make_ddim({batch_size, fea_len * channels})); - // get square - framework::Tensor x_square; - x_square.mutable_data(in_x->dims(), context.GetPlace()); - auto x_square_eigen = - framework::EigenMatrix::From( - x_square, framework::make_ddim({batch_size, fea_len * channels})); - x_square_eigen.device(*place) = x.square(); - auto scale_eigen = - framework::EigenVector::Flatten( - *scale); - for (int n = 0; n < batch_size; ++n) { - framework::Tensor in_x_batch = in_x->Slice(n, n + 1); - auto in_x_batch_eigen = - framework::EigenMatrix::From( - in_x_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor x_square_batch = x_square.Slice(n, n + 1); - auto x_square_batch_eigen = - framework::EigenMatrix::From( - x_square_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor out_batch = out->Slice(n, n + 1); - auto out_batch_eigen = - framework::EigenMatrix::From( - out_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor tmp_tensor; - tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), - context.GetPlace()); - auto tmp = framework::EigenVector::Flatten(tmp_tensor); - // get colsum and sqrt , inverse - auto dim = Eigen::array({{0}}); - tmp.device(*place) = x_square_batch_eigen.sum(dim); - tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); - Eigen::array broadcast_dim_col; - broadcast_dim_col[1] = 1; - broadcast_dim_col[0] = channels; - out_batch_eigen.device(*place) = - in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col)); - Eigen::array broadcast_dim_row; - broadcast_dim_row[1] = fea_len; - broadcast_dim_row[0] = 1; - out_batch_eigen.device(*place) = - out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); - } + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* out_y = ctx.Output("Out"); + auto* out_norm = ctx.Output("Norm"); + out_y->mutable_data(ctx.GetPlace()); + out_norm->mutable_data(ctx.GetPlace()); + + auto xdim = in_x->dims(); + auto ndim = out_norm->dims(); + T eps = static_cast(ctx.Attr("epsilon")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto* place = ctx.template device_context().eigen_device(); + + Eigen::DSizes shape(pre, n, post); + Eigen::DSizes norm_shape(pre, post); + + auto x_e = framework::EigenVector::Flatten(*in_x); + auto y_e = framework::EigenVector::Flatten(*out_y); + auto norm_e = framework::EigenVector::Flatten(*out_norm); + auto x = x_e.reshape(shape); + auto y = y_e.reshape(shape); + auto norm = norm_e.reshape(norm_shape); + + Eigen::DSizes rdim(1); + // y = x / sqrt((sum(x * x) + epsilon)) + // norm = sqrt(sum(x * x) + epsilon) + auto sum = x.pow(2).sum(rdim) + eps; + norm.device(*place) = sum.sqrt(); + // y = x / norm + Eigen::DSizes rshape(pre, 1, post); + Eigen::DSizes bcast(1, n, 1); + y.device(*place) = x / norm.reshape(rshape).broadcast(bcast); } }; template class NormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* scale = context.Input("Scale"); - const framework::Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - auto epsilon = static_cast(context.Attr("epsilon")); - framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); - in_x_grad->mutable_data(context.GetPlace()); - int batch_size = in_x->dims()[0]; - int channels = in_x->dims()[1]; - int height = in_x->dims()[2]; - int width = in_x->dims()[3]; - int fea_len = height * width; - auto* place = - context.template device_context().eigen_device(); - - auto scale_eigen = - framework::EigenVector::Flatten( - *scale); - auto x = - framework::EigenMatrix::From( - *in_x, framework::make_ddim({batch_size, fea_len * channels})); - // get square - framework::Tensor x_square; - x_square.mutable_data(in_x->dims(), context.GetPlace()); - auto x_square_eigen = - framework::EigenMatrix::From( - x_square, framework::make_ddim({batch_size, fea_len * channels})); - x_square_eigen.device(*place) = x.square(); - - for (int n = 0; n < batch_size; ++n) { - framework::Tensor in_x_batch = in_x->Slice(n, n + 1); - auto in_x_batch_eigen = - framework::EigenMatrix::From( - in_x_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1); - auto in_g_batch_eigen = - framework::EigenMatrix::From( - in_g_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor x_square_batch = x_square.Slice(n, n + 1); - auto x_square_batch_eigen = - framework::EigenMatrix::From( - x_square_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor outg_batch = out_grad->Slice(n, n + 1); - auto outg_batch_eigen = - framework::EigenMatrix::From( - outg_batch, framework::make_ddim({channels, fea_len})); - - framework::Tensor tmp_tensor; - tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), - context.GetPlace()); - auto tmp_eigen = - framework::EigenVector::Flatten(tmp_tensor); - auto dim = Eigen::array({{0}}); - tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim); - framework::Tensor norm_tmp_tensor; - norm_tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), - context.GetPlace()); - auto norm_tmp_eigen = - framework::EigenVector::Flatten(norm_tmp_tensor); - norm_tmp_eigen.device(*place) = - (x_square_batch_eigen.sum(dim) + epsilon).sqrt(); - Eigen::array broadcast_dim_col; - broadcast_dim_col[1] = 1; - broadcast_dim_col[0] = channels; - in_g_batch_eigen.device(*place) = - in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col); - in_g_batch_eigen.device(*place) = - in_g_batch_eigen / - (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col); - in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen; - // outg_batch_eigen + (in_g_batch_eigen * -1); - in_g_batch_eigen.device(*place) = - in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col); - Eigen::array broadcast_dim_row; - broadcast_dim_row[1] = fea_len; - broadcast_dim_row[0] = 1; - in_g_batch_eigen.device(*place) = - in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); - } + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* in_norm = ctx.Input("Norm"); + auto* in_dy = ctx.Input(framework::GradVarName("Out")); + auto* out_dx = ctx.Output(framework::GradVarName("X")); + out_dx->mutable_data(ctx.GetPlace()); + + auto xdim = in_x->dims(); + int axis = ctx.Attr("axis"); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto* place = ctx.template device_context().eigen_device(); + + auto x_e = framework::EigenVector::Flatten(*in_x); + auto dy_e = framework::EigenVector::Flatten(*in_dy); + auto norm_e = framework::EigenVector::Flatten(*in_norm); + auto dx_e = framework::EigenVector::Flatten(*out_dx); + + Eigen::DSizes shape(pre, n, post); + Eigen::DSizes norm_shape(pre, post); + auto x = x_e.reshape(shape); + auto dy = dy_e.reshape(shape); + auto norm = norm_e.reshape(norm_shape); + auto dx = dx_e.reshape(shape); + + framework::Tensor rsum; + rsum.mutable_data({pre, post}, ctx.GetPlace()); + auto sum = framework::EigenTensor::From(rsum); + + Eigen::DSizes rdim(1); + Eigen::DSizes bcast(1, n, 1); + Eigen::DSizes rshape(pre, 1, post); + + // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)] + // = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x)) + // = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x)) + // 1. sum = sum(x*dy) + sum.device(*place) = (x * dy).sum(rdim); + // 2. dx = x * sum + dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x; + // 3. dx / (sum(x*x) + e) + // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward. + dx.device(*place) = dx / norm.pow(2).broadcast(bcast); + // 4. [dy - dx] / sqrt(sum(x*x)) + dx.device(*place) = (dy - dx) / norm.broadcast(bcast); } }; } // namespace operators diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 1012640d5e2052e4f347ad458cea9072a004f334..c9744db3d0654ef63357963d9a9a3cb946f56e2d 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, - framework::AttributeMap{}); + framework::AttributeMap{{"use_mkldnn", {false}}}); VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); sum_op->Run(*sub_scopes[0], places[0]); WaitOnPlace(places[0]); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index d60a99994edc926456706ff6a3ba998a3e5e7dd5..be55bc43b14f1e6211f71b4080d1676838ad508c 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -135,7 +135,11 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { PoolingMode pooling_mode; if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; + if (FLAGS_cudnn_deterministic) { + pooling_mode = PoolingMode::kMaximumDeterministic; + } else { + pooling_mode = PoolingMode::kMaximum; + } } else { pooling_mode = PoolingMode::kAverage; } diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 60e936298defe7c6ce8a33bdc7de05b52eb950e7..5341187d1ce9400ac34750ab691608e76158ae0d 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,16 +18,24 @@ limitations under the License. */ namespace paddle { namespace operators { -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::pooling_forward; +using framework::DataLayout; +using mkldnn::memory; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial -static std::string gethash(memory::dims& input_dims, std::string& pooling_type, - std::vector& ksize, std::vector& strides, - std::vector& paddings, std::string suffix) { - auto dims2str = [](memory::dims& operand_dims) { +static std::string gethash(const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string& suffix) { + auto dims2str = [](const memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { dstr += std::to_string(operand_dims[i]) + "-"; @@ -52,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when saving info into device context + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -79,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + const std::string key = gethash(src_tz, pooling_type, ksize, strides, paddings, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; @@ -91,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { - // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); - auto src_md = - platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); - std::shared_ptr pool_pd = + std::shared_ptr pool_pd = CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, pooling_type, mkldnn_engine); @@ -113,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // save pool_workspace_memory to be referred in backward path dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - auto pool_src_memory_p = std::make_shared( - memory::primitive_desc{src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); - auto pool_dst_memory_p = std::make_shared( - memory::primitive_desc{dst_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + pool_p = std::make_shared(*pool_pd, *(src_memory.get()), + *(dst_memory.get()), + *workspace_memory); - pool_p = std::make_shared( - *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()), - *workspace_memory); dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; } else { // Primitives already exist auto pool_src_memory_p = @@ -137,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); PADDLE_ENFORCE(pool_dst_memory_p != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; } // push primitive to stream and wait until it's executed std::vector pipeline{*(pool_p.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } private: @@ -191,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN && + in_x->format() != memory::format::format_undef, + "Wrong layout/format set for Input X tensor"); + PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN && + out_grad->format() != memory::format::format_undef, + "Wrong layout/format set for Input output_grad tensor"); + std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -209,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + memory::format in_x_grad_format{memory::format::format_undef}; std::vector diff_src_tz = paddle::framework::vectorize2int(in_x_grad->dims()); @@ -222,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_workspace_memory = key + "@pool_workspace_memory"; + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()}, + mkldnn_engine}, + to_void_cast(out_grad_data)); + + std::shared_ptr diff_src_memory; + std::shared_ptr diff_dst_memory; + auto dst_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find dst_memory in device context"); + + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; auto pool_bwd_p = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_bwd_p)); if (pool_bwd_p == nullptr) { - auto diff_src_md = - platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto diff_dst_md = - platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + // Retrieve src_memory/dst_memory saved in forward pass + auto src_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); // Retrieve pool_pd/pool_workspace_memory from device context auto pool_pd = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_pd)); PADDLE_ENFORCE(pool_pd != nullptr, "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( + auto workspace_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_workspace_memory)); PADDLE_ENFORCE(workspace_memory != nullptr, "Fail to find workspace_memory in device context"); - auto pool_diff_src_memory_p = std::make_shared(memory( - {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); - dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); - - auto pool_diff_dst_memory_p = std::make_shared( - memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data)))); - dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + // create memory descriptors for pooling + auto diff_src_md = src_memory.get()->get_primitive_desc().desc(); + auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc(); auto pool_bwd_desc = mkldnn::pooling_backward::desc( pooling_type == "max" ? mkldnn::algorithm::pooling_max @@ -264,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( pool_bwd_desc, mkldnn_engine, *pool_pd); + // reorder between user_diff_dst and pool diff_dst if needed + diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + diff_src_memory = std::make_shared( + pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data); + + dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); + pool_bwd_p = std::make_shared( - pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, - *(pool_diff_src_memory_p)); + pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, + *(diff_src_memory)); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { // Primitives already exist - auto pool_diff_src_memory_p = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_src_mem_p)); - PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + PADDLE_ENFORCE(diff_src_memory != nullptr, "Fail to find pooling src mem_p in device context"); - auto pool_diff_dst_memory_p = std::static_pointer_cast( + diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); - PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + PADDLE_ENFORCE(diff_dst_memory != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_diff_src_memory_p->set_data_handle( - reinterpret_cast(in_x_grad_data)); - pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + + diff_src_memory->set_data_handle(reinterpret_cast(in_x_grad_data)); + diff_dst_memory->set_data_handle(const_cast(out_grad_data)); + + // reorder between user_diff_dst and pool diff_dst if needed + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } } + in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format; + // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_bwd_p.get())}; + std::vector pipeline; + if (is_diff_dst_reordered) { + pipeline.push_back(reorder_diff_dst); + } + pipeline.push_back(*(pool_bwd_p.get())); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + in_x_grad->set_layout(DataLayout::kMKLDNN); + in_x_grad->set_format(in_x_grad_format); } // Compute() }; } // namespace operators } // namespace paddle +namespace ops = paddle::operators; + REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel); REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNGradOpKernel); + ops::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index f4fb2b132fe8d59cb50f5a1f7359240ac50445fe..f8ad63690e84339da0390d4ddd2db45f25db385a 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); @@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } @@ -147,7 +151,8 @@ void Pool2dOpMaker::Make() { "The format of output tensor is also NCHW, " "where N is batch size, C is the number of channels, " "H is the height of the feature, " - "and W is the width of the feature."); + "and W is the width of the feature.") + .Reuse("X"); AddAttr("pooling_type", "(string), pooling type, can be \"max\" for max-pooling " @@ -199,8 +204,6 @@ void Pool2dOpMaker::Make() { // TODO(dzhwinter): need to registered layout transform function AddComment(R"DOC( -Pool2d Operator. - The pooling2d operation calculates the output based on the input, pooling_type and ksize, strides, paddings parameters. Input(X) and output(Out) are in NCHW format, where N is batch size, C is the @@ -210,19 +213,28 @@ These two elements represent height and width, respectively. The input(X) size and output(Out) size may be different. Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + For ceil_mode = false: $$ - H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ For ceil_mode = true: $$ - H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 + H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 $$ )DOC"); @@ -240,7 +252,8 @@ void Pool3dOpMaker::Make() { "The format of output tensor is also NCDHW, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " - "width of the feature, respectively."); + "width of the feature, respectively.") + .Reuse("X"); AddAttr("pooling_type", "(string) Pooling type, can be \"max\" for max-pooling " diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index e0a9b24ac8978418a1a4ece62286e022bec8b834..8734282fe496b8e90af19abd5549566d62316fc3 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" namespace paddle { @@ -41,19 +41,19 @@ class PrefetchOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - auto rpc_client = detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " << outs[i] << " back"; - rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i], - outs[i]); + rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } }; diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b14b559e31dd422f8ebe4002988a9746dfdf28a2..123fa44fa3ddbc9343b9629be63fdefdf12b4646 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -36,11 +35,16 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Seed", "The random seed."); AddOutput("Out", "The cropped instance batch."); AddOutput("SeedOut", "The random seed after random cropping.") - .AsDispensable(); + .AsIntermediate(); AddAttr>("shape", "The shape of a cropped instance."); + AddAttr("startup_seed", + "If the input 'Seed' is not initialized, the 'startup_seed' " + "will be used to replace it. Even so, the seed after random " + "crop will also be outputed to the 'SeedOut'.") + .SetDefault(0); AddComment(R"DOC( - This operator takes a batch of instance, and do random cropping on each instance. - It means that cropping positions differs on each instance, which is determined + This operator takes a batch of instance, and do random cropping on each instance. + It means that cropping positions differs on each instance, which is determined by an uniform random generator. All cropped instances have the same shape, which is determined by the operator's attribute 'shape'. )DOC"); @@ -50,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { class RandomCropOpInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext* ctx) const override { - auto seed_dim = ctx->GetInputDim("Seed"); - PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1); auto shape = ctx->Attrs().Get>("shape"); auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_GT(x_dim.size(), static_cast(shape.size())); @@ -63,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase { out_dim[x_i] = shape[shape_i]; } ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); - ctx->SetOutputDim("SeedOut", framework::make_ddim({1})); } }; diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index f3261cbdc986b0cc724315c1eb92b8b84e18c742..d68ba9d661698bb0d33b139f5748daec2ead6595 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -142,16 +142,22 @@ template class RandomCropKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& seed_tensor = detail::Ref(ctx.Input("Seed")); int64_t seed = 0; - if (platform::is_cpu_place(seed_tensor.place())) { - seed = *seed_tensor.data(); + auto& seed_tensor = detail::Ref(ctx.Input("Seed")); + if (seed_tensor.IsInitialized()) { + if (platform::is_cpu_place(seed_tensor.place())) { + seed = *seed_tensor.data(); + } else { + LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify " + "your program"; + framework::LoDTensor cpu_seed; + framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); + seed = *cpu_seed.data(); + } } else { - LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify " - "your program"; - framework::LoDTensor cpu_seed; - framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); - seed = *cpu_seed.data(); + VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " + "'startup_seed' instead."; + seed = ctx.Attr("startup_seed"); } auto shape = ctx.Attr>("shape"); auto& x = detail::Ref(ctx.Input("X")); @@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel { engine.discard(functor.prod_batchsize_dims_ * (functor.rank_ - functor.num_batchsize_dims_)); *ctx.Output("SeedOut")->mutable_data( - platform::CPUPlace()) = engine(); + framework::make_ddim({1}), platform::CPUPlace()) = engine(); } }; diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 62532036f86bfb82465ccd9e0ec526299489932a..a39c8a00538875e4e3284898230a6cb0693b7a12 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -24,6 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) +reader_library(create_py_reader_op SRCS create_py_reader_op.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 71684b14176edc8f71efbefa9a7decffc8f3011e..db8cf3b605c9175eeda4548b1e7c8203f26c5d89 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -88,24 +88,29 @@ class BlockingQueue { receive_cv_.notify_all(); } - bool IsClosed() { + bool IsClosed() const { std::lock_guard lock(mutex_); return closed_; } - size_t Cap() { + size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; } + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + private: size_t capacity_; bool closed_; std::deque queue_; - std::mutex mutex_; - std::condition_variable receive_cv_; - std::condition_variable send_cv_; + mutable std::mutex mutex_; + mutable std::condition_variable receive_cv_; + mutable std::condition_variable send_cv_; }; } // namespace reader } // namespace operators diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index 4cc7cbc6e89b0712faf9ad9c51480bce00da15f5..ecbae3894d551186f53625a6cc9cfdb36adc8d2d 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -20,7 +20,7 @@ namespace reader { class BatchReader : public framework::DecoratedReader { public: - BatchReader(ReaderBase* reader, int batch_size) + BatchReader(const std::shared_ptr& reader, int batch_size) : DecoratedReader(reader), batch_size_(batch_size) { buffer_.reserve(batch_size_); } diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 331224a59899b4a7d517ca4f7141fb5b8f4f5168..a75c6d4c567ac93f37b38070421133af305f20a3 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -22,7 +22,8 @@ namespace reader { class CustomReader : public framework::DecoratedReader { public: - CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block, + CustomReader(const std::shared_ptr& reader, + const framework::BlockDesc& sub_block, const std::vector& source_var_names, const std::vector& sink_var_names) : DecoratedReader(reader), @@ -38,6 +39,7 @@ class CustomReader : public framework::DecoratedReader { const framework::ProgramDesc program_; int sub_block_id_; framework::Executor exe_; + framework::Scope scope_; std::vector source_var_names_; std::vector sink_var_names_; @@ -157,23 +159,24 @@ void CustomReader::ReadNext(std::vector* out) { // The scope for CustomReader's sub-block should be independent and shouldn't // be any other computation scope's child. Otherwise, data preprocessing and // compution cannot be concurrent. - framework::Scope scope; + framework::Scope* exe_scope = &scope_.NewScope(); // 1. Copy LoDTensors from underlying reader's output to source variables. for (size_t i = 0; i < source_var_names_.size(); ++i) { - framework::Variable* var = scope.Var(source_var_names_[i]); + framework::Variable* var = exe_scope->Var(source_var_names_[i]); framework::LoDTensor* tensor = var->GetMutable(); tensor->ShareDataWith(underlying_outs[i]); tensor->set_lod(underlying_outs[i].lod()); } // 2. Run the sub-block. - exe_.Run(program_, &scope, sub_block_id_, false, true); + exe_.Run(program_, exe_scope, sub_block_id_, false, true); // 3. Copy LoDTensors from sink variables to out. out->resize(sink_var_names_.size()); for (size_t i = 0; i < sink_var_names_.size(); ++i) { - const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i])) + const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i])) .Get(); framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]); } + scope_.DeleteScope(exe_scope); } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index bc830a2b72e657f79f4c94e24428d38ff2b7c42e..5f734489a81764875988f440696682570ff4d1d7 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -23,18 +23,19 @@ namespace reader { // 'Double buffer' means we shall maintain two batches of input data at the same // time. So the kCacheSize shoul be at least 2. -static constexpr size_t kCacheSize = 3; +static constexpr size_t kCacheSize = 5; // There will be two bacthes out of the channel during training: // 1. the one waiting to be sent to the channel // 2. the one just be received from the channel, which is also being used by // subsequent operators. // So the channel size should be kChacheSize - 2 -static constexpr size_t kChannelSize = 1; // kCacheSize - 2 +static constexpr size_t kChannelSize = 3; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: explicit DoubleBufferReader( - ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) + const std::shared_ptr& reader, + platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { cpu_tensor_cache_.resize(kCacheSize); gpu_tensor_cache_.resize(kCacheSize); diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 249b0b7c6dbc8b8104bce95562e6e9b2a28c77f8..19b54110b9aeece33b8d6c73612ae0e12dbfafbd 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -21,7 +21,7 @@ namespace reader { class MultiPassReader : public framework::DecoratedReader { public: - MultiPassReader(ReaderBase* reader, int pass_num) + MultiPassReader(const std::shared_ptr& reader, int pass_num) : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} void ReadNext(std::vector* out) override { diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..36587360f7347a10e01d4e994482027d9a9bb5d0 --- /dev/null +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyReader : public framework::ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue) { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; + } + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override {} + + private: + std::shared_ptr queue_; +}; + +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE( + queue_holder_var != nullptr, + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name); + auto* queue_holder = + queue_holder_var->template GetMutable(); + + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; + +class CreatePyReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddInput("blocking_queue", + "Name of the `LoDTensorBlockingQueueHolder` variable"); + + AddComment(R"DOC( + Create PyReader to support LoDTensor data feeding in Python side. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp, + reader::CreatePyReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 282ec3f36b98e7aa62d71fb04f72721a5464e21c..559827f08494af6730aafa1e67c46a47c21dedf6 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { protected: void Apply() override { - AddAttr("filename", "The filename of record io reader"); + AddAttr( + "filename", + "The filename of record file. This file will given to reader."); AddComment(R"DOC( - CreateRecordIOReader Operator +Open a recordio file and return the reader object. The returned reader object +is thread-safe. - Create a reader from a record io file +NOTE: This is a very low-level API. It is used for debugging data file or +training. Please use `open_files` instead of this API for production usage. )DOC"); } }; diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index fd233be945932eee9f9a3c0c578a43d5b7cc83aa..57e8e21214b7c99e52550fe51a67c9b5201cb46f 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -23,7 +23,8 @@ namespace reader { class ShuffleReader : public framework::DecoratedReader { public: - ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0) + ShuffleReader(const std::shared_ptr& reader, size_t buffer_size, + size_t seed = 0) : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { VLOG(10) << "Create shuffle reader of " << reader_; if (seed_ == 0) { diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 1db70f3e9699dba604569c36dc35025dfe2c94fe..3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -21,7 +21,8 @@ namespace reader { class ThreadedReader : public framework::DecoratedReader { public: - explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {} + explicit ThreadedReader(const std::shared_ptr& reader) + : DecoratedReader(reader) {} void ReadNext(std::vector* out) override { std::lock_guard lock(mutex_); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..30d962ba10a954a837f9771d21cedf0feb643439 --- /dev/null +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace reader { + +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + + private: + LoDTensorBlockingQueue(size_t capacity, + const std::vector& dims) + : queue_(capacity), dims_(dims) {} + + public: + bool Push(const std::vector& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_.Send(lod_tensor_vec); + } + + bool Push(std::vector&& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_.Send(std::move(lod_tensor_vec)); + } + + std::vector Pop(bool* ok = nullptr) { + std::vector lod_tensor_vec; + bool success = queue_.Receive(&lod_tensor_vec); + if (ok != nullptr) *ok = success; + return lod_tensor_vec; + } + + inline size_t Cap() const { return queue_.Cap(); } + + inline size_t Size() const { return queue_.Size(); } + + inline void Close() { return queue_.Close(); } + + inline bool IsClosed() const { return queue_.IsClosed(); } + + private: + void CheckDims(const std::vector& lod_tensor_vec) { + PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), + "Expect input size is %d but found %s", dims_.size(), + lod_tensor_vec.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + const auto& in_dims = framework::slice_ddim( + lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size()); + const auto& expect_dims = + framework::slice_ddim(dims_[i], 1, dims_[i].size()); + PADDLE_ENFORCE(in_dims == expect_dims, + "Dims of the %d-th input tensor do not match", i); + } + } + + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE( + queue_ == nullptr, + "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + inline const std::shared_ptr& GetQueue() const { + return queue_; + } + + private: + std::shared_ptr queue_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 612e1f5eca3a4836db1fd167fc6bb63400d20177..e11256a49ffa6adc9410376cc8a71fa017df7e9c 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -54,7 +54,7 @@ std::unique_ptr CreateReaderByFileName( } void FileReaderMakerBase::Make() { - AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable(); + AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable(); AddAttr>("shape_concat", "The concat of all data's shapes."); AddAttr>( "ranks", diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 9c1cee7022a9b9a98f026f7602f0f7badc44a49b..162bfcbb0844d29385d0f8ad5d25a3f8de6bd41b 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + {{"Out", {pg_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); sum_op->Run(cur_scope, place); cur_scope.Rename(new_inside_name, inside_grad_name); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index d8ddb7b448910b5e0e6e71742eb2fdc6a225c919..9854a31f5b10f5ecd940c0d41c2c3e468fc17bad 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -44,14 +43,15 @@ class RecvOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); } if (sync_mode) { - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } } }; @@ -77,9 +77,15 @@ This operator can get variables from server side. } }; +class RecvOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); +REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker, + ops::RecvOpMaker, ops::RecvOpShapeInference); diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..95d3768e1fdf6947659c7b3a1c9d57fad741472a --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_max); +REGISTER_OP_CPU_KERNEL( + reduce_max, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d86b3127e42f7ee14ba57b1c762e8128a0f2d54 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_max, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc258c2496340b47d24dc89f16f7419dbb4b0d95 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_REDUCE_OP(reduce_mean); +REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1359679c4767d2032bf3e3a90849ad2a2ef3e829 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec59f3e71c1c702655a3feed10935b2f5a29d8a8 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_max_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..330a86d2e4237a10d8cf6fd40025540edf08d897 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_min); +REGISTER_OP_CPU_KERNEL( + reduce_min, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..da466f805eff4709dc23471baef03e94052ee6c1 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_min, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc deleted file mode 100644 index e293fd5e410b2a34b3c71ea674607ba9d7654535..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_op.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; - -class ReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReduceOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReduceOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim( - "Out", framework::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - auto out_dims = framework::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } -}; - -class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. - -)DOC", - GetOpType(), GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \ - paddle::framework::DefaultGradOpDescMaker); \ - REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp) - -REGISTER_REDUCE_OP(sum); -REGISTER_REDUCE_OP(mean); -REGISTER_REDUCE_OP(max); -REGISTER_REDUCE_OP(min); -REGISTER_REDUCE_OP(prod); - -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL(reduce_type, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu deleted file mode 100644 index ae29587f55847315b1d84f1344677e753fe01a9b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_op.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/fluid/operators/reduce_op.h" - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type, ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index 7df47f316c30b9eb2644677681b91023e1838548..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -14,105 +14,20 @@ limitations under the License. */ #pragma once +#include +#include #include -#include "glog/logging.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/reduce_op_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; -template -using EigenTensor = framework::EigenTensor; -template -using EigenScalar = framework::EigenScalar; -template -using EigenVector = framework::EigenVector; - -struct SumFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); - } -}; - -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -struct MaxFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->maximum(dim); - } -}; - -struct MinFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->minimum(dim); - } -}; - -struct MaxOrMinGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - // If there are multiple minimum or maximum elements, the subgradient of - // each is the set [0, 1], and we pass gradient to all of them here. - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); - } -}; - -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - -struct ProdGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); - } -}; - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - ReduceCompute(context); \ +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + dims, keep_dim); \ } template @@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + if (reduce_all) { // Flatten and reduce 1-D tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input); auto out = EigenScalar::From(*output); auto& place = @@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel { Functor functor; functor(place, &x, &out, reduce_dim); } else { - int ndim = context.Input("X")->dims().size(); - int rdim = context.Attr>("dim").size(); + int ndim = input->dims().size(); + int rdim = dims.size(); // comments for accelerating compiling temporarily. // HANDLE_DIM(6, 5); // HANDLE_DIM(6, 4); @@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel { HANDLE_DIM(1, 1); } } - - private: - template - void ReduceCompute(const framework::ExecutionContext& context) const { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - auto x = EigenTensor::From(*input); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto reduce_dim = Eigen::array(); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduce_dim[i] = dims[i]; - } - // construct the squeezed output tensor - bool keep_dim = context.Attr("keep_dim"); - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = vectorize(out_dims); - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = framework::make_ddim(dims_vector); - } - auto& place = - *context.template device_context().eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } - } }; template @@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + if (reduce_all) { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); auto x_reduce_grad = EigenVector::From(*input2); @@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel { functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { - int rank = context.Input("X")->dims().size(); + int rank = input0->dims().size(); switch (rank) { case 1: - ReduceGradCompute<1>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 2: - ReduceGradCompute<2>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 3: - ReduceGradCompute<3>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 4: - ReduceGradCompute<4>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 5: - ReduceGradCompute<5>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 6: - ReduceGradCompute<6>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; } } } +}; - private: - template - void ReduceGradCompute(const framework::ExecutionContext& context) const { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - output->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(*input0); - auto x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto x_dims = input0->dims(); - auto reduced_dims_v = vectorize(x_dims); - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - int broad_cats_times = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduced_dims_v[dims[i]] = 1; - broadcast_dim[dims[i]] = x_dims[dims[i]]; - broad_cats_times *= x_dims[dims[i]]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } - auto reduced_dims = framework::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(*input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(*input2, reduced_dims); + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +%s Operator. - auto& place = - *context.template device_context().eigen_device(); +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. - Functor functor; - functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); +)DOC", + GetOpType(), GetName())); } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; }; } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - __macro(reduce_sum, SumFunctor, SumGradFunctor); \ - __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ - __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_prod, ProdFunctor, ProdGradFunctor); +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_OP(op_name) \ + class __##op_name##Maker__ : public ops::ReduceOpMaker { \ + protected: \ + virtual std::string GetName() const { return #op_name; } \ + virtual std::string GetOpType() const { return "Reduce " #op_name; } \ + }; \ + REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ + paddle::framework::DefaultGradOpDescMaker); \ + REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h new file mode 100644 index 0000000000000000000000000000000000000000..3da27bc8ac8d448471b9ff3779ac6aca59fac523 --- /dev/null +++ b/paddle/fluid/operators/reduce_op_function.h @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + auto x_rank = static_cast(x.dimensions().size()); + auto reduce_dim = Eigen::array(); + std::vector dims_ref = dims; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; + reduce_dim[i] = dims_ref[i]; + } + // construct the squeezed output tensor + DDim out_dims = output->dims(); + if (keep_dim && x_rank > 1) { + const int kDelFlag = -2; + auto dims_vector = framework::vectorize(out_dims); + for (size_t i = 0; i < dims_ref.size(); ++i) { + dims_vector[dims_ref[i]] = kDelFlag; + } + dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = framework::make_ddim(dims_vector); + } + auto& place = *context.eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, &x, &out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, out_dims); + functor(place, &x, &out, reduce_dim); + } +} + +template +void ReduceGradFunctor(const DeviceContext& context, + const framework::Tensor& input0, + const framework::Tensor& input1, + const framework::Tensor& input2, + framework::Tensor* output, + const std::vector& dims) { + auto x = EigenTensor::From(input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + auto x_dims = input0.dims(); + auto reduced_dims_v = framework::vectorize(x_dims); + std::vector dims_ref = dims; + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + + int broad_cats_times = 1; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) { + dims_ref[i] = x_rank + dims_ref[i]; + } + reduced_dims_v[dims_ref[i]] = 1; + broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; + broad_cats_times *= x_dims[dims_ref[i]]; + } + auto reduced_dims = framework::make_ddim(reduced_dims_v); + auto x_reduce = EigenTensor::From(input1, reduced_dims); + auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); + + auto& place = *context.eigen_device(); + + Functor functor; + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, + broad_cats_times); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..713728b99757a6f3bb128f665d5576ac64eef8ec --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_REDUCE_OP(reduce_prod); +REGISTER_OP_CPU_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_prod_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d62e677d92cffecf629d1684026b0c7bcfec29e3 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97748113e092719aceed9d806ca6242077111532 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5b5398787b44e658b0f8390162df0e6c3006651 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_REDUCE_OP(reduce_sum); +REGISTER_OP_CPU_KERNEL( + reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e67d7e1da5f0244d2dee346873692a80cbad2fc4 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a20f7d231fa9ea313581ac0629a87fa5f4a88ce5 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reverse_op.h" +#include + +namespace paddle { +namespace operators { + +class ReverseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + const auto& axis = ctx->Attrs().Get>("axis"); + PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty."); + for (int a : axis) { + PADDLE_ENFORCE_LT(a, x_dims.size(), + "The axis must be less than input tensor's rank."); + } + ctx->SetOutputDim("Out", x_dims); + } +}; + +class ReverseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The LoDTensor to be flipped."); + AddOutput("Out", "The LoDTensor after flipping."); + AddAttr>( + "axis", "The axises that along which order of elements is reversed."); + AddComment(R"DOC( + Reverse Operator. + + Reverse the order of elements in the input LoDTensor along given axises. + + Case 1: + Given + X = [[1, 2, 3, 4, 5] + [6, 7, 8, 9, 10] + [11, 12, 13, 14, 15]], + and + axis = [0], + we get: + Out = [[11, 12, 13, 14, 15] + [6, 7, 8, 9, 10] + [1, 2, 3, 4, 5]]. + + Case 2: + Given + X = [[[1, 2, 3, 4] + [5, 6, 7, 8]] + [[9, 10, 11, 12] + [13, 14, 15, 16]]], + and + axis = [0, 2], + we get: + Out = [[[12, 11, 10, 9] + [16, 15, 14, 13]] + [[4, 3, 2, 1] + [8, 7, 6, 5]]], + )DOC"); + } +}; + +class ReverseGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("reverse"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", GetAttr("axis")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker, + ops::ReverseGradMaker); +REGISTER_OPERATOR(reverse_grad, ops::ReverseOp); +REGISTER_OP_CPU_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..635c41529b38f2dd287b00ed2e5659e11f619e78 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reverse_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9063cd59bba5c6307b55a500455908a5fd278390 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +struct ReverseFunctor { + void operator()(const DeviceContext& context, const framework::LoDTensor& in, + framework::LoDTensor* out, const std::vector& axis) { + Eigen::array reverse_axis; + for (int i = 0; i < Rank; ++i) { + reverse_axis[i] = false; + } + for (int a : axis) { + reverse_axis[a] = true; + } + + auto in_eigen = framework::EigenTensor::From(in); + auto out_eigen = framework::EigenTensor::From(*out); + auto* dev = context.eigen_device(); + + out_eigen.device(*dev) = in_eigen.reverse(reverse_axis); + } +}; + +template +class ReverseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + const auto& axis = context.Attr>("axis"); + int rank = x->dims().size(); + auto& dev_ctx = context.template device_context(); + + switch (rank) { + case 1: + ReverseFunctor functor1; + functor1(dev_ctx, *x, out, axis); + break; + case 2: + ReverseFunctor functor2; + functor2(dev_ctx, *x, out, axis); + break; + case 3: + ReverseFunctor functor3; + functor3(dev_ctx, *x, out, axis); + break; + case 4: + ReverseFunctor functor4; + functor4(dev_ctx, *x, out, axis); + break; + case 5: + ReverseFunctor functor5; + functor5(dev_ctx, *x, out, axis); + break; + case 6: + ReverseFunctor functor6; + functor6(dev_ctx, *x, out, axis); + break; + default: + PADDLE_THROW( + "Reserve operator doesn't supports tensors whose ranks are greater " + "than 6."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 293abb0ea4f1ac03c3889ce2937ef8fa0845db73..d6d209d5de041500a9b4893d70800a58e8ee1e1d 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -139,7 +139,20 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "The pooled output width.") .SetDefault(1); AddComment(R"DOC( -ROIPool operator +**ROIPool Operator** + +Region of interest pooling (also known as RoI pooling) is to perform +is to perform max pooling on inputs of nonuniform sizes to obtain +fixed-size feature maps (e.g. 7*7). + +The operator has three steps: + +1. Dividing each region proposal into equal-sized sections with + the pooled_width and pooled_height + +2. Finding the largest value in each section + +3. Copying these max values to the output buffer ROI Pooling for Faster-RCNN. The link below is a further introduction: https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 20f140f962c3aac364a1239a663d5f340bbeb6b2..10b1b0c899d833d70fa6afe51998fe210899e3c3 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(LoDTensor), the input(X) is a LodTensor, which supports " + "the input(X) is a LodTensor, which supports " "variable time-length input sequences. The underlying tensor " "in this LoDTensor is a matrix with shape (T x N), where T " "is the total time steps in this mini-batch and N is the input " "data dimension."); AddInput("Filter", - "(Tensor), the input(Filter) is a learnable parameter. It " + "the input(Filter) is a learnable parameter. It " "is a 2-D tensor with shape (future_context x N), where, " "future_context is the future context length and N is the data " "dimension."); AddOutput("Out", - "(LoDTensor), the output(Out) is a LodTensor, which supports " + "the output(Out) is a LodTensor, which supports " "variable time-length input sequences. The underlying tensor " "in this LodTensor is a matrix with shape T x N, i.e., the " "same shape as X."); AddComment(R"DOC( -Row-convolution Operator. +:strong:`Row-convolution operator` The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: @@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$, the output sequence is convolved as: $$ -out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :} $$ +In the above equation: + +* $Out_{i}$: The i-th row of output variable with shape [1, D]. + +* $\\tau$: Future context size. + +* $X_{j}$: The j-th row of input variable with shape [1, D]. + +* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D]. + +More details about row_conv please refer to +the design document +https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 . + )DOC"); } }; diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index c4fcc61af4b75e6dc7d5c31e20c5fff358637af5..ccaea0eef2906953d922e097348b6c0a86dad6f1 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -139,6 +139,7 @@ TEST(LoadFP16Op, CPU) { save_op->Run(scope, place); auto load_var = scope.Var("out_var"); + load_var->GetMutable(); auto load_op = paddle::framework::OpRegistry::CreateOp( "load", {}, {{"Out", {"out_var"}}}, attrs); load_op->Run(scope, place); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e6d27e2dedd7668b93bd8ddc330a897d1c6fa732..201a51130d6b6f94104e2dabf9e7facffa672ae0 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -22,11 +22,17 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; + // TODO(yuyang18): If the functions below are needed by other files, move them // to paddle::filesystem namespace. constexpr char kSEP = '/'; @@ -67,9 +73,27 @@ class SaveOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + auto iname = Input("X"); + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + iname); + + if (var->IsType()) { + SaveLodTensor(place, var); + } else if (var->IsType()) { + SaveSelectedRows(scope, place, var); + } else { + PADDLE_ENFORCE( + false, + "SaveOp only support LoDTensor and SelectedRows, %s has wrong type", + iname); + } + } + + void SaveLodTensor(const platform::Place &place, + framework::Variable *var) const { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); - auto save_as_fp16 = Attr("save_as_fp16"); if (FileExists(filename) && !overwrite) { PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", @@ -78,26 +102,19 @@ class SaveOp : public framework::OperatorBase { MkDirRecursively(DirName(filename).c_str()); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ofstream fout(filename); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto iname = Input("X"); - auto *var = scope.FindVar(iname); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", - iname); - - PADDLE_ENFORCE(var->IsType(), - "SaveOp only support LoDTensor, %s has wrong type", iname); - auto &tensor = var->Get(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto save_as_fp16 = Attr("save_as_fp16"); auto in_dtype = framework::ToDataType(tensor.type()); auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; @@ -112,17 +129,43 @@ class SaveOp : public framework::OperatorBase { } else { framework::SerializeToStream(fout, tensor, dev_ctx); } + fout.close(); + } + + void SaveSelectedRows(const framework::Scope &scope, + const platform::Place &place, + framework::Variable *var) const { + auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + PADDLE_ENFORCE( + lt_var != nullptr, + "Can not find variable kLookupTablePath for SaveSelectedRows"); + std::string filename = lt_var->data(); + VLOG(4) << "SaveSelectedRows get File name: " << filename; + + auto &selectedRows = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + framework::SerializeToStream(fout, selectedRows, dev_ctx); + fout.close(); } }; class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(Tensor ) Input tensor to be saved"); + AddInput("X", "(Tensor ) Input LoDTensor and SelectedRows to be saved"); AddComment(R"DOC( Save operator -This operator will serialize and write a tensor variable to file on disk. +This operator will serialize and write LoDTensor / SelectedRows variable to file on disk. )DOC"); AddAttr("overwrite", "(boolean, default true)" @@ -142,9 +185,26 @@ This operator will serialize and write a tensor variable to file on disk. } }; +class SaveOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SaveOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); +REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker, + ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference, + ops::SaveOpShapeInference); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 4687e21e7155fc7309fb28c881c0d47152df9ad5..7f8822e40053b5bcd394f446138a2292d80b69bf 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -41,13 +41,13 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) Input tensor of scale operator."); AddOutput("Out", "(Tensor) Output tensor of scale operator."); AddComment(R"DOC( -Scale operator +**Scale operator** + +Multiply the input tensor with a float scalar to scale the input tensor. $$Out = scale*X$$ )DOC"); - AddAttr("scale", - "(float, default 1.0)" - "The scaling factor of the scale operator.") + AddAttr("scale", "The scaling factor of the scale operator.") .SetDefault(1.0); } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index bcd8e81609a37cc544f5a5cc4188400c1632a668..6b4572dcccc21e783f1df0b9bcde11d532ff4ba8 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -44,18 +44,19 @@ class SendBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; // need to wait before sending send_barrier message - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); if (sync_mode) { for (auto& ep : eps) { VLOG(3) << "send barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index a5150f242ca3b0befafa2443f0bc466e2aea85e4..0cac329aafa8c4c67cae48ba62a48575f5edba92 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -16,10 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { auto ins = Inputs("X"); - auto outs = Outputs("Out"); - std::vector epmap = Attr>("epmap"); - std::vector endpoints = - Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); + std::vector epmap = Attr>("epmap"); + int sync_send = Attr("sync_mode"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); @@ -49,38 +45,21 @@ class SendOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + // TODO(Yancey1989): we need to use an IO threadpool which has + // a larger number of threads than the computing threadpool. + rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait()); - - if (sync_mode) { - for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - PADDLE_ENFORCE(rpc_client->Wait()); - } - - if (outs.size() > 0) { - for (size_t i = 0; i < outs.size(); i++) { - VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); - } - PADDLE_ENFORCE(rpc_client->Wait()); - // tell pservers that current trainer have called fetch - for (auto& ep : endpoints) { - VLOG(2) << "send fetch barrier, ep: " << ep; - rpc_client->AsyncSendFetchBarrier(ep); - } - PADDLE_ENFORCE(rpc_client->Wait()); + if (sync_send) { + rpc_client->Wait(); } } }; @@ -88,26 +67,22 @@ class SendOp : public framework::OperatorBase { class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); - AddOutput("Out", "(Tensor) Output tensor to be received from server") + AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); AddComment(R"DOC( Send operator -This operator will send tensor to recv_op at the parameter server. +This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - // TODO(typhoonzero): remove this attr generate de-duplicated vector from - // epmap when initializing. - AddAttr>("endpoints", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints to send variables to.") - .SetDefault({}); + AddAttr("sync_mode", + "(int, default 0)" + "sync send or async send.") + .SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" "Server endpoints in the order of input " "variables for mapping") - .SetDefault({}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); + .SetDefault({"127.0.0.1:6164"}); } }; diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index e550552b195b768d68ec64e9c3b5889b56ca719f..aee6180add5708d31f7ce927b37c4524a291fe3c 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); + std::vector optimize_blocks; auto *optimize_block = program.AppendBlock(root_block); + optimize_blocks.push_back(optimize_block); + auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block, @@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { attrs.insert({"Fanin", 1}); attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); - attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"optimize_blocks", optimize_blocks}); attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc deleted file mode 100644 index fe839dab6924618c8a4c39868d9bf86056a0be40..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/send_vars_op.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/send_recv_util.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { - -class SendVarsOp : public framework::OperatorBase { - public: - SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto ins = Inputs("X"); - - std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_send"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - - auto rpc_client = detail::RPCClient::GetInstance(); - - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; - } - } - if (sync_send) { - rpc_client->Wait(); - } - } -}; - -class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") - .AsDuplicable(); - AddComment(R"DOC( -Send operator - -This operator will send variables to listen_and_serve op at the parameter server. -)DOC"); - AddAttr("sync_send", - "(int, default 0)" - "sync send or async send.") - .SetDefault(0); - AddAttr>("epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input " - "variables for mapping") - .SetDefault({"127.0.0.1:6164"}); - } -}; - -class SendVarsOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(send_vars, ops::SendVarsOp, - paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, - ops::SendVarsOpShapeInference); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index d62c387c3eebf9df0ab532f4e891da006f239468..39301e1ac0971dfe0ca7854257f10ddeb60f1000 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* dx) { - math::SetConstant set_zero; - set_zero(context, dx, static_cast(0)); - int dout_offset = 0; for (size_t i = 1; i < ref_lod.size(); ++i) { int repeat_num = ref_lod[i] - ref_lod[i - 1]; @@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->mutable_data(context.GetPlace()); g_x->set_lod(x->lod()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, g_x, static_cast(0)); + auto& y_lod = y->lod(); if (ref_level == -1) ref_level = y_lod.size() - 1; // just copy the gradient diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 7a2bdeac09d61603f437ff10d58d0542bb3c3689..fef230e42d07a5ed73b7a7a6ab682694675bb9d2 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -74,7 +74,8 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Grad", "(Tensor or SelectedRows) Input gradient"); AddOutput("ParamOut", "(Tensor or SelectedRows, same with Param) " - "Output parameter, should share the same memory with Param"); + "Output parameter, should share the same memory with Param") + .Reuse("Param"); AddComment(R"DOC( SGD operator diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index c75fce7959d1af51afd52af23fe657d10a2f3988..b44d5f898013a5d27467bd80118c29a886d5e8b3 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -36,10 +36,13 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Input", "(Tensor), The input tensor."); - AddOutput("Out", "(Tensor), The shape of input tensor."); + AddOutput("Out", + "(Tensor), The shape of input tensor, the data type of the shape" + " is int64_t, will be on the same device with the input Tensor."); AddComment(R"DOC( -Shape Operator. -Get the shape of input tensor. +Shape Operator + +Get the shape of input tensor. Only support CPU input Tensor now. )DOC"); } }; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 135e2a6f7f877c9ef159a4542b834d5627649e81..c3b0fe32098cb4b41ccc155db58809ef9f1bf46b 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -113,14 +113,14 @@ The logistic loss is given as follows: $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$ -We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get: +We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get: $$loss = X - X * Labels + \log(1 + \exp(-X))$$ For stability and to prevent overflow of $$\exp(-X)$$ when X < 0, we reformulate the loss as follows: - $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$ + $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$ Both the input `X` and `Labels` can carry the LoD (Level of Details) information. However the output only shares the LoD with input `X`. diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bd23d594134f227e86b01fd75b7e202dd76c11b --- /dev/null +++ b/paddle/fluid/operators/slice_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/slice_op.h" +#include +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class SliceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input (Input) of slice op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output (Out) of slice op should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(in_dims.size() < 7, + "The rank of input should be less than 7."); + framework::DDim out_dims(in_dims); + auto axes = ctx->Attrs().Get>("axes"); + auto starts = ctx->Attrs().Get>("starts"); + auto ends = ctx->Attrs().Get>("ends"); + + PADDLE_ENFORCE_EQ(starts.size(), ends.size()); + PADDLE_ENFORCE_EQ(starts.size(), axes.size()); + int dim_value, start, end; + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = out_dims[axes[i]]; + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + start = std::min(start, dim_value); + end = std::min(end, dim_value); + start = std::min(start, end); + out_dims[axes[i]] = end - start; + } + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +class SliceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "Tensor of data to extract slices from."); + AddOutput("Out", "Sliced data tensor."); + + AddAttr>( + "axes", + "(list) Axes that `starts` and `ends` apply to. It's optional." + "If not present, will be treated as [0, 1, ..., len(`starts`) - 1]."); + AddAttr>( + "starts", + "(list) Starting indices of corresponding axis in `axes`"); + AddAttr>( + "ends", + "(list) Starting indices of corresponding axis in `axes`."); + + AddComment(R"DOC( +Slice Operator. + +Produces a slice of the input tensor along multiple axes. Similar to numpy: +https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html +Slice uses `axes`, `starts` and `ends` attributes to specify the start and +end dimension for each axis in the list of axes, it uses this information +to slice the input data tensor. If a negative value is passed for any of +the start or end indices, it represents number of elements before the end +of that dimension. If the value passed to start or end is larger than +the n (the number of elements in this dimension), it represents n. +For slicing to the end of a dimension with unknown size, it is recommended +to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1]. +Following examples will explain how slice works: + + .. code-block:: text + + Cast1: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [1, 0] + ends = [2, 3] + Then: + result = [ [5, 6, 7], ] + + Cast2: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + starts = [0, 1] + ends = [-1, 1000] + Then: + result = [ [2, 3, 4], ] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c1767c70b19d1386af9610ef3405eb487a39878 --- /dev/null +++ b/paddle/fluid/operators/slice_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/slice_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ba231aee176564b91a642912ce0b32bcdef8cfc1 --- /dev/null +++ b/paddle/fluid/operators/slice_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SliceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int rank = ctx.Input("Input")->dims().size(); + switch (rank) { + case 1: + SliceCompute<1>(ctx); + break; + case 2: + SliceCompute<2>(ctx); + break; + case 3: + SliceCompute<3>(ctx); + break; + case 4: + SliceCompute<4>(ctx); + break; + case 5: + SliceCompute<5>(ctx); + break; + case 6: + SliceCompute<6>(ctx); + break; + } + } + + private: + template + void SliceCompute(const framework::ExecutionContext& context) const { + auto& place = + *context.template device_context().eigen_device(); + auto in = context.Input("Input"); + auto out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + auto out_dims = out->dims(); + auto in_dims = in->dims(); + auto axes = context.Attr>("axes"); + auto starts = context.Attr>("starts"); + + auto offsets = Eigen::array(); + auto extents = Eigen::array(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = out_dims[i]; + } + int start; + for (size_t i = 0; i < axes.size(); ++i) { + start = starts[i]; + if (start < 0) { + start = (start + in_dims[axes[i]]); + } + start = std::max(start, 0); + offsets[axes[i]] = start; + } + auto in_t = + framework::EigenTensor::From( + *in); + auto out_t = + framework::EigenTensor::From( + *out); + out_t.device(place) = in_t.slice(offsets, extents); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 14b57b11fefb2b726531cb164dbf479f8df26b24..6668e6b9e917eea7ba4a80ac78917b73eb827208 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -27,8 +27,81 @@ using paddle::platform::MKLDNNMemDesc; using mkldnn::memory; // Note: paddle has also "memory" namespace using mkldnn::primitive; using mkldnn::softmax_forward; +using mkldnn::softmax_backward; using mkldnn::prop_kind; using mkldnn::stream; +using platform::to_void_cast; + +class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { + public: + SoftmaxMKLDNNHandler( + std::shared_ptr softmax_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + softmax_pd_(softmax_pd) {} + + SoftmaxMKLDNNHandler( + std::shared_ptr softmax_pd, + std::shared_ptr softmax_bwd_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + softmax_pd_(softmax_pd), + softmax_bwd_pd_(softmax_bwd_pd) { + // If we are in Grad operatgor then update a key with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + std::shared_ptr AcquireSoftmax( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + /*Generate key*/ + auto prim_key = key_ + "@softmax_p"; + + auto softmax_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax primitive in device context"); + if (softmax_p == nullptr) { + softmax_p = std::make_shared( + *(softmax_pd_.get()), + *(static_cast(src_memory_p.get())), + *(static_cast(dst_memory_p.get()))); + dev_ctx_.SetBlob(prim_key, softmax_p); + } else { + is_reusing_ = true; + } + + return softmax_p; + } + + std::shared_ptr AcquireSoftmaxBackward( + std::shared_ptr dst_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_src_memory_p) { + auto prim_key = key_ + "@softmax_bwd_p"; + auto softmax_bwd_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax backward primitive in device context"); + if (softmax_bwd_p == nullptr) { + softmax_bwd_p = std::make_shared( + *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()), + *(diff_src_memory_p.get())); + dev_ctx_.SetBlob(prim_key, softmax_bwd_p); + } else { + is_reusing_ = true; + } + + return softmax_bwd_p; + } + + private: + std::shared_ptr softmax_pd_; + std::shared_ptr softmax_bwd_pd_; +}; template class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { @@ -54,56 +127,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function - auto gethash = [](memory::dims& operand_dims) { - return std::string(std::to_string(operand_dims[0]) + "-" + - std::to_string(operand_dims[1])); - }; - const std::string key = gethash(softmax_tz); - const std::string key_softmax_p = key + "@softmax_p"; - const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p"; - const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p"; - - std::shared_ptr softmax_p = dev_ctx.GetBlob(key_softmax_p); - if (softmax_p == nullptr) { - // Currently only NC data format is supported - auto softmax_md = - MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc); - // Normalization is made after innermost dimension eg. C out of NC - auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, - softmax_md, 1 /*dim: C*/); - // create memory primitives - auto softmax_src_memory_p = std::make_shared( - memory::primitive_desc{softmax_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p); - auto softmax_dst_memory_p = std::make_shared( - memory::primitive_desc{softmax_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p); - - auto softmax_forward_pd = - std::make_shared(softmax_desc, - mkldnn_engine); - softmax_p = std::make_shared( - *(softmax_forward_pd.get()), - *(static_cast(softmax_src_memory_p.get())), - *(static_cast(softmax_dst_memory_p.get()))); - dev_ctx.SetBlob(key_softmax_p, softmax_p); - } else { - // Primitives already exist - auto src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_src_mem_p)); - PADDLE_ENFORCE(src_memory_p != nullptr, - "Fail to find softmax src mem_p in device context"); - auto dst_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_dst_mem_p)); - PADDLE_ENFORCE(dst_memory_p != nullptr, - "Fail to find softmax dst mem_p in device context"); - src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); - dst_memory_p->set_data_handle(output_data); - } + const std::string key = + platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out")); + const std::string key_softmax_pd = key + "@softmax_pd"; + + // Currently only NC data format is supported + auto softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + // Normalization is made after innermost dimension eg. C out of NC + auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, + softmax_md, 1 /*dim: C*/); + auto softmax_pd = std::make_shared( + softmax_desc, mkldnn_engine); + dev_ctx.SetBlob(key_softmax_pd, softmax_pd); + + SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key); + auto softmax_src_memory_p = + handler.AcquireSrcMemory(softmax_md, to_void_cast(input_data)); + auto softmax_dst_memory_p = + handler.AcquireDstMemory(softmax_md, to_void_cast(output_data)); + auto softmax_p = + handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); std::vector pipeline{ *(static_cast(softmax_p.get()))}; @@ -120,6 +164,77 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { } }; +template +class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + const Tensor* output = ctx.Input("Out"); + const T* dst_data = output->data(); + + auto* dout = ctx.template Input(framework::GradVarName("Out")); + const auto* diff_dst_ptr = dout->template data(); + + auto* dx = + ctx.template Output(framework::GradVarName("X")); + T* diff_src_ptr = dx->template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + std::vector src_tz(dst_tz); + PADDLE_ENFORCE(output->dims().size() == 2UL, + "The input of softmax op must be a 2D matrix."); + // MKL-DNN does support softmax over selected axis. Having 2D Tensor, + // we will make normalization after final eg. axis: 1 + PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), + "Softmax input and output dimensions should match"); + // Same memory descriptor to be used for input and output + memory::dims softmax_tz = {src_tz[0], src_tz[1]}; + // Currently only supports NC data format + // retrieve eltwise primitive desc from device context + const std::string key = + platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out")); + const std::string key_softmax_pd = key + "@softmax_pd"; + + auto softmax_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_softmax_pd)); + PADDLE_ENFORCE(softmax_pd != nullptr, + "Fail to find softmax_pd in device context"); + + // TODO(jczaja): Add layouts support when there is a need to do so + // Two dimensional softmax does support NC format + auto data_softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + auto diff_softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + // Normalization is made after innermost dimension eg. C out of NC + auto softmax_bwd_desc = + softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/); + auto softmax_bwd_pd = + std::make_shared( + softmax_bwd_desc, mkldnn_engine, *softmax_pd); + + SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx, + mkldnn_engine, key); + auto dst_memory_p = + handler.AcquireDstMemory(data_softmax_md, to_void_cast(dst_data)); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory( + diff_softmax_md, to_void_cast(diff_dst_ptr)); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory( + diff_softmax_md, to_void_cast(diff_src_ptr)); + + // Get primitve from device context + auto softmax_bwd_p = handler.AcquireSoftmaxBackward( + dst_memory_p, diff_dst_memory_p, diff_src_memory_p); + + std::vector pipeline{*softmax_bwd_p}; + stream(stream::kind::eager).submit(pipeline).wait(); + } +}; } // namespace operators } // namespace paddle @@ -127,3 +242,5 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace, ops::SoftmaxMKLDNNKernel); +REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::SoftmaxMKLDNNGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cc256aa627bdda0609f496cab93a2dec7d95f348..31a7458f637921c290fc71ac748143867b4aae19 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { "float16 can only be used on GPU place"); } - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::StringToDataLayout(data_format), + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } }; @@ -81,7 +83,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of softmax. " "2-D with shape [batch_size, input_feature_dimensions]."); - AddOutput("Out", "The normalized values with the same shape as X."); + AddOutput("Out", "The normalized values with the same shape as X.") + .Reuse("X"); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") @@ -142,15 +145,30 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::StringToDataLayout(data_format), library_); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "float16 can only be used on GPU place"); + } + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, + library_); } }; diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 5e2b2a994534c2fb1e053c067b36651d358b9da8..d661b276bc31bf0c3ab181d706ffdccec89f0632 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -115,4 +115,7 @@ USE_CPU_ONLY_OP(concat); REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker); REGISTER_OP_CPU_KERNEL(split, - ops::SplitOpKernel); + ops::SplitOpKernel, + ops::SplitOpKernel, + ops::SplitOpKernel, + ops::SplitOpKernel); diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc index efa378af857a8881f25c76379ba7cf81e64c80bb..18e0904681753aff7f3deac96efb6d62f389a031 100644 --- a/paddle/fluid/operators/split_op.cu.cc +++ b/paddle/fluid/operators/split_op.cu.cc @@ -15,4 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/split_op.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - split, ops::SplitOpKernel); + split, ops::SplitOpKernel, + ops::SplitOpKernel, + ops::SplitOpKernel, + ops::SplitOpKernel); diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f78d977760f18c9eb1270e515e68acb208a7c9a4 --- /dev/null +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*Licensed under the Apache License, Version 2.0(the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/operators/sum_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::CPUDeviceContext; +using framework::DataLayout; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::stream; +using mkldnn::sum; +using mkldnn::reorder; +using platform::to_void_cast; + +template +class SumMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + auto in_vars = ctx.MultiInputVar("X"); + + const int N = in_vars.size(); + auto out_var = ctx.OutputVar("Out"); + bool in_place = out_var == in_vars[0]; + + if (out_var->IsType()) { + LoDTensor* output = ctx.Output("Out"); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector dst_tz = framework::vectorize2int(output->dims()); + auto src_tz = dst_tz; + memory::format output_format{memory::format::format_undef}; + std::vector scales; + std::vector srcs_mpd; + std::vector srcs_mem; + + PADDLE_ENFORCE(in_vars[0]->IsType(), + "Input[0] must be LoDTensors"); + auto& input0 = in_vars[0]->Get(); + PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN && + input0.format() != memory::format::format_undef, + "Wrong layout/format for inputs[0]"); + + memory::format input_format = input0.format(); + + if (src_tz.size() == 1 && (input_format == memory::format::nchw || + input_format == memory::format::nhwc)) { + input_format = memory::format::x; + } + if (src_tz.size() == 2 && (input_format == memory::format::nchw || + input_format == memory::format::nhwc)) { + input_format = memory::format::nc; + } + + for (int i = in_place ? 1 : 0; i < N; i++) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "all inputs must be all LoDTensors"); + auto& input = in_vars[i]->Get(); + PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN && + input.format() != memory::format::format_undef, + "Wrong layout/format for inputs"); + + if (input.numel() == 0) { + continue; + } + + const T* input_data = input.data(); + + auto src_md = + memory::desc(src_tz, memory::data_type::f32, input_format); + auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine); + auto src_mem = memory(src_mpd, to_void_cast(input_data)); + srcs_mpd.push_back(src_mpd); + srcs_mem.push_back(src_mem); + scales.push_back(1.0); + } + + auto dst_md = + memory::desc(dst_tz, memory::data_type::f32, memory::format::any); + + auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); + + std::shared_ptr dst_mem; + if (in_place) { + dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); + } else { + dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); + } + std::vector inputs; + for (size_t i = 0; i < srcs_mem.size(); ++i) { + inputs.push_back(srcs_mem[i]); + } + + auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem); + output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd); + + primitive reorder_prim; + std::shared_ptr target_mem; + if (in_place) { + output_format = input_format; + target_mem.reset(new memory( + {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine}, + output_data)); + reorder_prim = reorder(*dst_mem, *target_mem); + } + + std::vector pipeline; + pipeline.push_back(sum_prim); + if (in_place) pipeline.push_back(reorder_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); + } else if (out_var->IsType()) { + // TODO(@mozga-intel) Add MKLDNN SelectedRows support + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto& in_sel0 = in_vars[0]->Get(); + auto& rows = in_sel0.rows(); + in0.reset(new framework::SelectedRows(rows, in_sel0.height())); + in0->mutable_value()->ShareDataWith(in_sel0.value()); + } + + auto get_selected_row = [&](size_t i) -> const SelectedRows& { + if (i == 0 && in0) { + return *in0.get(); + } else { + return in_vars[i]->Get(); + } + }; + auto* out = ctx.Output("Out"); + out->mutable_rows()->clear(); + auto* out_value = out->mutable_value(); + + // Runtime InferShape + size_t first_dim = 0; + for (int i = 0; i < N; i++) { + auto& sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); + } + auto in_dim = + framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); + + out_value->Resize(framework::make_ddim(in_dim)); + + // if all the input sparse vars are empty, no need to + // merge these vars. + if (first_dim == 0UL) { + return; + } + out_value->mutable_data(ctx.GetPlace()); + math::SelectedRowsAddTo functor; + int64_t offset = 0; + for (int i = 0; i < N; i++) { + auto& sel_row = get_selected_row(i); + if (sel_row.rows().size() == 0) { + continue; + } + PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); + functor(ctx.template device_context(), sel_row, + offset, out); + offset += sel_row.value().numel(); + } + } else if (out_var->IsType()) { + // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support + auto& out_array = *out_var->GetMutable(); + for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "Only support all inputs are TensorArray"); + auto& in_array = in_vars[i]->Get(); + + for (size_t i = 0; i < in_array.size(); ++i) { + if (in_array[i].numel() != 0) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (out_array[i].numel() == 0) { + framework::TensorCopy(in_array[i], in_array[i].place(), + ctx.device_context(), &out_array[i]); + out_array[i].set_lod(in_array[i].lod()); + } else { + PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); + auto in = EigenVector::Flatten(in_array[i]); + auto result = EigenVector::Flatten(out_array[i]); + result.device(*ctx.template device_context() + .eigen_device()) = result + in; + } + } + } + } + } else { + PADDLE_THROW("Unexpected branch, output variable type is %s", + out_var->Type().name()); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::SumMKLDNNOpKernel); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index bcc5e22d4a77349e7cde9a43b83f23d4c867d994..fe7c7039c7dec714e265ede1b7167fd800ddc2f7 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { using framework::Tensor; @@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); + + framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + if (x_vars[0]->IsType()) { int dtype = -1; for (auto& x_var : x_vars) { @@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel { "Sum operator should have at least one tensor"); return framework::OpKernelType( - static_cast(dtype), - ctx.device_context()); + static_cast(dtype), ctx.GetPlace(), + layout, library); } else if (x_vars[0]->IsType()) { for (auto& var : x_vars) { auto& value = var->Get().value(); if (value.IsInitialized()) { return framework::OpKernelType(framework::ToDataType(value.type()), - ctx.device_context()); + ctx.device_context(), layout, library); } } // if input sparse vars are not initialized, use an default kernel type. return framework::OpKernelType(framework::proto::VarType::FP32, - ctx.device_context()); + ctx.device_context(), layout, library); } else if (x_vars[0]->IsType()) { for (auto& x_var : x_vars) { auto& array = x_var->Get(); for (auto& each : array) { if (each.numel() != 0) { return framework::OpKernelType(framework::ToDataType(each.type()), - ctx.device_context()); + ctx.device_context(), layout, + library); } } } @@ -115,7 +132,10 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); - AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Sum operator. @@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference { framework::BlockDesc* block) const override { auto& inputs = op_desc.Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; - for (auto& name : op_desc.Input("X")) { VLOG(10) << name << " " << block->FindRecursiveOrCreateVar(name).GetType(); @@ -206,6 +225,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); + REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, ops::SumKernel, diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc index c703d11eeccf8418250f00c801f47418ee9c85ae..a2d44284e9de1ace42cabbce82e0b45929432d7b 100644 --- a/paddle/fluid/operators/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } + auto *out_tensor = &out->at(offset); + out_tensor->set_lod(x_tensor.lod()); if (x_tensor.memory_size() > 0) { - auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); TensorCopy(x_tensor, place, dev_ctx, out_tensor); - out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 855157e7c4c5c4a43091d28d3a5414e6e386b727..647cfc0a0af2be85e2868c6f68cab962c6631a8d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -14,27 +14,107 @@ #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/operators/tensorrt_engine_op.h" +#include +#include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { namespace operators { +using inference::Singleton; +using inference::tensorrt::TRT_EngineManager; + +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + + switch (shape.size()) { + case 2: + return nvinfer1::Dims2(shape[0], shape[1]); + case 3: + return nvinfer1::Dims3(shape[0], shape[1], shape[2]); + case 4: + return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); + default: + return nvinfer1::Dims(); + } + return nvinfer1::Dims(); +} + +} // namespace + template -void paddle::operators::TensorRTEngineKernel::Prepare( +void TensorRTEngineKernel::Prepare( const framework::ExecutionContext &context) const { + VLOG(4) << "Prepare engine"; // Get the ProgramDesc and pass to convert. - const auto &block = context.Attr("subgraph"); - max_batch_ = context.Attr("max_batch"); + framework::proto::BlockDesc block_desc; + block_desc.ParseFromString(context.Attr("subgraph")); + int max_batch = context.Attr("max_batch"); auto max_workspace = context.Attr("max_workspace"); - engine_.reset(new inference::tensorrt::TensorRTEngine( - max_batch_, max_workspace, nullptr)); - // TODO(Superjomn) parameters should be passed after analysised from outside. + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto ¶m : params) { + parameters.insert(param); + } + + // TODO(Superjomn) replace this with a different stream + auto *engine = Singleton::Global().Create( + max_batch, max_workspace, nullptr /*engine hold its own stream*/, + context.Attr("engine_uniq_key")); + engine->InitNetwork(); + + framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + // Add inputs + VLOG(4) << "declare inputs"; + for (auto &input : context.Inputs("Xs")) { + VLOG(4) << "declare input " << input; + auto *var = block.FindVar(input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto shape = var->GetShape(); + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(var->GetShape())); + } + inference::Singleton::Global().ConvertBlock( - block, {}, context.scope(), engine_.get()); - engine_->FreezeNetwork(); + block_desc, parameters, context.scope(), engine); + + // Add outputs + VLOG(4) << "declare outputs"; + for (auto &output : context.Outputs("Ys")) { + VLOG(4) << "declare output " << output; + engine->DeclareOutput(output); + } + + engine->FreezeNetwork(); } class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,7 +122,10 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); - AddAttr("subgraph", "the subgraph"); + AddAttr("subgraph", "the subgraph."); + AddAttr("engine_uniq_key", "unique key for the TRT engine."); + AddAttr("max_batch", "the maximum batch size."); + AddAttr("max_workspace", "the maximum batch size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index fe273d386c529be3df05a955f492e2c39d4d8812..295d6ba0395b68cabab3bd4117cedd912df48f5d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -16,6 +16,9 @@ #ifdef PADDLE_WITH_CUDA +#include +#include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -23,6 +26,9 @@ namespace paddle { namespace operators { +using inference::Singleton; +using inference::tensorrt::TRT_EngineManager; + class TensorRTEngineOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -32,9 +38,12 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { + auto input0 = ctx.Inputs("Xs").front(); framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType( - ctx.Input("pre_ids")->type()), + framework::ToDataType(ctx.scope() + .FindVar(input0) + ->GetMutable() + ->type()), platform::CPUPlace()); return kt; } @@ -44,38 +53,39 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - if (!engine_) { + auto engine_name = context.Attr("engine_uniq_key"); + if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); } + auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); // Try to determine a batch_size - auto* tensor0 = context.Input(input_names.front()); - PADDLE_ENFORCE_NOT_NULL(tensor0); - int batch_size = tensor0->dims()[0]; - PADDLE_ENFORCE_LE(batch_size, max_batch_); + auto& tensor0 = inference::analysis::GetFromScope( + context.scope(), input_names.front()); + int batch_size = tensor0.dims()[0]; + PADDLE_ENFORCE_LE(batch_size, context.Attr("max_batch")); // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { // convert input and copy to TRT engine's buffer - auto* v = context.scope().FindVar(x); - PADDLE_ENFORCE_NOT_NULL(v, "no variable called %s", x); - auto& t = v->Get(); + auto& t = inference::analysis::GetFromScope( + context.scope(), x); if (platform::is_cpu_place(t.place())) { - engine_->SetInputFromCPU(x, static_cast(t.data()), - t.memory_size()); + engine->SetInputFromCPU(x, static_cast(t.data()), + t.memory_size()); } else { - engine_->SetInputFromGPU(x, static_cast(t.data()), - t.memory_size()); + engine->SetInputFromGPU(x, static_cast(t.data()), + t.memory_size()); } } // Execute the engine. PADDLE_ENFORCE_GT(batch_size, 0); - engine_->Execute(batch_size); + engine->Execute(batch_size); // Convert output tensor from engine to fluid for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine_->GetITensor(y); + nvinfer1::ITensor* trt_t = engine->GetITensor(y); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. std::vector ddim(dims.d, dims.d + dims.nbDims); @@ -86,22 +96,23 @@ class TensorRTEngineKernel : public framework::OpKernel { fluid_t->Resize(framework::make_ddim(ddim)); auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); if (platform::is_cpu_place(fluid_t->place())) { - engine_->GetOutputInCPU( - y, fluid_t->mutable_data(platform::CPUPlace()), size); + // TODO(Superjomn) change this float to dtype size. + engine->GetOutputInCPU( + y, fluid_t->mutable_data(platform::CPUPlace()), + size * sizeof(float)); } else { - engine_->GetOutputInGPU( - y, fluid_t->mutable_data(platform::CUDAPlace()), size); + engine->GetOutputInGPU( + y, fluid_t->mutable_data(platform::CUDAPlace()), + size * sizeof(float)); } } + + cudaStreamSynchronize(*engine->stream()); } protected: // Build the engine. void Prepare(const framework::ExecutionContext& context) const; - - private: - mutable std::unique_ptr engine_; - mutable int max_batch_{0}; }; } // namespace operators diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..358e2d151bb8f990503ea8a51ba5f81e0a1dc816 --- /dev/null +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -0,0 +1,248 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +USE_CPU_ONLY_OP(tensorrt_engine); + +namespace paddle { +namespace operators { + +namespace { +void CreateCPUTensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { + auto* var = scope->Var(name); + auto* tensor = var->GetMutable(); + auto dims = framework::make_ddim(shape); + tensor->Resize(dims); + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + inference::tensorrt::RandomizeTensor(tensor, place, ctx); +} + +void AddTensorToBlockDesc(framework::proto::BlockDesc* block, + const std::string& name, + const std::vector& shape) { + using framework::proto::VarType; + auto* var = block->add_vars(); + framework::VarDesc desc(name); + desc.SetType(VarType::LOD_TENSOR); + desc.SetDataType(VarType::FP32); + desc.SetShape(shape); + *var = *desc.Proto(); +} + +template +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const T& data); + +template <> +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const std::string& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s(data); +} +template <> +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const int& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(data); +} +template <> +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const int64_t& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONG); + attr->set_l(data); +} +template <> +void SetAttr>(framework::proto::OpDesc* op, + const std::string& name, + const std::vector& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + for (const auto& s : data) { + attr->add_strings(s.c_str()); + } +} + +} // namespace + +TEST(TensorRTEngineOp, manual) { + framework::ProgramDesc program; + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + LOG(INFO) << "create mul op"; + auto* mul = block_desc.AppendOp(); + mul->SetType("mul"); + mul->SetInput("X", std::vector({"x"})); // 2 x 4 + mul->SetInput("Y", std::vector({"y"})); // 4 x 6 + mul->SetOutput("Out", std::vector({"z"})); // 2 x 6 + + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({"z"})); + fc->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc->SetOutput("Out", std::vector({"z0"})); // 2 x 8 + + // Set inputs' variable shape in BlockDesc + AddTensorToBlockDesc(block_, "x", std::vector({2, 4})); + AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); + AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); + AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); + + // It is wired, need to copy manually. + *block_->add_ops() = *mul->Proto(); + *block_->add_ops() = *fc->Proto(); + + ASSERT_EQ(block_->ops_size(), 2); + + LOG(INFO) << "create tensorrt desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("tensorrt_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x", "y", "y0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + SetAttr(engine_op_desc.Proto(), "subgraph", + block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch", 100); + SetAttr(engine_op_desc.Proto(), "max_workspace", 1 << 10); + SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); + SetAttr>(engine_op_desc.Proto(), "parameters", + std::vector({})); + + LOG(INFO) << "create engine op"; + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + LOG(INFO) << "engine_op " << engine_op.get(); + + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + // Prepare variables. + CreateCPUTensor(&scope, "x", std::vector({2, 4})); + CreateCPUTensor(&scope, "y", std::vector({4, 6})); + CreateCPUTensor(&scope, "z", std::vector({2, 6})); + + CreateCPUTensor(&scope, "y0", std::vector({6, 8})); + CreateCPUTensor(&scope, "z0", std::vector({2, 8})); + + // Execute them. + LOG(INFO) << "engine_op run"; + engine_op->Run(scope, place); +} + +void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + using shape_t = std::vector; + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + + auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name, + const std::string& z_name, bool x_created, + const shape_t& x_shape, const shape_t& y_shape, + const shape_t& z_shape) { + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({x_name})); + fc->SetInput("Y", std::vector({y_name})); + fc->SetOutput("Out", std::vector({z_name})); + + // Set inputs' variable shape in BlockDesc + if (!x_created) { + AddTensorToBlockDesc(block_, x_name, + std::vector({batch_size, input_dim, 1, 1})); + } + AddTensorToBlockDesc(block_, y_name, + std::vector({input_dim, output_dim})); + AddTensorToBlockDesc(block_, z_name, + std::vector({batch_size, output_dim})); + + // Prepare variables. + if (!x_created) { + CreateCPUTensor(&scope, x_name, std::vector(x_shape)); + } + CreateCPUTensor(&scope, y_name, std::vector(y_shape)); + CreateCPUTensor(&scope, z_name, std::vector(z_shape)); + + // It is wired, need to copy manually. + *block_->add_ops() = *fc->Proto(); + }; + + // Test with 4 layer FC + AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim}, + {input_dim, output_dim}, {batch_size, output_dim}); + AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + + LOG(INFO) << "create tensorrt desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("tensorrt_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z3"})); + + SetAttr(engine_op_desc.Proto(), "subgraph", + block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch", batch_size); + SetAttr(engine_op_desc.Proto(), "max_workspace", 2 << 10); + SetAttr>( + engine_op_desc.Proto(), "parameters", + std::vector({"y0", "y1", "y2", "y3"})); + SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); + + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + + // Execute them. + engine_op->Run(scope, place); +} + +// Test with a larger FC layer. +TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } + +} // namespace operators +} // namespace paddle + +USE_TRT_CONVERTER(mul) +USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index a845ba2eb038fa6a8e70dfbac06c31c19dbb9e3e..e2b7b6b8e447381229e4ad594b7974bc0aa159d5 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -20,25 +20,28 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/send_recv_util.h" +#endif + USE_NO_KERNEL_OP(listen_and_serv); namespace f = paddle::framework; namespace p = paddle::platform; namespace m = paddle::operators::math; -namespace detail = paddle::operators::detail; +namespace distributed = paddle::operators::distributed; namespace string = paddle::string; -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; void StartServer() { f::Scope scope; @@ -54,24 +57,23 @@ void StartServer() { g_req_handler->SetProgram(&empty_program); g_req_handler->SetExecutor(&executor); - g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get()); + g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - g_rpc_service->SetCond(detail::kRequestSend); - std::cout << "before WaitFanInOfSend" << std::endl; - g_rpc_service->WaitBarrier(detail::kRequestSend); + g_rpc_service->SetCond(distributed::kRequestSend); + g_rpc_service->WaitBarrier(distributed::kRequestSend); LOG(INFO) << "got nccl id and stop server..."; g_rpc_service->ShutDown(); server_thread.join(); } -TEST(SendNcclId, GrpcServer) { - g_req_handler.reset(new detail::RequestSendHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); +TEST(SendNcclId, RPCServer) { + g_req_handler.reset(new distributed::RequestSendHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); @@ -88,12 +90,15 @@ TEST(SendNcclId, GrpcServer) { int port = g_rpc_service->GetSelectedPort(); std::string ep = string::Sprintf("127.0.0.1:%d", port); - detail::RPCClient client; + + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); + LOG(INFO) << "connect to server" << ep; - client.AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME); - client.Wait(); - client.AsyncSendBatchBarrier(ep); - client.Wait(); + client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); + client->Wait(); + client->AsyncSendBatchBarrier(ep); + client->Wait(); server_thread.join(); g_rpc_service.reset(nullptr); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc309c65035063348d4934ea1783b018ed..4a8ac441cfaf642fde58ee30865a22e83c065498 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); - AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( Top K operator diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 78fee77df8151221459b0afa0d6789bfe82cfda5..75d6181749e4e9bd81a3c02de69caf0acd81eef9 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -35,10 +35,10 @@ class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { AddComment(R"DOC( -Uniform random operator +UniformRandomBatchSizeLike operator. This operator initializes a tensor with the same batch_size as the Input tensor - with random values sampled from a uniform distribution. +with random values sampled from a uniform distribution. )DOC"); AddAttr("min", diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 137ea91caedabc3167146d91b063dbe9e2e2b931..edd1baa4ace4e246190afcd12b0716f1dd38e243 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -86,32 +86,24 @@ class UniformRandomOp : public framework::OperatorWithKernel { class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The output tensor of uniform random op"); + AddOutput("Out", "The output tensor of uniform random op"); AddComment(R"DOC( -Uniform random operator. - This operator initializes a tensor with random values sampled from a -uniform distribution. +uniform distribution. The random result is in set [min, max]. )DOC"); - AddAttr>("shape", - "(vector) The shape of the output tensor"); - AddAttr("min", - "(float, default -1.0) " - "Minimum value of uniform random") + AddAttr>("shape", "The shape of the output tensor"); + AddAttr("min", "Minimum value of uniform random. [default -1.0].") .SetDefault(-1.0f); - AddAttr("max", - "(float, default 1.0) " - "Maximun value of uniform random") + AddAttr("max", "Maximun value of uniform random. [default 1.0].") .SetDefault(1.0f); AddAttr("seed", - "(int, default 0) " "Random seed used for generating samples. " "0 means use a seed generated by the system." "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") + "generate the same random numbers every time. [default 0].") .SetDefault(0); - AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + AddAttr("dtype", "Output tensor data type. [default 5(FP32)].") .SetDefault(framework::proto::VarType::FP32); } }; diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 175c3ac5d79f24e47d21417df8e3eaeb4d5b2335..f440058e8db2024f5c8a0129db3af87a80d6e551 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase { ->set_lod(inside_tensor.lod()); } } - auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + {{"Out", {pg_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 123d3598f4f4753f70889e415aff0f41b7d212f7..2ce9b31bb81de867ff4ed6ee14afddecd95317b9 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -17,7 +17,7 @@ limitations under the License. */ #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) -#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG) +#if defined(__CUDA_ARCH__) #include #define PADDLE_ASSERT(e) \ do { \ @@ -38,6 +38,9 @@ limitations under the License. */ } while (0) #else #include -#define PADDLE_ASSERT(e) assert(e) +// For cuda, the assertions can affect performance and it is therefore +// recommended to disable them in production code +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion +#define PADDLE_ASSERT(e) assert((e)) #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) #endif diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 4fc9aae8e36e9b43d65fab0b92ec3a2549057128..f832d72b53e8d06a32d5c0ac2ecf7130aa28a666 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -21,12 +21,23 @@ limitations under the License. */ #include #endif +#include #include "gflags/gflags.h" DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); +DEFINE_uint64(initial_cpu_memory_in_mb, +#ifdef PADDLE_WITH_MKLDNN + /* Aligned with mozga-intel, MKLDNN need at least 5000 MB + * to obtain the best performance*/ + 5000, +#else + 500, +#endif + "Initial CPU memory for PaddlePaddle, in MD unit."); + DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," @@ -63,8 +74,11 @@ size_t CpuMinChunkSize() { } size_t CpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. - return CpuMaxAllocSize() / 32; + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, + // or the initial_cpu_memory_in_mb. + return std::min( + static_cast(CpuMaxAllocSize() / 32), + static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); } size_t CUDAPinnedMaxAllocSize() { diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index c0d399d078f73743836fc2a0c1d4b1e6b31ecd83..6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" +DECLARE_bool(cudnn_deterministic); + namespace paddle { namespace platform { @@ -76,8 +78,44 @@ enum class DataLayout { // Not use enum class PoolingMode { kMaximum, kAverage, + kMaximumDeterministic, }; +#if CUDNN_VERSION < 6000 +#pragma message "CUDNN version under 6.0 is supported at best effort." +#pragma message "We strongly encourage you to move to 6.0 and above." +#pragma message "This message is intended to annoy you enough to update." +#pragma message \ + "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/" + +inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return CUDNN_POOLING_MAX; + case PoolingMode::kAverage: + return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + case PoolingMode::kMaximum: + return CUDNN_POOLING_MAX; + default: + PADDLE_THROW("Unexpected pooling mode."); + } +} +#else + +inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return CUDNN_POOLING_MAX_DETERMINISTIC; + case PoolingMode::kAverage: + return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + case PoolingMode::kMaximum: + return CUDNN_POOLING_MAX; + default: + PADDLE_THROW("Unexpected pooling mode."); + } +} +#endif // CUDNN_VERSION < 6000 + template class CudnnDataType; @@ -293,9 +331,7 @@ class ScopedPoolingDescriptor { PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( - desc_, (mode == PoolingMode::kMaximum - ? CUDNN_POOLING_MAX - : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING), + desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. kernel.size(), kernel.data(), pads.data(), strides.data())); return desc_; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 1f733d71bdfb777d4a2f316a5fefc3c874879862..6c50ab2685c56bafe146c67fe2ef081ee4c55628 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -175,7 +175,6 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - std::lock_guard guard(mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a9c1984616bc731e0557f2cb89282423aa9c3bac..292ffef1aef12732812b8c5b0020cad73b1d06fc 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include // NOLINT #include #include #include @@ -100,7 +101,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -110,8 +111,6 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; - - mutable std::recursive_mutex mutex_; cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; @@ -119,6 +118,8 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; + + std::mutex mtx_; }; template <> diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 1a9be044e024e4b1dda5ef7d515c65f3a7513710..d9e2afadaf8ec439d158e57c94d3e6e684bce116 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer { DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); - PADDLE_ENFORCE(dynload::cuptiFinalize()); enabled_ = false; } diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 364c4901b297dbd647faae85b01f682a1daace9c..9da787a4073fa002f75154f7c4fba54e9ed8efa6 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,14 +1,23 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) + +# There is no macOS version of NCCL. +if (NOT APPLE) + list(APPEND CUDA_SRCS nccl.cc) +endif() + if (TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() - configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if (WITH_MKLML) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) +endif() +# TODO(TJ): add iomp, mkldnn? diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 81acaff87d3c2025cf0d6185a1590b018bfbd83c..25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -45,7 +45,7 @@ extern void *cublas_dso_handle; std::call_once(cublas_dso_flag, []() { \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ }); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 34d83e395694f55eafca74d63ebf363169ab30e8..77e46fa768b62c277d7b4027de7173e39a5672b4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -39,7 +39,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ }); \ EnforceCUDNNLoaded(#__name); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index e64de7c20fc9d145e51cfc4528e321b3c4ec86c8..e8f4a82ef132be9e4ec3fb76f11766046a2ff638 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -45,7 +45,7 @@ extern void *cupti_dso_handle; std::call_once(cupti_dso_flag, []() { \ cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \ }); \ - void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + static void *p_##__name = dlsym(cupti_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ @@ -72,7 +72,6 @@ extern void *cupti_dso_handle; __macro(cuptiGetResultString); \ __macro(cuptiActivityGetNumDroppedRecords); \ __macro(cuptiActivityFlushAll); \ - __macro(cuptiFinalize); \ __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 46ad4379d5f9572d415ef1d747077217ae29391e..5b9e0820e0b319fe7a636a57a0029caf038b4db3 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -34,7 +34,7 @@ extern void *curand_dso_handle; std::call_once(curand_dso_flag, []() { \ curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ }); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ + static void *p_##__name = dlsym(curand_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 19c01dc5a968c7e1d2b0f15cf9a0e8427004e58b..198d8566b1bd726c5b33d8af22a19cb30a280fa2 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -49,6 +49,8 @@ DEFINE_string( tensorrt_dir, "", "Specify path for loading tensorrt library, such as libnvinfer.so."); +DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); + namespace paddle { namespace platform { namespace dynload { @@ -76,6 +78,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, VLOG(3) << "Try to find library: " << dso_path << " from default system path."; // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to @@ -97,6 +100,10 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, } #endif + if (nullptr == dso_handle) { + LOG(WARNING) << "Can not find library: " << dso_path + << ". Please try to add the lib path to LD_LIBRARY_PATH."; + } return dso_handle; } @@ -206,6 +213,14 @@ void* GetTensorRtDsoHandle() { #endif } +void* GetMKLMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 0de3559b6088086cb52c254535b6ec42da7dd724..ca87dc47f355a8a4fc840262044413414edf00a0 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle(); void* GetLapackDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f61a5e09b3243cbdf570ba7c28a260f181d8848 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag mklml_dso_flag; +void* mklml_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLML_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h new file mode 100644 index 0000000000000000000000000000000000000000..17acefe8cde01809572e4c86cbdccfed9a477a51 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag mklml_dso_flag; +extern void* mklml_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mklml routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using mklmlFunc = decltype(&::__name); \ + std::call_once(mklml_dso_flag, []() { \ + mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ + }); \ + static void* p_##_name = dlsym(mklml_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) + +#define MKLML_ROUTINE_EACH(__macro) \ + __macro(cblas_sgemm); \ + __macro(cblas_saxpy); \ + __macro(cblas_scopy); \ + __macro(cblas_sgemv); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm); \ + __macro(cblas_daxpy); \ + __macro(cblas_dcopy); \ + __macro(cblas_dgemv); \ + __macro(cblas_dgemm_batch); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(MKL_Set_Num_Threads) + +MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); + +#undef DYNAMIC_LOAD_MKLML_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index 37902ae20c5d9d64486232bbd468375c4a50a615..575516f81870fc9f7b92919ffc20a201cb5cbce8 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -37,7 +37,7 @@ extern void* nccl_dso_handle; std::call_once(nccl_dso_flag, []() { \ nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \ }); \ - void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + static void* p_##__name = dlsym(nccl_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index f584a49da0fefe0b064b5fb55b01ec132225ce5e..5d67658b94af75680a100e13eed7b6b052162e00 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -40,7 +40,7 @@ extern void* tensorrt_dso_handle; paddle::platform::dynload::GetTensorRtDsoHandle(); \ PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \ }); \ - void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ PADDLE_ENFORCE(p_##__name, "load %s failed", #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 7c70649d21c547beb824576d4a8ecf6219a9bddf..d157c1fda789b98f06ad069d2a9c4f421ff82dcd 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -40,7 +40,7 @@ extern void* warpctc_dso_handle; std::call_once(warpctc_dso_flag, []() { \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ }); \ - void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + static void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ return reinterpret_cast(p_##_name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7b8c29e1e642ec6bb4023afd8c083311b8b31812..a34e4371cccfd1be0d173fa11595e4368eb65b85 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -44,8 +44,10 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" +#ifndef __APPLE__ #include "paddle/fluid/platform/dynload/nccl.h" -#endif +#endif // __APPLE__ +#endif // PADDLE_WITH_CUDA namespace paddle { namespace platform { @@ -174,6 +176,7 @@ inline typename std::enable_if::type throw_on_error( throw std::runtime_error(err + string::Sprintf(args...)); } +#ifndef __APPLE__ template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -184,7 +187,7 @@ inline typename std::enable_if::type throw_on_error( string::Sprintf(args...)); } } - +#endif // __APPLE__ #endif // PADDLE_WITH_CUDA template diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f1187620d81ff3bc1deef2106edb54d6199fa927..ed99932546446eb877c9701de15e2d37d29b5f88 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { @@ -86,5 +87,155 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::f32; } +inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { + auto reorder_prim = mkldnn::reorder(src, dst); + std::vector pipeline; + pipeline.push_back(reorder_prim); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { + return static_cast( + memory.get_primitive_desc().desc().data.format); +} + +inline mkldnn::memory::format GetMKLDNNFormat( + const mkldnn::sum::primitive_desc& memory) { + return static_cast( + memory.dst_primitive_desc().desc().data.format); +} + +class MKLDNNHandler { + public: + MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + key_(base_key), + is_reusing_(false) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_src_mem_p"); + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); + } + + std::shared_ptr AcquireDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); + } + + std::shared_ptr AcquireDiffSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::primitive_desc mdp, void* ptr, + const std::string& suffix) { + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mdp, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory(const mkldnn::memory::desc& md, + void* ptr, + const std::string& suffix) { + /*Generate key*/ + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory( + mkldnn::memory::primitive_desc& mpd, + mkldnn::memory::primitive_desc& user_mpd, + const std::shared_ptr user_memory_p, + const std::string& suffix, std::vector& pipeline) { + // create reorder primitive if the input format is not the preferred one + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (target_memory_p == nullptr) { + target_memory_p = user_memory_p; + std::shared_ptr reorder_p; + if (mpd != user_mpd) { + target_memory_p = std::make_shared(mpd); + + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + dev_ctx_.SetBlob(local_key, target_memory_p); + } else { + // Make reorder if needed + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + pipeline.push_back(*reorder_p); + } + is_reusing_ = true; + } + return target_memory_p; + } + + static std::string GetHash(mkldnn::memory::dims& operand_dims, + const std::string& suffix) { + auto dims2str = [](const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(operand_dims) + suffix; + }; + + protected: + const MKLDNNDeviceContext& dev_ctx_; + mkldnn::engine engine_; + std::string key_; + bool is_reusing_; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6f8e3f22db54d166cf97cfdd3d009058207a7ca5..cc46c88fd1f9a5d1bacad26beed6fd0af6405310 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { } } +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// NCCL actions when use it. class NCCLGroupGuard { public: static std::mutex &NCCLMutex() { diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index bcf6d4dd3087060c016e53722cde80704ef2e834..fcd3356d44ee592233c3883d439d0677714900b8 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) { .value("STRINGS", pd::proto::AttrType::STRINGS) .value("BOOL", pd::proto::AttrType::BOOLEAN) .value("BOOLS", pd::proto::AttrType::BOOLEANS) - .value("BLOCK", pd::proto::AttrType::BLOCK); + .value("BLOCK", pd::proto::AttrType::BLOCK) + .value("BLOCKS", pd::proto::AttrType::BLOCKS); pybind11::class_ op_desc(*m, "OpDesc", ""); op_desc @@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) { .def("set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) + .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_serialized_attr", [](pd::OpDesc &self, const std::string &name, const pybind11::bytes &seriralized) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3af8941be69fe507bc105e26b608ec768e4b5998..36d080996831d4ad90d92baeafbe964693e2332a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -144,28 +145,75 @@ PYBIND11_PLUGIN(core) { py::class_(m, "LoDTensor") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) - .def( - "__init__", - [](LoDTensor &instance, const std::vector> &lod) { - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - new (&instance) LoDTensor(new_lod); - }) + .def("__init__", + [](LoDTensor &instance, const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE( + CheckLoD(new_offset_lod, -1), + "the provided recursive_sequence_lengths info is invalid"); + new (&instance) LoDTensor(new_offset_lod); + }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) + // We implement offset based LOD in C++ while we use length based with + // Python API. So we changed set_lod to set_recursive_sequence_lengths to + // avoid misuse. + // The discussion is here: + // https://github.com/PaddlePaddle/Paddle/issues/10855 .def("set_lod", [](LoDTensor &self, const std::vector> &lod) { + // the input lod is offset-based level-of-detail info LoD new_lod; new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()), + "the provided lod info is invalid"); self.set_lod(new_lod); }) - .def("lod", [](LoDTensor &self) -> std::vector> { - auto lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; + .def("set_recursive_sequence_lengths", + [](LoDTensor &self, const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + "the provided recursive_sequence_lengths info is invalid"); + self.set_lod(new_offset_lod); + }) + .def("lod", + [](LoDTensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }) + // Set above comments of set_lod. + .def("recursive_sequence_lengths", + [](LoDTensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }) + .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the LoDTensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); }); py::class_(m, "SelectedRows") @@ -250,6 +298,37 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ReInit); + using LoDTensorBlockingQueue = + ::paddle::operators::reader::LoDTensorBlockingQueue; + using LoDTensorBlockingQueueHolder = + ::paddle::operators::reader::LoDTensorBlockingQueueHolder; + py::class_(m, "LoDTensorBlockingQueue", "") + .def("push", + [](LoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + pybind11::gil_scoped_release release; + return self.Push(lod_tensor_vec); + }) + .def("size", &LoDTensorBlockingQueue::Size) + .def("capacity", &LoDTensorBlockingQueue::Cap) + .def("close", &LoDTensorBlockingQueue::Close) + .def("is_closed", &LoDTensorBlockingQueue::IsClosed); + + m.def("init_lod_tensor_blocking_queue", + [](Variable &var, size_t capacity, + const std::vector> &shapes) + -> LoDTensorBlockingQueue * { + std::vector dims(shapes.size()); + std::transform(shapes.begin(), shapes.end(), dims.begin(), + [](const std::vector &shape) { + return make_ddim(shape); + }); + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, dims); + return holder->GetQueue().get(); + }, + py::return_value_policy::reference); + py::class_(m, "Scope", "") .def("var", [](Scope &self, const std::string &name) -> Variable * { @@ -413,9 +492,14 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) - .def("run", - (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & - Executor::Run); +#ifdef PADDLE_WITH_DISTRIBUTE + .def("complete", &Executor::Complete) +#endif + .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, + int block_id, bool create_local_scope, bool create_vars) { + pybind11::gil_scoped_release release; + self.Run(prog, scope, block_id, create_local_scope, create_vars); + }); m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); @@ -509,16 +593,24 @@ All parameter, weight, gradient are variables in Paddle. self.num_threads_ = num_threads; }) .def_property( - "use_event", - [](const ExecutionStrategy &self) { return self.use_event_; }, - [](ExecutionStrategy &self, bool use_event) { - self.use_event_ = use_event; + "use_cuda", + [](const ExecutionStrategy &self) { return self.use_cuda_; }, + [](ExecutionStrategy &self, bool use_cuda) { + self.use_cuda_ = use_cuda; }) .def_property( "allow_op_delay", [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, [](ExecutionStrategy &self, bool allow_op_delay) { self.allow_op_delay_ = allow_op_delay; + }) + .def_property( + "num_iteration_per_drop_scope", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_drop_scope_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { + self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; }); py::class_ build_strategy(pe, "BuildStrategy"); @@ -545,6 +637,12 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { self.gradient_scale_ = strategy; + }) + .def_property( + "debug_graphviz_path", + [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, + [](BuildStrategy &self, const std::string &path) { + self.debug_graphviz_path_ = path; }); pe.def(py::init &, @@ -567,7 +665,12 @@ All parameter, weight, gradient are variables in Paddle. &ParallelExecutor::FeedTensorsIntoLocalScopes) .def("feed_and_split_tensor_into_local_scopes", &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", &ParallelExecutor::Run); + .def("run", [](ParallelExecutor &self, + const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + pybind11::gil_scoped_release release; + self.Run(fetch_tensors, fetched_var_name); + }); BindRecordIOWriter(&m); return m.ptr(); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 93b09ed6922b32a5531224acc470daf0d97f95bd..3e2ea1ef88b03f5b2576c1cee2b5d26a439943da 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -97,7 +97,7 @@ struct CastToPyBufferImpl { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { auto buffer_info = details::CastToPyBufferImpl()(tensor); + uint8_t, platform::float16>()(tensor); return buffer_info; } @@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCPUTensorSetFromArray( +inline void PyCPUTensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDATensorSetFromArray( +inline void PyCUDATensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDAPinnedTensorSetFromArray( +inline void PyCUDAPinnedTensorSetFromArray( framework::Tensor *self, pybind11::array_t diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f3d8b1a39e849d5f5a9e79cf33252b60170ced81..854e4baa3987f61353038c7b26acf43943c89636 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef MATHFUNCTIONS_H_ -#define MATHFUNCTIONS_H_ +#pragma once #ifdef PADDLE_WITH_MKLML #include @@ -21,7 +20,7 @@ limitations under the License. */ #include #endif -#if defined(PADDLE_USE_VECLIB) +#ifdef PADDLE_USE_VECLIB extern "C" { #include #include @@ -30,8 +29,10 @@ extern "C" { #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { @@ -126,5 +127,3 @@ template void vTanh(const int n, const T* a, T* r); } // namespace paddle - -#endif // MATHFUNCTIONS_H_ diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 113d02ce4865877d9385da31d996c0985c348716..037688bde9122c1d999e90f2438977b46c1eb531 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -22,7 +22,7 @@ function print_usage() { echo -e "\n${RED}Usage${NONE}: ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" - + echo -e "\n${RED}Options${NONE}: ${BLUE}build${NONE}: run build for x86 platform ${BLUE}build_android${NONE}: run build for android platform @@ -132,7 +132,8 @@ EOF -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ + -DWITH_ANAKIN=${WITH_ANAKIN:-ON} } function abort(){ @@ -181,6 +182,7 @@ function build() { ============================================ EOF make clean + make -j `nproc` make install -j `nproc` } @@ -196,7 +198,7 @@ function build_android() { fi ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API - + cat <(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader. diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index f082e33be3357fbe405ab1a1ef5e0e601108a363..527044b415533cc640e3cfc5837c08ab0f8b74b1 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -119,7 +119,8 @@ def reader_creator(data_file, yield sample, int(label) - 1 if use_xmap: - return xmap_readers(mapper, reader, cpu_count(), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d53a96a7a79456d1f3ba640b1cbab6cc314e4d24..45af83708ea63fc1b6aa86f1e8423bb44b7388a6 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -26,6 +26,7 @@ from trainer import BeginEpochEvent from trainer import EndEpochEvent from trainer import BeginStepEvent from trainer import EndStepEvent +from trainer import CheckpointConfig import inferencer from inferencer import Inferencer @@ -43,7 +44,7 @@ import metrics import transpiler from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace +from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from transpiler import DistributeTranspiler, InferenceTranspiler, \ memory_optimize, release_memory from concurrency import (Go, make_channel, channel_send, channel_recv, @@ -82,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ 'profiler', 'unique_name', 'recordio_writer', + 'Scope', ] @@ -116,11 +118,11 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope' + 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb' ] if core.is_compiled_with_cuda(): read_env_flags += [ - 'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune' + 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index 6abe8233b07c484494848c566e9898600a7d8f5c..358e24df31bb517604481bb48b9180e579f8460d 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -36,6 +36,25 @@ def _is_number_or_matrix_(var): class WeightedAverage(object): + """ + Calculate weighted average. + + The average calculating is accomplished via Python totally. + They do not change Paddle's Program, nor do anything to + modify NN model's configuration. They are completely + wrappers of Python functions. + + Examples: + .. code-block:: python + avg = fluid.average.WeightedAverage() + avg.add(value=2.0, weight=1) + avg.add(value=4.0, weight=2) + avg.eval() + + # The result is 3.333333333. + # For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333 + """ + def __init__(self): warnings.warn( "The %s is deprecated, please use fluid.metrics.Accuracy instead." % diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 4f9622d04dc98f41b503ceb780802d2a4e4c58a0..4faa06303170488d0de2fda4c1461cfe2d623d35 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs): for idx, op_desc in enumerate(op_descs): for var_name in op_desc.input_arg_names(): if len(renamed_vars[var_name]) > 1: - pending_sum_ops.append( - (_create_op_desc_("sum", {"X": renamed_vars[var_name]}, - {"Out": [var_name]}, {}), idx)) + pending_sum_ops.append((_create_op_desc_( + "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, + {"use_mkldnn": False}), idx)) renamed_vars[var_name] = [var_name] for var_name in op_desc.output_arg_names(): if var_name == core.empty_var_name( @@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs): else: if len(renamed_vars[var_name]) == 1: new_name = var_name + "@RENAME@" + \ - str(var_rename_count[var_name]) + str(var_rename_count[var_name]) var_rename_count[var_name] += 1 # rename original var_name renamed_vars[var_name][0] = new_name @@ -155,14 +155,15 @@ def _addup_repetitive_outputs_(op_descs): _rename_arg_(pending_sum_ops, var_name, new_name) new_name = var_name + "@RENAME@" + \ - str(var_rename_count[var_name]) + str(var_rename_count[var_name]) var_rename_count[var_name] += 1 op_desc.rename_output(var_name, new_name) renamed_vars[var_name].append(new_name) for var_name, inputs in renamed_vars.iteritems(): if len(inputs) > 1: - pending_sum_ops.append((_create_op_desc_( - "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs))) + pending_sum_ops.append( + (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, + {"use_mkldnn": False}), len(op_descs))) # sum_op descs are sorted according to their insert position for p in reversed(pending_sum_ops): op_descs.insert(p[1], p[0]) @@ -434,18 +435,65 @@ def _get_stop_gradients_(program): def append_backward(loss, parameter_list=None, no_grad_set=None, callbacks=None): """ - Append backward part to main_program + Append backward part to main_program. - Args: - loss(Variable): The variable generated by cost function. - parameter_list(list[string]): Parameters that need to be updated by - optimizer. If None, it means all parameters need to be updated. - no_grad_set(set): Variables that have no gradients in Block 0. - All variables with `step_gradient=True` from all blocks will be - automatically added. + A complete neural network training is made up of forward and backward + propagation. However, when we configure a network, we only need to + specify its forwrd part. The backward part is generated automatically + according to the forward part by this function. - Return: - (list[(Variable,Variable)]): list of (parameter, gradient) pair. + In most cases, users do not need to invoke this function manually. It + will be automatically invoked by the optimizer's `minimize` function. + + Args: + loss(Variable): The loss variable of the network. + parameter_list(list[string]|None): Names of parameters that need + to be updated by optimizers. + If it is None, all parameters + will be updated. + Default: None + no_grad_set(set|None): Variables in the Block 0 whose gradients + should be ignored. All variables with + `step_gradient=True` from all blocks will + be automatically added into this set. + Default: None + callbacks(list[callable object]|None): The callbacks are used for + doing some custom jobs during + backward part building. All + callable objects in it will + be invoked once each time a + new gradient operator is added + into the program. The callable + object must has two input + parameters: 'block' and 'context'. + The 'block' is the block which + the new gradient operator will + be added to. The 'context' is a + map, whose keys are gradient + variable names and values are + corresponding original variables. + In addition to this, the 'context' + has another special key-value pair: + the key is string '__current_op_desc__' + and the value is the op_desc of the + gradient operator who has just + triggered the callable object. + + Returns: + list[(Variable,Variable)]: Pairs of parameter and its + corresponding gradients. The key is the parameter and the + value is gradient variable. + + Raises: + AssertionError: If `loss` is not an instance of Variable. + + Examples: + .. code-block:: python + + # network configuration code + # ... + avg_loss = fluid.layers.mean(loss) + param_grad_list = fluid.backward.append_backward(loss=avg_loss) """ assert isinstance(loss, framework.Variable) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 66c3fc6b66d61bc9578f84594409ad0f24c99910..18e2f3045e272fb4712391f87bffd3f367c1c744 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -24,8 +24,6 @@ __all__ = [ 'GradientClipByValue', 'GradientClipByNorm', 'GradientClipByGlobalNorm', - 'append_gradient_clip_ops', - 'error_clip_callback', ] @@ -38,6 +36,25 @@ class BaseErrorClipAttr(object): class ErrorClipByValue(BaseErrorClipAttr): + """ + Clips tensor values to the range [min, max]. + + Given a tensor t, this operation clips its value to min and max inplace. + + - Any values less than min are set to min. + - Any values greater than max are set to max. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, \ + will be set to -max by framework. + + Examples: + .. code-block:: python + + var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...) + """ + def __init__(self, max, min=None): max = float(max) if min is None: @@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr): class GradientClipByValue(BaseGradientClipAttr): + """ + Clips gradient values to the range [min, max]. + + Given a tensor t, this operation clips its value to min and max inplace. + + - Any values less than min are set to min. + - Any values greater than max are set to max. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, \ + will be set to -max by framework. + + Examples: + .. code-block:: python + + w_param_attrs = ParamAttr(name=None, + initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + learning_rate=1.0, + regularizer=L1Decay(1.0), + trainable=True, + clip=GradientClipByValue(-1.0, 1.0)) + y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) + """ + def __init__(self, max, min=None): max = float(max) if min is None: @@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr): class GradientClipByNorm(BaseGradientClipAttr): + """ + Clips tensor values to a maximum L2-norm. + + This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`. + If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out` + will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than + :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of + :math:`Out` equal to :math:`max\_norm`, as shown in the following formula: + + .. math:: + + Out = \\frac{max\_norm * X}{norm(X)}, + + where :math:`norm(X)` represents the L2 norm of :math:`X`. + + Args: + clip_norm (float): The maximum norm value + + Examples: + .. code-block:: python + + w_param_attrs = ParamAttr(name=None, + initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + learning_rate=1.0, + regularizer=L1Decay(1.0), + trainable=True, + clip=GradientClipByNorm(clip_norm=2.0)) + y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) + + """ + def __init__(self, clip_norm): self.clip_norm = clip_norm @@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr): class GradientClipByGlobalNorm(BaseGradientClipAttr): + """ + Clips values of multiple tensors by the ratio of the sum of their norms. + + Given a list of tensors t_list, and a clipping ratio clip_norm, this + operation returns a list of clipped tensors list_clipped and the global + norm (global_norm) of all tensors in t_list. + + To perform the clipping, the values :math:`t\_list[i]` are set to: + + .. math:: + + t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)} + + where: + + .. math:: + + global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + + If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are, + otherwise they're all shrunk by the global ratio. + + Args: + clip_norm (float): The maximum norm value + group_name (str, optional): The group name for this clip. + + Examples: + .. code-block:: python + + p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) + + with fluid.program_guard(main_program=prog_clip): + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0)) + p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) + + """ + def __init__(self, clip_norm, group_name="default_group"): if not isinstance(group_name, basestring): raise TypeError("'group_name' must be a basestring.") @@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): def set_gradient_clip(clip, param_list=None, program=None): """ - To specify parameters that require gradient clip. - Args: - clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, - which describes the type and detailed attributes of required gradient clip. - param_list(list, None by default): Parameters that require gradient clip. - It can be a list of parameter or a list of parameter's name. - When it's None, all parameters in the program will be included. - program(Program, None by default): The program where parameters are. - Will be the default main program when assigned with None. + To specify parameters that require gradient clip. + + Args: + clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, + which describes the type and detailed attributes of required gradient clip. + param_list(list(Variable)): Parameters that require gradient clip. + It can be a list of parameter or a list of parameter's name. + When it's None, all parameters in the program will be included. + program(Program): The program where parameters are. + Will be the default main program when assigned with None. """ if not isinstance(clip, BaseGradientClipAttr): raise TypeError( diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7940dabcfb03cc9eb46f678365685a6e99bcceec..c859778b3757f638ac531620f241e684522add57 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -15,6 +15,7 @@ from __future__ import print_function import core import numpy +import os import six.moves as six import multiprocessing @@ -28,6 +29,13 @@ class DataToLoDTensorConverter(object): self.place = place self.lod_level = lod_level self.shape = shape + negtive_count = 0 + for s in self.shape: + if s < 0: + negtive_count += 1 + if negtive_count > 1: + self.shape = None + break if dtype == core.VarDesc.VarType.FP32: self.dtype = 'float32' elif dtype == core.VarDesc.VarType.INT64: @@ -46,7 +54,7 @@ class DataToLoDTensorConverter(object): self.lod = [] for i in six.range(lod_level): - self.lod.append([0]) + self.lod.append([]) def feed(self, data): self._feed_impl_(data, self.lod, self.lod_level) @@ -55,21 +63,77 @@ class DataToLoDTensorConverter(object): if lod_level == 0: self.data.append(data) else: - cur_lod_len = len(data) - lod[0].append(lod[0][-1] + cur_lod_len) + lod[0].append(len(data)) for each_data in data: self._feed_impl_(each_data, lod[1:], lod_level - 1) def done(self): - arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) + arr = numpy.array(self.data, dtype=self.dtype) + if self.shape: + arr = arr.reshape(self.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: - t.set_lod(self.lod) + t.set_recursive_sequence_lengths(self.lod) return t class DataFeeder(object): + """ + DataFeeder converts the data that returned by a reader into a data + structure that can feed into Executor and ParallelExecutor. The reader + usually returns a list of mini-batch data entries. Each data entry in + the list is one sample. Each sample is a list or a tuple with one + feature or multiple features. + + The simple usage shows below: + + .. code-block:: python + + place = fluid.CPUPlace() + img = fluid.layers.data(name='image', shape=[1, 28, 28]) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) + result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])]) + + + If you want to feed data into GPU side separately in advance when you + use multi-GPU to train a model, you can use `decorate_reader` function. + + .. code-block:: python + + place=fluid.CUDAPlace(0) + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + reader = feeder.decorate_reader( + paddle.batch(flowers.train(), batch_size=16)) + + Args: + feed_list(list): The Variables or Variables'name that will + feed into model. + place(Place): place indicates feed data into CPU or GPU, if you want to + feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents + the GPU id), or if you want to feed data into CPU, please using + `fluid.CPUPlace()`. + program(Program): The Program that will feed data into, if program + is None, it will use default_main_program(). Default None. + + Raises: + ValueError: If some Variable is not in this Program. + + Examples: + .. code-block:: python + + # ... + place = fluid.CPUPlace() + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_vars_name + ] # feed_vars_name is a list of variables' name. + feeder = fluid.DataFeeder(feed_list, place) + for data in reader(): + outs = exe.run(program=main_program, + feed=feeder.feed(data)) + """ + def __init__(self, feed_list, place, program=None): self.feed_dtypes = [] self.feed_names = [] @@ -99,6 +163,16 @@ class DataFeeder(object): self.place = place def feed(self, iterable): + """ + According to feed_list and iterable, converters the input into + a data structure that can feed into Executor and ParallelExecutor. + + Args: + iterable(list|tuple): the input data. + + Returns: + dict: the result of conversion. + """ converter = [] for lod_level, shape, dtype in six.zip( self.feed_lod_level, self.feed_shapes, self.feed_dtypes): @@ -121,6 +195,20 @@ class DataFeeder(object): return ret_dict def feed_parallel(self, iterable, num_places=None): + """ + Takes multiple mini-batches. Each mini-batch will be feed on each + device in advance. + + Args: + iterable(list|tuple): the input data. + num_places(int): the number of devices. Default None. + + Returns: + dict: the result of conversion. + + Notes: + The number of devices and number of mini-batches must be same. + """ if isinstance(self.place, core.CUDAPlace): places = [ core.CUDAPlace(i) @@ -150,13 +238,33 @@ class DataFeeder(object): elif isinstance(self.place, core.CUDAPlace): return core.get_cuda_device_count() else: - return multiprocessing.cpu_count() + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + return cpu_num def decorate_reader(self, reader, multi_devices, num_places=None, drop_last=True): + """ + Converter the input data into a data that returned by reader into + multiple mini-batches. Each mini-batch will be feed on each device. + + Args: + reader(fun): the input data. + multi_devices(bool): the number of places. Default None. + num_places(int): the number of places. Default None. + drop_last(bool): the number of places. Default None. + + Returns: + dict: the result of conversion. + + Raises: + ValueError: If drop_last is False and the data batch which cannot + fit for devices. + """ + def __reader_creator__(): if not multi_devices: for item in reader(): diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 7c6ad6f27dcfd7040f79c72c01413c8cc84a28ba..00ba1a0457583d1cc1fa7136ebd51e9ced167832 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -41,7 +41,12 @@ def _clone_var_(block, var): class Evaluator(object): """ - Base Class for all evaluators + Warning: better to use the fluid.metrics.* things, more + flexible support via pure Python and Operator, and decoupled + with executor. Short doc are intended to urge new user + start from Metrics. + + Base Class for all evaluators. Args: name(str): The name of evaluator. such as, "accuracy". Used for generate @@ -69,6 +74,10 @@ class Evaluator(object): def reset(self, executor, reset_program=None): """ reset metric states at the begin of each pass/user specified batch + + Args: + executor(Executor|ParallelExecutor): a executor for executing the reset_program + reset_program(Program): a single Program for reset process """ if reset_program is None: reset_program = Program() @@ -85,15 +94,16 @@ class Evaluator(object): def eval(self, executor, eval_program=None): """ Evaluate the statistics merged by multiple mini-batches. + Args: + executor(Executor|ParallelExecutor): a executor for executing the eval_program + eval_program(Program): a single Program for eval process """ raise NotImplementedError() - def create_state(self, suffix, dtype, shape): + def _create_state(self, suffix, dtype, shape): """ Create state variable. - NOTE: It is not a public API. - Args: suffix(str): the state suffix. dtype(str|core.VarDesc.VarType): the state data type @@ -113,9 +123,35 @@ class Evaluator(object): class ChunkEvaluator(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator + instead. + Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + + Args: + input (Variable): prediction output of the network. + label (Variable): label of the test data set. + chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details. + num_chunk_types (int): the number of chunk type. + excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted. + + Returns: + tuple: tuple containing: precision, recall, f1_score + + Examples: + .. code-block:: python + + exe = fluid.executor(place) + evaluator = fluid.Evaluator.ChunkEvaluator(input, label) + for epoch in PASS_NUM: + evaluator.reset(exe) + for data in batches: + loss = exe.run(fetch_list=[cost]) + distance, instance_error = distance_evaluator.eval(exe) """ def __init__( @@ -130,11 +166,11 @@ class ChunkEvaluator(Evaluator): if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") - self.num_infer_chunks = self.create_state( + self.num_infer_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_infer_chunks') - self.num_label_chunks = self.create_state( + self.num_label_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_label_chunks') - self.num_correct_chunks = self.create_state( + self.num_correct_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_correct_chunks') precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( input=input, @@ -178,6 +214,8 @@ class ChunkEvaluator(Evaluator): class EditDistance(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance + instead. Accumulate edit distance sum and sequence number from mini-batches and compute the average edit_distance and instance error of all batches. @@ -188,15 +226,16 @@ class EditDistance(Evaluator): ignored_tokens(list of int): Tokens that should be removed before calculating edit distance. - Example: + Examples: + .. code-block:: python - exe = fluid.executor(place) - distance_evaluator = fluid.Evaluator.EditDistance(input, label) - for epoch in PASS_NUM: - distance_evaluator.reset(exe) - for data in batches: - loss = exe.run(fetch_list=[cost]) - distance, instance_error = distance_evaluator.eval(exe) + exe = fluid.executor(place) + distance_evaluator = fluid.Evaluator.EditDistance(input, label) + for epoch in PASS_NUM: + distance_evaluator.reset(exe) + for data in batches: + loss = exe.run(fetch_list=[cost]) + distance, instance_error = distance_evaluator.eval(exe) In the above example: 'distance' is the average of the edit distance in a pass. @@ -210,11 +249,11 @@ class EditDistance(Evaluator): if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") - self.total_distance = self.create_state( + self.total_distance = self._create_state( dtype='float32', shape=[1], suffix='total_distance') - self.seq_num = self.create_state( + self.seq_num = self._create_state( dtype='int64', shape=[1], suffix='seq_num') - self.instance_error = self.create_state( + self.instance_error = self._create_state( dtype='int64', shape=[1], suffix='instance_error') distances, seq_num = layers.edit_distance( input=input, label=label, ignored_tokens=ignored_tokens) @@ -256,9 +295,10 @@ class EditDistance(Evaluator): class DetectionMAP(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP + instead. Calculate the detection mean average precision (mAP). - TODO (Dang Qingqing): update the following doc. The general steps are as follows: 1. calculate the true positive and false positive according to the input of detection and labels. @@ -293,17 +333,18 @@ class DetectionMAP(Evaluator): - 11point: the 11-point interpolated average precision. - integral: the natural integral of the precision-recall curve. - Example: + Examples: + .. code-block:: python - exe = fluid.executor(place) - map_evaluator = fluid.Evaluator.DetectionMAP(input, - gt_label, gt_box, gt_difficult) - cur_map, accum_map = map_evaluator.get_map_var() - fetch = [cost, cur_map, accum_map] - for epoch in PASS_NUM: - map_evaluator.reset(exe) - for data in batches: - loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) + exe = fluid.executor(place) + map_evaluator = fluid.Evaluator.DetectionMAP(input, + gt_label, gt_box, gt_difficult) + cur_map, accum_map = map_evaluator.get_map_var() + fetch = [cost, cur_map, accum_map] + for epoch in PASS_NUM: + map_evaluator.reset(exe) + for data in batches: + loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) In the above example: @@ -340,9 +381,10 @@ class DetectionMAP(Evaluator): evaluate_difficult=evaluate_difficult, ap_version=ap_version) - self.create_state(dtype='int32', shape=None, suffix='accum_pos_count') - self.create_state(dtype='float32', shape=None, suffix='accum_true_pos') - self.create_state(dtype='float32', shape=None, suffix='accum_false_pos') + self._create_state(dtype='int32', shape=None, suffix='accum_pos_count') + self._create_state(dtype='float32', shape=None, suffix='accum_true_pos') + self._create_state( + dtype='float32', shape=None, suffix='accum_false_pos') self.has_state = None var = self.helper.create_variable( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 93aa5f908ec929a33089a62caa2186ba9e57fffe..145f1423e4b4a2ce35ba8ac3cca37935df90727e 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -18,17 +18,24 @@ from framework import Program, default_main_program, Variable from . import core __all__ = [ - 'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var' + 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var' ] g_scope = core.Scope() def global_scope(): + """ + Get the global/default scope instance. There are a lot of APIs use + :code:`global_scope` as its default value, e.g., :code:`Executor.run` + + Returns: + Scope: The global/default scope instance. + """ return g_scope -def switch_scope(scope): +def _switch_scope(scope): global g_scope ex = g_scope g_scope = scope @@ -37,12 +44,42 @@ def switch_scope(scope): @contextlib.contextmanager def scope_guard(scope): - ex = switch_scope(scope) + """ + Change the global/default scope instance by Python `with` statement. All + variable in runtime will assigned to the new scope. + + Examples: + >>> import paddle.fluid as fluid + >>> new_scope = fluid.Scope() + >>> with fluid.scope_guard(new_scope): + >>> ... + + Args: + scope: The new global/default scope. + """ + ex = _switch_scope(scope) yield - switch_scope(ex) + _switch_scope(ex) def as_numpy(tensor): + """ + Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + Examples: + >>> import paddle.fluid as fluid + >>> outs = executor.run(...) + >>> np_outs = map(lambda x: as_numpy(x), outs) + >>> ... + + Args: + tensor(Variable): a instance of Tensor + + Returns: + numpy.ndarray + """ + if isinstance(tensor, core.LoDTensorArray): + return [as_numpy(t) for t in tensor] if isinstance(tensor, list): return [as_numpy(t) for t in tensor] assert isinstance(tensor, core.LoDTensor) @@ -135,14 +172,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): def fetch_var(name, scope=None, return_numpy=True): """ - Fetch the value of the variable with the given name from the given scope + Fetch the value of the variable with the given name from the + given scope. + Args: name(str): name of the variable. Typically, only persistable variables can be found in the scope used for running the program. scope(core.Scope|None): scope object. It should be the scope where you pass to Executor.run() when running your program. - If None, global_scope() will be used. - return_numpy(bool): whether convert the tensor to numpy.ndarray + If None, global_scope() will be used. Default None. + return_numpy(bool): whether convert the tensor to numpy.ndarray. + Default True. + Returns: LodTensor|numpy.ndarray """ @@ -162,7 +203,7 @@ def fetch_var(name, scope=None, return_numpy=True): return tensor -def get_program_cache_key(feed, fetch_list): +def _get_program_cache_key(feed, fetch_list): feed_var_names = feed.keys() def to_name_str(var): @@ -170,6 +211,8 @@ def get_program_cache_key(feed, fetch_list): return var.desc.name() elif isinstance(var, str): return var + elif isinstance(var, basestring): + return str(var) else: raise TypeError(str(var) + " should be Variable or str") @@ -179,6 +222,25 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): + """ + An Executor in Python, only support the single-GPU running. For multi-cards, please refer to + ParallelExecutor. + Python executor takes a program, add feed operators and fetch operators to this program according + to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides + the variables(or names) that user want to get after program run. Note: the executor will run all + operators in the program but not only the operators dependent by the fetch_list. + It store the global variables into the global scope, and create a local scope for the temporary + variables. The local scope contents will be discarded after every minibatch forward/backward finished. + But the global scope variables will be persistent through different runs. + All of ops in program will be running in sequence. + + Args: + place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device + + Note: For debugging complicated network in parallel-GPUs, you can test it on the executor. + They has the exactly same arguments, and expected the same results. + """ + def __init__(self, place): self.place = place p = core.Place() @@ -187,6 +249,23 @@ class Executor(object): self.program_caches = dict() def as_lodtensor(self, data): + """ + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + + Examples: + >>> import paddle.fluid as fluid + >>> exe = fluid.executor(fluid.CPUPlace()) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: exe.as_lodtensor(x), data) + >>> ... + + Args: + data(numpy.ndarray): a instance of array + + Returns: + LoDTensor + """ if isinstance(data, list): raise RuntimeError("Some of your feed data hold LoD information. \ They can not be completely cast from a list of Python \ @@ -278,23 +357,47 @@ class Executor(object): scope=None, return_numpy=True, use_program_cache=False): - """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list. - + """ + Run program by this Executor. Feed data by feed map, fetch result by fetch_list. Python executor takes a program, add feed operators and fetch operators to this program according to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides - the variables(or names) that user want to get after program run. Note: the executor will run all + the variables(or names) that user want to get after program run. + + Note: the executor will run all operators in the program but not only the operators dependent by the fetch_list - :param program: the program that need to run, if not provied, then default_main_program will be used. - :param feed: feed variable map, e.g. {"image": ImageData, "label": LableData} - :param fetch_list: a list of variable or variable names that user want to get, run will return them according - to this list. - :param feed_var_name: the name for the input variable of feed Operator. - :param fetch_var_name: the name for the output variable of feed Operator. - :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope - :param return_numpy: if convert the fetched tensor to numpy - :param use_program_cache: set use_program_cache to true if program not changed compare to the last step. - :return: result according to fetch_list. + Args: + program(Program): the program that need to run, if not provied, then default_main_program will be used. + feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData} + fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list. + feed_var_name(str): the name for the input variable of feed Operator. + fetch_var_name(str): the name for the output variable of fetch Operator. + scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope + return_numpy(bool): if convert the fetched tensor to numpy + use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step. + + Returns: + + list(numpy.array): fetch result according to fetch_list. + + + Examples: + + >>> data = layers.data(name='X', shape=[1], dtype='float32') + >>> hidden = layers.fc(input=data, size=10) + >>> layers.assign(hidden, out) + >>> loss = layers.mean(out) + >>> adam = fluid.optimizer.Adam() + >>> adam.minimize(loss) + + >>> cpu = core.CPUPlace() + >>> exe = Executor(cpu) + >>> exe.run(default_startup_program()) + + >>> x = numpy.random.random(size=(10, 1)).astype('float32') + >>> outs = exe.run( + >>> feed={'X': x}, + >>> fetch_list=[loss.name]) """ if feed is None: feed = {} @@ -315,7 +418,7 @@ class Executor(object): if scope is None: scope = global_scope() - cache_key = get_program_cache_key(feed, fetch_list) + cache_key = _get_program_cache_key(feed, fetch_list) if use_program_cache: cached_program = self._get_program_cache(cache_key) if cached_program is None: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 33b5caa0eab0ec192eb4a3b63cf82a672c58d2cb..2b2462b771a3801bf220ad6e09ee0c44f7b367b2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -30,8 +30,6 @@ __all__ = [ 'default_startup_program', 'default_main_program', 'program_guard', - 'switch_startup_program', - 'switch_main_program', 'get_var', ] @@ -43,7 +41,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() def grad_var_name(var_name): """ - return gradient name for a certain var name + Returns: + str: gradient name for a certain var name """ return var_name + GRAD_VAR_SUFFIX @@ -51,10 +50,12 @@ def grad_var_name(var_name): def convert_np_dtype_to_dtype_(np_dtype): """ Convert the data type in numpy to the data type in Paddle + Args: - np_dtype(np.dtype): the data type in numpy + np_dtype(np.dtype): the data type in numpy. - Returns(core.VarDesc.VarType): the data type in Paddle + Returns: + core.VarDesc.VarType: the data type in Paddle. """ dtype = np.dtype(np_dtype) @@ -72,6 +73,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT64 elif dtype == np.bool: return core.VarDesc.VarType.BOOL + elif dtype == np.uint16: + return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 else: @@ -118,37 +121,53 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): """ - Python variable. Every input and output of an operator is a variable. Every - variable belongs to a block. The variable has a name and two variables in - different blocks could have the same name. - - There are many kinds of variables. Please reference the framework.proto for - details. + In Fluid, every input and output of an operator is a variable. In most + cases, variables are used for holding different kinds of data or training + labels. A variable belongs to a block. All variable has its own name and + two variables in different blocks could have the same name. - Notes: The constructor of Variable should not be invoked directly. Please - use `Block.create_var` to create a variable. + There are many kinds of variables. Each kind of them has its own attributes + and usages. Please reference the framework.proto for details. - >>> cur_program = Program() - >>> cur_block = cur_program.current_block() - >>> new_variable = cur_block.create_var( - >>> name="X", shape=[-1, 23, 48], dtype='float32') + Most of a Variable's member variables can be setted to be None. It mean + it is not available or will be specified later. Args: - block(Block): The associated block. It will be passed by - `Block.create_var` automatically. + block(Block): The block that the variable belongs to. type(core.VarDesc.VarType): Variable type. Please reference the framework.proto for details. - shape(tuple|list|None): The shape of variable. -1 means the batch size. + name(str|None): The name of the variable. If setted None, it will be + generated automatically. Default: None + shape(tuple|list|None): The shape of the variable. -1 means the batch size. Some kinds of variable do not contain shape, just set it to None. - dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable. - lod_level(int): The level of lod tensor. 0 means it is not a time + Default: None + dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable. + Default: None + lod_level (int|None): The level of lod tensor. 0 means it is not a time series data. - capacity(int): The capacity of Channel variable. Ignored - for other types. - persistable(bool): True if the variable should be saved as check point. - Defaults to False. - stop_gradient(bool): True if the variable will stop to calculate - gradients when backward. Defaults to False. + Default: None + capacity (int|None): The capacity of Channel variable. Ignored for other + types. Default: None + persistable (bool|None): True if the variable is persistable. A persistable + variable will not be deleted after an iteration ending. Defaults: None. + error_clip (BaseErrorClipAttr|None): The error clip attributes of the + corresponding gradient variable. Default: None + stop_gradient (bool): True if the variable will stop to calculate its + gradients when backward. Default: False. + is_data (bool): True if the variable is an input data. Default: False + + Notes: + The constructor of Variable should not be invoked directly. Please + use `Block.create_var` to create a variable. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + new_variable = cur_block.create_var(name="X", + shape=[-1, 23, 48], + dtype='float32') """ def __init__(self, @@ -251,13 +270,14 @@ class Variable(object): Get debug string. Args: - throw_on_error(bool): True if raise an exception when self is not - intialized. + throw_on_error(bool): True if raise an exception when self is + not initialized. with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True - - Returns(str): The debug string. + (e.g. trainable, optimize_attr, ...) will be printed when + with_details is True. Default False; + Returns: + str: The debug string. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -274,6 +294,15 @@ class Variable(object): __repr__ = __str__ def set_desc(self, input): + """ + Set the variable description. + + Args: + input(core.VarDesc): The new VarDesc. + + Returns: + None + """ self.desc = input @property @@ -310,6 +339,15 @@ class Variable(object): return self.desc.type() def set_error_clip(self, error_clip): + """ + Set the error_clip. + + Args: + error_clip(BaseErrorClipAttr) : The new error_clip. + + Returns: + None + """ self.error_clip = error_clip @@ -317,8 +355,8 @@ def get_all_op_protos(): """ Get all registered op proto from PaddlePaddle C++ end. - Returns(list): list of OpProto - + Returns: + list: list of OpProto. """ protostrs = core.get_all_op_protos() ret_values = [] @@ -361,13 +399,63 @@ class OpProtoHolder(object): raise ValueError("Operator \"%s\" has not been registered." % type) return self.op_proto_map[type] + @staticmethod + def generated_op_attr_names(): + return { + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.kOpRoleVarAttrName() + } + class Operator(object): """ - Python Operator class. The operator represents the build in instructions in a - Block. Users can use the build in instructions to describe their neural - network. + In Fluid, all the operation are represented by Operator, and Operator + is regarded as a build in an instruction of a Block. Users can use the + build in instructions to describe their neural network. + + Args: + block(Block): The block has the current operator. + desc(core.OpDesc): The protobuf description of Operator. + type(str): The type of operator. Default None. + inputs(dict): The input of this Operator. it is a dictionary, for every + element, key is the input parameter name, and value is a list of + variables. Default None. + outputs(dict): The output of this Operator. it is a dictionary, for + every element, key is the input parameter name, and value is a list + of variables. Default None. + attrs(dict): The attributes of this Operator. it is a dictionary, for + every element, key is attribute name, and value is the attribute value. + The attribute type should be as same as the type registered in C++ side. + Default None. + + Returns: + Operator: The initialized Operator. + + Raises: + ValueError: If the passed input, output and attrs doesn't match the + initializing Operator's that registered in C++ side. + + Notes: + The constructor of operator should not be invoked directly. Use + Block.append_op or Block.prepend_op instead. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + # var1 += var2 + var3 + cur_block.append_op(type="sum", + inputs={"X": [var1, var2, var3]}, + outputs={"Out": [var1]}) """ + OP_WITHOUT_KERNEL_SET = { + 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'ncclInit', 'channel_create', 'channel_close', 'channel_send', + 'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id' + } def __init__(self, block, @@ -376,31 +464,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - """ - Constructor. - - Notes: The constructor of operator should not be invoked directly. Use - Block.append_op or Block.prepend_op instead. - - >>> cur_program = Program() - >>> cur_block = cur_program.current_block() - >>> # var1 += var2 + var3 - >>> cur_block.append_op(type="sum", - >>> inputs={"X": [var1, var2, var3]}, - >>> outputs={"Out": [var1]}) - Args: - block(Block): The block has the current operator. - desc(core.OpDesc): The protobuf description. - type(str): The type of operator. - inputs(dict): The input dictionary. Key is the input parameter name. - Value is a list of variables. - outputs(dict): The output dictionary which has the same format with - inputs. - attrs(dict): The attributes dictionary. Key is attribute name. Value - is the attribute value. The attribute type should be as same as - the type registered in C++ - """ self.block = block self.desc = desc self.attrs = attrs @@ -494,35 +558,27 @@ class Operator(object): if (attr_name not in self.attrs) or ( self.attrs[attr_name] is None): continue - if isinstance(self.attrs[attr_name], Block): - self.desc.set_block_attr(attr_name, - self.attrs[attr_name].desc) - elif isinstance(self.attrs[attr_name], core.BlockDesc) or \ - isinstance(self.attrs[attr_name], core.ProgramDesc): - self.desc.set_serialized_attr( - attr_name, self.attrs[attr_name].serialize_to_string()) - else: - self.desc.set_attr(attr_name, self.attrs[attr_name]) + attr_val = self.attrs[attr_name] + self._update_desc_attr(attr_name, attr_val) + self.desc.check_attrs() - no_kernel_op_set = { - 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', - 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'gen_nccl_id' - } - if type not in no_kernel_op_set: + if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + def has_kernel(self, op_type): + return op_type not in self.OP_WITHOUT_KERNEL_SET + def to_string(self, throw_on_error): """ - To debug string. + Get debug string. + Args: - throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True + throw_on_error(bool): Whether to raise exception if self is not + initialized. - Returns(str): The debug string. + Returns: + str: The debug string. """ protostr = self.desc.serialize_to_string() @@ -540,29 +596,45 @@ class Operator(object): def input(self, name): """ - Get input arguments by the input parameter name - Args: - name(str): The input parameter name + Get the input arguments according to the input parameter name. - Returns(list): return the list of argument names associated with the - specific parameter name. + Args: + name(str): The input parameter name. + Returns: + list: return the list of argument names that associated with \ + the specific parameter name. """ return self.desc.input(name) def rename_input(self, old_name, new_name): + """ + Rename the `old_name` to `new_name`. + + Args: + old_name(str): The old name of the Operator's input. + new_name(str): The new name of the Operator's input. + + Returns: + None + """ self.desc.rename_input(old_name, new_name) def rename_output(self, old_name, new_name): + """ + Rename the `old_name` to `new_name`. + + Args: + old_name(str): The old name of the Operator's output. + new_name(str): The new name of the Operator's output. + + Returns: + None + """ self.desc.rename_output(old_name, new_name) @property def input_names(self): - """ - Get all input parameter names - Returns(list): return a list of input parameter names - - """ return self.desc.input_names() @property @@ -575,33 +647,23 @@ class Operator(object): def output(self, name): """ - Get output arguments by the output parameter name - Args: - name(str): The output parameter name + Get output arguments by the output parameter name. - Returns(list): return the list of argument names associated with the - specific parameter name. + Args: + name(str): The output parameter name. + Returns: + list: return the list of argument names associated with \ + the specific parameter name. """ return self.desc.output(name) @property def output_names(self): - """ - Get all output parameter names - Returns(list): return a list of output parameter names - - """ return self.desc.output_names() @property def idx(self): - """ - Return the array index of current operator. - Returns(int): The array index in block.ops array - Raises: - ValueError: when the operator is not found. - """ for i, op in enumerate(self.block.ops): if op == self: return i @@ -610,66 +672,100 @@ class Operator(object): def has_attr(self, name): """ - operator has the attribute with name or not. + Whether this Operator has the attribute with name or not. + Args: - name(str): the attribute name + name(str): the attribute name. - Returns(bool): True if has this attribute. + Returns: + bool: True if has this attribute. """ return self.desc.has_attr(name) def attr_type(self, name): """ - Get the type of attribute by attribute name - Args: - name(str): the attribute name + Get the type of attribute by attribute's name. - Returns(core.AttrType): the attribute type + Args: + name(str): the attribute name. + Returns: + core.AttrType: the attribute type. """ return self.desc.attr_type(name) def set_attr(self, name, val): + """ + Set the value of attribute by attribute's name. + + Args: + name(str): the attribute name. + val(bool|int|str|float|list): the value of the attribute. + + Raises: + ValueError: If the type of value doesn't match with desc.attr_type(name). + """ self.attrs[name] = val - self.desc.set_attr(name, val) + self._update_desc_attr(name, val) - @property - def attr_names(self): + def _update_desc_attr(self, name, val): """ - Get all attribute names - Returns(list): The list of attribute name + Update the value of desc's attribute by attribute's name. + + Args: + name(str): the attribute name. + val(bool|int|str|float|list): the value of the attribute. + Raises: + ValueError: If the type of value doesn't match with desc.attr_type(name). """ + if isinstance(val, Block): + self.desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + self.desc.set_blocks_attr(name, [v.desc for v in val]) + elif isinstance(val, core.BlockDesc) or \ + isinstance(val, core.ProgramDesc): + self.desc.set_serialized_attr(name, val.serialize_to_string()) + else: + self.desc.set_attr(name, val) + + @property + def attr_names(self): return self.desc.attr_names() def attr(self, name): """ - Get attribute by name + Get the attribute by name. + Args: - name(str): the attribute name + name(str): the attribute name. - Returns(bool|int|str|float|list): The attribute value. The return value + Returns: + bool|int|str|float|list: The attribute value. The return value can be any valid attribute type. - """ return self.desc.attr(name) def block_attr(self, name): """ - Get the block attribute by name - Args: - name(str): the attribute name + Get the block attribute by name. - Returns(int): the block index + Args: + name(str): the attribute name. + Returns: + int: the block index. """ return self.desc.block_attr(name) def all_attrs(self): """ - Get the attribute dict - Returns(dict): The Operator's attribute dict + Get the attribute dict. + + Returns: + dict: The Operator's attribute dict. """ attr_names = self.attr_names attr_map = {} @@ -682,6 +778,35 @@ class Operator(object): class Block(object): + """ + In Fluid, a Program is consistence of multi-Block, and Block stores + VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name. + One block could have some child blocks, and child block's name scopes + should inherit the parent's so that OpDesc in child block can reference + a VarDesc that is stored in the parent block. + Please reference the framework.proto for details. + + Args: + program(Program): The Program that the Block belongs to. + idx(int): The block's id in the Program. + + Notes: + The constructor of Block should not be invoked directly. Please + use `Program.create_block()` to create a block. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + var = cur_block.create_var(name="X", + shape=[-1, 23, 48], + dtype='float32') + cur_block.append_op(type="abs", + inputs={"X": [var]}, + outputs={"Out": [var]}) + """ + def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var @@ -694,15 +819,17 @@ class Block(object): def to_string(self, throw_on_error, with_details=False): """ - To debug string. + Get debug string. + Args: throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True + when throw_on_error is True. with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True - - Returns(str): The debug string. + (e.g. trainable, optimize_attr, ...) will be printed when + with_details is True. Default False. + Returns: + str: The debug string. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -734,6 +861,15 @@ class Block(object): return self.desc.get_forward_block_idx() def set_forward_block_idx(self, idx): + """ + Set the forward block Idx. + + Args: + idx(int): the block index. + + Returns: + None + """ self.desc.set_forward_block_idx(idx) @property @@ -741,14 +877,42 @@ class Block(object): return self.desc.id def var(self, name): + """ + Get a Variable by name from this block. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: The If input's type is not str, or this block + doesn't have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ if not isinstance(name, basestring): - raise TypeError() + raise TypeError( + "var require string as parameter, but get %s instead." % + (type(name))) v = self.vars.get(name, None) if v is None: raise ValueError("var %s not in this block" % name) return v def var_recursive(self, name): + """ + Get a Variable by name from this block recursively. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: this block and this parent block doesn't + have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ frontier = list() visited = set() @@ -795,6 +959,18 @@ class Block(object): def rename_var(self, name, new_name): """ Rename variable in vars and ops' inputs and outputs + + Args: + name(str): the name that need to be renamed. + new_name(str): the name that need to rename to. + + Raises: + ValueError: If this block doesn't have this the giving name, + or the type of the var with the giving name is not Parameter + or Variable. + + Returns: + Variable: the Variable with the giving name. """ if not self.has_var(name): raise ValueError("var %s is not in current block" % name) @@ -858,12 +1034,27 @@ class Block(object): return param def append_op(self, *args, **kwargs): + """ + Appends a new Operator according to the giving arguments. + + Returns: + Operator: the append Operator. + """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.append(op) return op def insert_op(self, index, *args, **kwargs): + """ + Insert a Operator according to the giving arguments. + + Args: + index(int): the place that the operator to insert. + + Returns: + Operator: the insert Operator. + """ self.sync_with_cpp() op_desc = self.desc.insert_op(index) op = Operator(block=self, desc=op_desc, *args, **kwargs) @@ -871,11 +1062,30 @@ class Block(object): return op def remove_op(self, index): + """ + Remove the specific position operator. + + Args: + index(int): the position that the operator to insert. + + Returns: + None + """ self.sync_with_cpp() self.desc.remove_op(index, index + 1) del self.ops[index] def slice_ops(self, start, end): + """ + Return the Operator between start and end. + + Args: + start(int): the start position. + end(int): the end position. + + Returns: + list: the Operators between start and end. + """ return self.ops[start:end] def prepend_op(self, *args, **kwargs): @@ -886,9 +1096,8 @@ class Block(object): def sync_with_cpp(self): """ - Sync from the desc on the c++ end. - - This method is used to synchronize the c++ desc instance generated by backward. + Sync from the desc on the c++ end. This method is used to synchronize + the c++ desc instance generated by backward. """ # sync variables from cpp for var in self.desc.all_vars(): @@ -953,9 +1162,14 @@ class Block(object): def copy_param_info_from(self, other): """ - Copy the information of parameters from the other block + Copy the information of parameters from the other block. + Args: - other(Block): the other block + other(Block): the other block. + + Raises: + ValueError: If type of input is not Block, or the `other` and this + block is not in the same topology. Returns: None @@ -987,11 +1201,12 @@ class Block(object): def clone_variable(self, var): """ Clone a variable into current block. + Args: var: the variable to be cloned. Returns: - The new variable cloned from 'var' in current block. + Variable: the new variable cloned from 'var' in current block. """ assert isinstance(var, Variable) ret_var = None @@ -999,6 +1214,9 @@ class Block(object): if var.type == core.VarDesc.VarType.STEP_SCOPES: ret_var = self.create_var( name=var.name, persistable=var.persistable, type=var.type) + elif var.type == core.VarDesc.VarType.RAW: + ret_var = self.create_var( + name=var.name, persistable=var.persistable, type=var.type) elif var.type == core.VarDesc.VarType.SELECTED_ROWS: ret_var = self.create_var( name=var.name, @@ -1020,6 +1238,32 @@ class Block(object): class Program(object): + """ + Python Program. Beneath it is a ProgramDesc, which is used for + create c++ Program. A program is a self-contained programing + language like container. It has at least one Block, when the + control flow op like conditional_block, while_op is included, + it will contains nested block. + Please reference the framework.proto for details. + + Notes: we have default_startup_program and default_main_program + by default, a pair of them will shared the parameters. + The default_startup_program only run once to initialize parameters, + default_main_program run in every mini batch and adjust the weights. + + Returns: + A empty program. + + Examples: + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program): + >>> fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') + >>> fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') + >>> fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu") + + """ + def __init__(self): self.desc = core.ProgramDesc() self.blocks = [Block(self, 0)] @@ -1030,6 +1274,19 @@ class Program(object): @property def op_role(self): + """ + The operator role. In a enum {Forward, Backward, Optimize}. + + Notes: this is a low level API. It is used only for ParallelExecutor to + duplicate or schedule operator to devices. + + For example, the forward operator should be executed on every device. + The backward operator should be executed on every device and the + parameter gradient of backward (use :code:`op_role_var` to get this + variable) operator should be merged to one device. The optimization + operators should be executed on only one device and broadcast the + optimization result, i.e., the new parameter, to every other device. + """ return self._current_role @op_role.setter @@ -1038,6 +1295,13 @@ class Program(object): @property def op_role_var(self): + """ + The auxiliary variables for :code:`op_role` property. + + See Also: :code:`Program.op_role`'s documentation for details. + + Notes: This is a very low-level API. Users should not use it directly. + """ return self._op_role_var @op_role_var.setter @@ -1046,6 +1310,21 @@ class Program(object): @contextlib.contextmanager def optimized_guard(self, var): + """ + A with guard to set :code:`Optimization` :code:`OpRole` and + :code:`OpRoleVar` automatically. + + Notes: This is a very low level API. Users should not use it directly. + + Args: + var(Variable|str): The variable (name) to be optimized. + + Examples: + + >>> p, g = backward(...) + >>> with program.optimized_guard(p): + >>> p = p - 0.001 * g + """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize self._op_role_var = [var.name if isinstance(var, Variable) else var] @@ -1054,18 +1333,35 @@ class Program(object): self._current_role = OpRole.Forward def __str__(self): + """ + Get the protobuf debug string of this Program. + + Returns: + (str): The protobuf debug string. + + Raises: + ValueError: If any of required fields is not set. + """ return self.to_string(True) def to_string(self, throw_on_error, with_details=False): """ To debug string. + Args: - throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True - with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True + throw_on_error(bool): raise Value error when any of required fields + is not set. - Returns(str): The debug string. + with_details(bool): True if more details about variables and + parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need + to print. + + Returns + (str): The debug string. + + Raises: + ValueError: If any of required fields is not set and throw_on_error is + True. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, @@ -1081,22 +1377,93 @@ class Program(object): return res_str def get_desc(self): + """ + Get the C++ side of `ProgramDesc` object pointer. The C++ object is + exposed by :code:`pybind`. + + Notes: This is a very low level API. Users should not use this API + directly. + """ return self.desc def clone(self, for_test=False): - """Clone the Program object + """ + Create a new, duplicated program. + + + Some operators, e.g., :code:`batch_norm`, behave differently between + training and testing. They have an attribute, :code:`is_test`, to + control this behaviour. This method will change the :code:`is_test` + attribute of them to :code:`True` when :code:`for_test=True`. + + * Set for_test to False when we want to clone the program for training. + * Set for_test to True when we want to clone the program for testing. - Set for_test to False when we want to clone the program for training. - Set for_test to True when we want to clone the program for testing. + Notes: This API DOES NOT prune any operator. Use + :code:`clone(for_test=True)` before backward and optimization please. e.g. + + >>> test_program = fluid.default_main_program().clone(for_test=True) + >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + >>> optimizer.minimize() Args: - for_test(bool): Some operators, such as batch_norm and drop_out ops, - behave differently in training and testing. If for_test is True, - the is_test attributes in these operators will be set to True for - testing purposes, otherwise, they remain unchanged. + for_test(bool): True if change the :code:`is_test` attribute of + operators to :code:`True`. - Returns(Program): - The cloned Program object. + Returns: + Program: The new, duplicated Program object. + + Examples: + + 1. To clone a test program, the sample code is: + + >>> import paddle.fluid as fluid + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(train_program, startup_program): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> + >>> test_program = train_program.clone(for_test=True) + >>> + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> with fluid.program_guard(train_program, startup_program): + >>> sgd.minimize(loss) + + 2. The :code:`clone` method can be avoid if you create program for + training and program for testing individually. + + >>> import paddle.fluid as fluid + >>> + >>> def network(is_test): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> return loss + >>> + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> test_program = fluid.Program() + >>> + >>> with fluid.program_guard(train_program, startup_program): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=False) + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> sgd.minimize(loss) + >>> + >>> # the test startup program is not used. + >>> with fluid.program_guard(test_program, fluid.Program()): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=True) + + The two code snippets above will generate same programs. """ if for_test: p = self.inference_optimize() @@ -1111,6 +1478,21 @@ class Program(object): return p def prune(self, targets): + """ + Prune operators and variables which are not needed to generate + :code:`targets`. + + Notes: This is a very low level API. Users should not use this API + directly. This API is in flux and not stable. + + Args: + targets(list|Variable|Operator): A list of variables or operators + need to be pruned + + Returns: + Program: A new, pruned program. + + """ if not isinstance(targets, list): targets = [targets] targets_idx = [] @@ -1145,6 +1527,17 @@ class Program(object): return res def inference_optimize(self): + """ + This method will create a new program and change the :code:`is_test` + attribute of operators to :code:`True`. All the :code:`Parameter` + information will be lost. + + Notes: This API is a very low level API. Use + :code:`Program.clone(for_test=True)` instead. + + Returns: + Program: The new program. + """ # this is an alternative implement before # core.inference_optimize being fixed. res = Program() @@ -1161,6 +1554,18 @@ class Program(object): @staticmethod def parse_from_string(binary_str): + """ + Deserialize a program desc from protobuf binary string. + + Notes: All information about parameters will be lost after serialization + and deserialization. + + Args: + binary_str(str): The binary prootbuf string. + + Returns: + Program: A deserialized program desc. + """ p = Program() p.desc = core.ProgramDesc(binary_str) p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] @@ -1169,10 +1574,19 @@ class Program(object): @property def random_seed(self): + """ + The default random seed for random operators in Program. Zero means get + the random seed from random device. + + Notes: It must be set before the operators have been added. + """ return self._seed @property def num_blocks(self): + """ + The number of blocks in this program. + """ return self.desc.num_blocks() @random_seed.setter @@ -1185,15 +1599,40 @@ class Program(object): return str(self) def global_block(self): + """ + Get the first block of this program. + """ return self.blocks[0] def block(self, index): + """ + Get the :code:`index` block of this program + Args: + index(int): The index of block to get + + Returns: + Block: The :code:`index` block + """ return self.blocks[index] def current_block(self): + """ + Get the current block. The :code:`current` block is the block to append + operators. + """ return self.blocks[self.current_block_idx] def create_block(self, parent_idx=None): + """ + Create a new block with the :code:`parent_idx` and change the current block + to new block. + + Args: + parent_idx(int): The parent block index. + + Returns: + Block: The new block. + """ new_block_idx = len(self.blocks) parent = self.current_block() if parent_idx is None else self.block( parent_idx) @@ -1203,9 +1642,24 @@ class Program(object): return self.current_block() def rollback(self): + """ + Exit a code block, i.e., roll back to the parent block. + Returns: + None + """ self.current_block_idx = self.current_block().parent_idx def sync_with_cpp(self): + """ + Synchronize Python instance to its binding C++ object instance. + If the program is modified in C++ space, this method should be invoked. + + Notes: This is a very low level API. Users should not invoke it + directly. + + Returns: + None + """ for block_idx in range(len(self.blocks), self.desc.num_blocks()): self.blocks.append(Block(self, block_idx)) for block in self.blocks: @@ -1214,6 +1668,10 @@ class Program(object): def copy_param_info_from(self, other): """ Copy the information of parameters from other program. + + Notes: This is a very low level API. Users should not invoke it + directly. + Args: other(Program): Other program @@ -1232,6 +1690,10 @@ class Program(object): def copy_data_info_from(self, other): """ Copy the information of data variables from other program. + + Notes: This is a very low level API. Users should not invoke it + directly. + Args: other(Program): Other program @@ -1250,12 +1712,41 @@ class Program(object): self.global_block().var(var.name).is_data = True def list_vars(self): + """ + Get all variables from this Program. A iterable object is returned. + + Returns: + iterable: The generator will yield every variable in this program. + """ for each_block in self.blocks: for each_var in each_block.vars.itervalues(): yield each_var class Parameter(Variable): + """ + Parameter is derived from Variable. A parameter is a persistable + Variable, and will be updated by optimizers after each iteration. + The training of a neural network is essentially the updating of + its parameters. + + Relative to a general Variable, a Parameter has several its own + member variables: + + Args: + trainable(bool): True if the parameter need to be updated after + iterations. + optimize_attr(map): Parameter attributes related with optimizing. + Currently, it only contains 'learning_rate'. + Default: {'learning_rate': 1.0} + regularizer(WeightDecayRegularizer): The Regularizer which will + be applied on the parameter. Default: None + gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + which will be applied on the parameter. Default: None + do_model_average(bool): True if the model average strategy will + be applied on this parameter. + """ + def __init__(self, block, shape, dtype, **kwargs): if shape is None or dtype is None: raise ValueError("Parameter must set shape and dtype") @@ -1285,6 +1776,7 @@ class Parameter(Variable): def to_string(self, throw_on_error, with_details=False): """ To debug string. + Args: throw_on_error(bool): raise exception when self is not initialized when throw_on_error is True @@ -1317,8 +1809,15 @@ _startup_program_ = Program() def default_startup_program(): """ - Get default startup program. In startup program, Paddle will initialize - parameters, initialize nccl handle, etc. + Get default/global startup program. + + The layer function in :code:`fluid.layers` will create parameters, readers, + NCCL handles as global variables. The :code:`startup_program` will + initialize them by the operators in startup program. The layer function will + append these initialization operators into startup program. + + This method will return the :code:`default` or the :code:`current` startup + program. Users can use :code:`fluid.program_guard` to switch program. Returns: Program: startup program @@ -1328,7 +1827,15 @@ def default_startup_program(): def default_main_program(): """ - Get default main program. The main program is used for training or testing. + Get default/global main program. The main program is used for training or + testing. + + All layer function in :code:`fluid.layers` will append operators and + variables to the :code:`default_main_program`. + + The :code:`default_main_program` is the default program in a lot of APIs. + For example, the :code:`Executor.run()` will execute the + :code:`default_main_program` when the program is not specified. Returns: Program: main program @@ -1370,20 +1877,34 @@ def switch_startup_program(program): @contextlib.contextmanager def program_guard(main_program, startup_program=None): """ - Switch program with `with` statement + Change the global main program and startup program with `with` statement. + Layer functions in the Python `with` block will append operators and + variables to the new main programs. Examples: - >>> with program_guard(Program()): - >>> data = fluid.layers.data(...) - >>> hidden = fluid.layers.fc(...) + + >>> import paddle.fluid as fluid + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program, startup_program): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Notes: The temporary :code:`Program` can be used if the user does not need + to construct either of startup program or main program. + + Examples: + + >>> import paddle.fluid as fluid + >>> main_program = fluid.Program() + >>> # does not care about startup program. Just pass a temporary value. + >>> with fluid.program_guard(main_program, fluid.Program()): + >>> data = ... Args: - main_program(Program): New main program inside `with` statement + main_program(Program): New main program inside `with` statement. startup_program(Program): New startup program inside `with` statement. None means do not change startup program. - - Returns: - None """ if not isinstance(main_program, Program): raise TypeError("main_program should be Program") @@ -1400,11 +1921,12 @@ def program_guard(main_program, startup_program=None): def get_var(name, program=None): """ - Get a variable by name from the global block of a program + Get a variable by name from the global block of a program. + Args: name(str): name of the variable program(Program|None): program object. - If None, default_global_program() will be used. + If None, default_global_program() will be used. Returns: Variable diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 9f242cf29a56573349f192307a68e135a409a4be..a81e39695b78f235d6ae896d90117dd392692634 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -27,13 +27,30 @@ __all__ = ['Inferencer', ] class Inferencer(object): + """ + Inferencer High Level API. + + Args: + infer_func (Python func): Infer function that will return predict Variable + param_path (str): The path where the inference model is saved by fluid.io.save_params + place (Place): place to do the inference + parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU. + + Examples: + .. code-block:: python + + def inference_program(): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + return y_predict + + place = fluid.CPUPlace() + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path="/tmp/model", place=place) + + """ + def __init__(self, infer_func, param_path, place=None, parallel=False): - """ - :param infer_func: a function that will return predict Variable - :param param_path: the path where the inference model is saved by fluid.io.save_params - :param place: place to do the inference - :param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU. - """ self.param_path = param_path self.scope = core.Scope() self.parallel = parallel @@ -56,11 +73,24 @@ class Inferencer(object): else: self.exe = executor.Executor(self.place) + self.inference_program = self.inference_program.clone(for_test=True) + def infer(self, inputs, return_numpy=True): """ - :param inputs: a map of {"input_name": input_var} that will be feed into the inference program - to get the predict value - :return: the predict value of the inference model + Do Inference for Inputs + + Args: + inputs (map): a map of {"input_name": input_var} that will be feed into the inference program + return_numpy (bool): transform return value into numpy or not + + Returns: + Tensor or Numpy: the predict value of the inference model for the inputs + + Examples: + .. code-block:: python + + tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") + results = inferencer.infer({'x': tensor_x}) """ if not isinstance(inputs, dict): raise ValueError( diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 4e132ed26183eaa5e572128e679cdbffd42e5a42..373e9c060de1ee27c165ccd2380cd8c38612c4d9 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -15,28 +15,43 @@ import framework import numpy as np import contextlib +from framework import convert_np_dtype_to_dtype_ +from core import VarDesc __all__ = [ - 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', - 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', - 'NormalInitializer', 'XavierInitializer' + 'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA', + 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', + 'UniformInitializer', 'NormalInitializer', 'XavierInitializer', + 'BilinearInitializer', 'MSRAInitializer' ] _force_init_on_cpu_ = False def force_init_on_cpu(): + """ + The flag of whether force to init variables on CPU. + + Examples: + .. code-block:: python + + if force_init_on_cpu(): + pass + + """ return _force_init_on_cpu_ @contextlib.contextmanager def init_on_cpu(): """ - Switch program with `with` statement + Force the variable to be inited on CPU. Examples: - >>> with init_on_cpu(): - >>> step = layers.create_global_var() + .. code-block:: python + + with init_on_cpu(): + step = layers.create_global_var() """ global _force_init_on_cpu_ @@ -102,14 +117,18 @@ class Initializer(object): class ConstantInitializer(Initializer): """Implements the constant initializer + + Args: + value (float): constant value to initialize the variable + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Constant(value=2.0)) """ def __init__(self, value=0.0, force_cpu=False): - """Constructor for ConstantInitializer - - Args: - value: constant value to initialize the variable - """ assert value is not None super(ConstantInitializer, self).__init__() self._value = value @@ -144,16 +163,20 @@ class ConstantInitializer(Initializer): class UniformInitializer(Initializer): """Implements the random uniform distribution initializer + + Args: + low (float): lower boundary of the uniform distribution + high (float): upper boundary of the uniform distribution + seed (int): random seed + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) """ def __init__(self, low=-1.0, high=1.0, seed=0): - """Constructor for UniformInitializer - - Args: - low: lower boundary of the uniform distribution - high: upper boundary of the uniform distribution - seed: random seed - """ assert low is not None assert high is not None assert high >= low @@ -194,17 +217,21 @@ class UniformInitializer(Initializer): class NormalInitializer(Initializer): - """Implements the random Normal(Gaussian) distribution initializer + """Implements the Random Normal(Gaussian) distribution initializer + + Args: + loc (float): mean of the normal distribution + scale (float): standard deviation of the normal distribution + seed (int): random seed + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) """ def __init__(self, loc=0.0, scale=1.0, seed=0): - """Constructor for NormalInitializer - - Args: - loc: mean of the normal distribution - scale: standard deviation of the normal distribution - seed: random seed - """ assert loc is not None assert scale is not None assert seed is not None @@ -244,39 +271,49 @@ class NormalInitializer(Initializer): class XavierInitializer(Initializer): - """Implements the Xavier initializer - + """ This class implements the Xavier weight initializer from the paper - Understanding the difficulty of training deep feedforward neural - networks[1] by Xavier Glorot and Yoshua Bengio. + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio. This initializer is designed to keep the scale of the gradients approximately same in all the layers. In case of Uniform distribution, - the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)). + the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + In case of Normal distribution, the mean is 0 and the standard deviation - is sqrt(2/ (fan_in + fan_out)). + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + + + Args: + uniform (bool): whether to use uniform or normal distribution + fan_in (float): fan_in for Xavier initialization. If None, it is + inferred from the variable. + fan_out (float): fan_out for Xavier initialization. If None, it is + inferred from the variable. + seed (int): random seed + + Note: + It is recommended to set fan_in and fan_out to None for most cases. + + Examples: + .. code-block:: python + + fc = fluid.layers.fc( + input=queries, size=10, + param_attr=fluid.initializer.Xavier(uniform=False)) - References: - [1] Understanding the difficulty of training deep feedforward neural - networks. International conference on artificial intelligence and - statistics. - (http://proceedings.mlr.press/v9/glorot10a.html) """ def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): - """Constructor for XavierInitializer - - Args: - uniform: whether to use uniform or normal distribution - fan_in: fan_in for Xavier initialization. If None, it is - inferred from the variable. - fan_out: fan_out for Xavier initialization. If None, it is - inferred from the variable. - seed: random seed - - Note: It is recommended to set fan_in and fan_out to None for - most cases. - """ assert uniform is not None assert seed is not None super(XavierInitializer, self).__init__() @@ -340,30 +377,42 @@ class MSRAInitializer(Initializer): """Implements the MSRA initializer a.k.a. Kaiming Initializer This class implements the weight initialization from the paper - Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren - and Jian Sun. This is a robust initialization method that particularly - considers the rectifier nonlinearities. In case of Uniform distribution, - the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal - distribution, the mean is 0 and the standard deviation - is sqrt(2/ fan_in). - - References: - [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance - on ImageNet Classification - (https://arxiv.org/abs/1502.01852) + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. In case of Uniform distribution, the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in}} + + Args: + uniform (bool): whether to use uniform or normal distribution + fan_in (float): fan_in for MSRAInitializer. If None, it is\ + inferred from the variable. + seed (int): random seed + + Note: + It is recommended to set fan_in to None for most cases. + + Examples: + .. code-block:: python + + fc = fluid.layers.fc( + input=queries, size=10, + param_attr=fluid.initializer.MSRA(uniform=False)) """ def __init__(self, uniform=True, fan_in=None, seed=0): """Constructor for MSRAInitializer - - Args: - uniform: whether to use uniform or normal distribution - fan_in: fan_in for MSRAInitializer. If None, it is - inferred from the variable. - seed: random seed - - Note: It is recommended to set fan_in to None for most cases. """ assert uniform is not None assert seed is not None @@ -422,6 +471,104 @@ class MSRAInitializer(Initializer): return op +class BilinearInitializer(Initializer): + """ + This initializer can be used in transposed convolution operator to + act as upsampling. Users can upsample a feature map with shape of + (B, C, H, W) by any integer factor. The usage is: + + Examples: + + .. code-block:: python + + factor = 2 + w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.), + initializer=Bilinear()) + conv_up = fluid.layers.conv2d_transpose( + input, + num_filters=C, + output_size=None, + filter_size=2 * factor - factor % 2, + padding=ceil((factor - 1) / 2.), + stride=factor, + groups=C, + param_attr=w_attr, + bias_attr=False) + + Where, `num_filters=C` and `groups=C` means this is channel-wise transposed + convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`, + This initializer will set a (K, K) interpolation kernel for every channel + of the filter identically. The resulting shape of the output feature map + will be (B, C, factor * H, factor * W). Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. + + """ + + def __init__(self): + """Constructor for BilinearInitializer. + """ + super(BilinearInitializer, self).__init__() + + def __call__(self, var, block): + """Add biliear initialization ops for a variable + + Args: + var (Variable): Variable that needs to be initialized. + block (Block): The block in which initialization ops should + be added. + + Returns: + Operator: the initialization op + + Raises: + ValueError: If type of `var` and `block` is not right. + If the shape of `var` size is not 4 and + var.shape[2] != var.shape[3]. + """ + if not isinstance(var, framework.Variable): + raise ValueError("var must be framework.Variable.") + + if not isinstance(block, framework.Block): + raise ValueError("block must be framework.Block.") + + shape = var.shape + if len(shape) != 4: + raise ValueError("the length of shape must be 4.") + if shape[2] != shape[3]: + raise ValueError("shape[2] must be equal to shape[3].") + + weight = np.zeros(np.prod(var.shape), dtype='float32') + size = shape[3] + # factor + f = np.ceil(size / 2.) + # center + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(np.prod(shape)): + x = i % size + y = (i / size) % size + weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) + weight = np.reshape(weight, shape) + + if var.dtype == VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in weight.flat] + else: + raise ValueError("Unsupported dtype %s", input.dtype) + if np.prod(shape) > 1024 * 1024: + raise ValueError("The size of input is too big. ") + op = block.append_op( + type='assign_value', + outputs={'Out': [var]}, + attrs={ + 'dtype': var.dtype, + 'shape': list(shape), + value_name: values + }) + var.op = op + return op + + # We short the class name, since users will use the initializer with the package # name. The sample code: # @@ -436,3 +583,4 @@ Uniform = UniformInitializer Normal = NormalInitializer Xavier = XavierInitializer MSRA = MSRAInitializer +Bilinear = BilinearInitializer diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8e58e5eb794e1bb507ab05394a1f7b57a1d2ed42..d94564e11f982575dd9c065deb20d29396203227 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import errno import time import shutil @@ -24,25 +25,49 @@ __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', - 'clean_checkpoint' + 'clean_checkpoint', 'load_persist_vars_without_grad', + 'load_lookup_table_vars', 'save_persist_vars_without_grad', + 'get_latest_checkpoint_serial' ] def is_parameter(var): - """Check whether the variable is a Parameter. - - This function checks whether the input variable is a Parameter. + """ + Check whether the given variable is an instance of Parameter. Args: - var : The input variable. + var(Variable): The variable to be checked. Returns: - boolean result whether the variable is a Parameter. + bool: True if the given `var` is an instance of Parameter, + False if not. + + Examples: + .. code-block:: python + + param = fluid.default_main_program().global_block().var('fc.w') + res = fluid.io.is_parameter(param) """ return isinstance(var, Parameter) def is_persistable(var): + """ + Check whether the given variable is persistable. + + Args: + var(Variable): The variable to be checked. + + Returns: + bool: True if the given `var` is persistable + False if not. + + Examples: + .. code-block:: python + + param = fluid.default_main_program().global_block().var('fc.w') + res = fluid.io.is_persistable(param) + """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST: return False @@ -67,20 +92,69 @@ def save_vars(executor, predicate=None, filename=None): """ - Save variables to directory by executor. + Save variables to the given directory by executor. + + There are two ways to specify variables to be saved: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be saved. The first way has a higher priority. In other words, if `vars` + are assigned, the `main_program` and the `predicate` will be ignored. - :param executor: executor that save variable - :param dirname: directory path - :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default default_main_program. - :param predicate: The Predicate describes a callable that returns a variable - as a bool. If it returns true, the corresponding input variable will be saved. - :param vars: variables need to be saved. If vars is specified, program & predicate - will be ignored - :param filename: The name of a single file that all vars are saved to. - If it is None, save variables to separate files. + The `dirname` are used to specify the folder where to save variables. + If you prefer to save variables in separate files in the folder `dirname`, + set `filename` None; if you prefer to save all variables in a single file, + use `filename` to specify it. - :return: None + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + main_program(Program|None): The program whose variables will be saved. + If it is None, the default main program will + be used automatically. + Default: None + vars(list[Variable]|None): The list that contains all variables to save. + It has a higher priority than the `main_program`. + Default: None + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be saved. It only works when we are using the + `main_program` to specify variables (In other words + `vars` is None). + Default: None + filename(str|None): The file which to save all variables. If you prefer to save + variables separately, set it to None. + Default: None + + Returns: + None + + Raises: + TypeError: If `main_program` is not an instance of Program nor None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + + # The first usage: using `main_program` to specify variables + def name_has_fc(var): + res = "fc" in var.name + return res + + prog = fluid.default_main_program() + fluid.io.save_vars(executor=exe, dirname=path, main_program=prog, + vars=None) + # All variables in `main_program` whose name includes "fc" will be saved. + # And variables are going to be saved separately. + + + # The second usage: using `vars` to specify variables + var_list = [var_a, var_b, var_c] + fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, + filename="vars_file") + # var_a, var_b and var_c will be saved. And they are going to be + # saved in the same file named 'var_file' in the path "./my_paddle_model". """ if vars is None: if main_program is None: @@ -128,7 +202,42 @@ def save_vars(executor, def save_params(executor, dirname, main_program=None, filename=None): """ - Save all parameters to directory with executor. + This function filters out all parameters from the give `main_program` + and then save them to the folder `dirname` or the file `filename`. + + Use the `dirname` to specify the saving folder. If you would like to + save parameters in separate files, set `filename` None; if you would + like to save all parameters in a single file, use `filename` to specify + the file name. + + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` + and `load_persistables()` instead. + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The saving directory path. + main_program(Program|None): The program whose parameters will be + saved. If it is None, the default + main program will be used automatically. + Default: None + filename(str|None): The file to save all parameters. If you prefer + to save parameters in differnet files, set it + to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_params(executor=exe, dirname=param_path, + main_program=None) """ save_vars( executor, @@ -141,7 +250,37 @@ def save_params(executor, dirname, main_program=None, filename=None): def save_persistables(executor, dirname, main_program=None, filename=None): """ - Save all persistables to directory with executor. + This function filters out all variables with `persistable==True` from the + give `main_program` and then saves these variables to the folder `dirname` + or file `filename`. + + The `dirname` is used to specify the folder where persistable variables + are going to be saved. If you would like to save variables in separate + files, set `filename` None; if you would like to save all variables in a + single file, use `filename` to specify the file name. + + Args: + executor(Executor): The executor to run for saving persistable variables. + dirname(str): The directory path. + main_program(Program|None): The program whose persistbale variables will + be saved. If it is None, the default main + program will be used automatically. + Default: None + filename(str|None): The file to saved all variables. If you prefer to + save variables in differnet files, set it to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_persistables(executor=exe, dirname=param_path, + main_program=None) """ save_vars( executor, @@ -159,20 +298,69 @@ def load_vars(executor, predicate=None, filename=None): """ - Load variables from directory by executor. + Load variables from the given directory by executor. + + There are two ways to specify variables to be loaded: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be loaded. The first way has a higher priority. In other words if `vars` + are assigned, the `main_program` and the `predicate` will be ignored. + + The `dirname` are used to specify the folder where to load variables. + If variables were saved in separate files in the folder `dirname`, + set `filename` None; if all variables were saved in a single file, + use `filename` to specify it. - :param executor: executor that load variable - :param dirname: directory path - :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default default_main_program(). - :param predicate: The Predicate describes a callable that returns a variable - as a bool. If it returns true, the corresponding input variable will be loaded. - :param vars: variables need to be loaded. If vars is specified, program & - predicate will be ignored - :param filename: The name of the single file that all vars are loaded from. - If it is None, load variables from separate files. + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + main_program(Program|None): The program whose variables will be loaded. + If it is None, the default main program will + be used automatically. + Default: None + vars(list[Variable]|None): The list that contains all variables to load. + It has a higher priority than the `main_program`. + Default: None + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be loaded. It only works when we are using the + `main_program` to specify variables (In other words + `vars` is None). + Default: None + filename(str|None): The file which saved all required variables. If variables + were saved in differnet files, set it to None. + Default: None + + Returns: + None + + Raises: + TypeError: If `main_program` is not an instance of Program nor None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + + # The first usage: using `main_program` to specify variables + def name_has_fc(var): + res = "fc" in var.name + return res - :return: None + prog = fluid.default_main_program() + fluid.io.load_vars(executor=exe, dirname=path, main_program=prog, + vars=None) + # All variables in `main_program` whose name includes "fc" will be loaded. + # And all the variables are supposed to have been saved in differnet files. + + + # The second usage: using `vars` to specify variables + var_list = [var_a, var_b, var_c] + fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, + filename="vars_file") + # var_a, var_b and var_c will be loaded. And they are supposed to haven + # been saved in the same file named 'var_file' in the path "./my_paddle_model". """ if vars is None: if main_program is None: @@ -220,7 +408,42 @@ def load_vars(executor, def load_params(executor, dirname, main_program=None, filename=None): """ - load all parameters from directory by executor. + This function filters out all parameters from the give `main_program` + and then trys to load these parameters from the folder `dirname` or + the file `filename`. + + Use the `dirname` to specify the folder where parameters were saved. If + parameters were saved in separate files in the folder `dirname`, set + `filename` None; if all parameters were saved in a single file, use + `filename` to specify the file name. + + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` + and `load_persistables()` instead. + + Args: + executor(Executor): The executor to run for loading parameters. + dirname(str): The directory path. + main_program(Program|None): The program whose parameters will be + loaded. If it is None, the default + main program will be used automatically. + Default: None + filename(str|None): The file which saved all parameters. If parameters + were saved in differnet files, set it to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_params(executor=exe, dirname=param_path, + main_program=None) """ load_vars( executor, @@ -232,7 +455,37 @@ def load_params(executor, dirname, main_program=None, filename=None): def load_persistables(executor, dirname, main_program=None, filename=None): """ - load all persistables from directory by executor. + This function filters out all variables with `persistable==True` from the + give `main_program` and then trys to load these variables from the folder + `dirname` or the file `filename`. + + Use the `dirname` to specify the folder where persistable variables were + saved. If variables were saved in separate files, set `filename` None; + if all variables were saved in a single file, use `filename` to specify + the file name. + + Args: + executor(Executor): The executor to run for loading persistable variables. + dirname(str): The directory path. + main_program(Program|None): The program whose persistbale variables will + be loaded. If it is None, the default main + program will be used automatically. + Default: None + filename(str|None): The file which saved all variables. If variables were + saved in differnet files, set it to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_persistables(executor=exe, dirname=param_path, + main_program=None) """ load_vars( executor, @@ -305,22 +558,48 @@ def save_inference_model(dirname, model_filename=None, params_filename=None): """ - Build a model especially for inference, - and save it to directory by the executor. + Prune the given `main_program` to build a new program especially for inference, + and then save it and all related parameters to given `dirname` by the `executor`. - :param dirname: directory path - :param feeded_var_names: Names of variables that need to be feeded data during inference - :param target_vars: Variables from which we can get inference results. - :param executor: executor that save inference model - :param main_program: original program, which will be pruned to build the inference model. - Default default_main_program(). - :param model_filename: The name of file to save inference program. - If not specified, default filename `__model__` will be used. - :param params_filename: The name of file to save parameters. - It is used for the case that all parameters are saved in a single binary file. - If not specified, parameters are considered saved in separate files. + Args: + dirname(str): The directory path to save the inference model. + feeded_var_names(list[str]): Names of variables that need to be feeded data + during inference. + target_vars(list[Variable]): Variables from which we can get inference + results. + executor(Executor): The executor that saves the inference model. + main_program(Program|None): The original program, which will be pruned to + build the inference model. If is setted None, + the default main program will be used. + Default: None. + model_filename(str|None): The name of file to save the inference program + itself. If is setted None, a default filename + `__model__` will be used. + params_filename(str|None): The name of file to save all related parameters. + If it is setted None, parameters will be saved + in separate files . + + Returns: + None + + Raises: + ValueError: If `feed_var_names` is not a list of basestring. + ValueError: If `target_vars` is not a list of Variable. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./infer_model" + fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], + target_vars=[predict_var], executor=exe) + + # In this exsample, the function will prune the default main program + # to make it suitable for infering the `predict_var`. The pruned + # inference program is going to be saved in the "./infer_model/__model__" + # and parameters are going to be saved in separate files under folder + # "./infer_model". - :return: None """ if isinstance(feeded_var_names, basestring): feeded_var_names = [feeded_var_names] @@ -381,18 +660,49 @@ def load_inference_model(dirname, """ Load inference model from a directory - :param dirname: directory path - :param executor: executor that load inference model - :param model_filename: The name of file to load inference program. - If not specified, default filename `__model__` will be used. - :param params_filename: The name of file to load parameters. - It is used for the case that all parameters are saved in a single binary file. - If not specified, parameters are considered saved in separate files. + Args: + dirname(str): The directory path + executor(Executor): The executor to run for loading inference model. + model_filename(str|None): The name of file to load inference program. + If it is None, the default filename + '__model__' will be used. + Default: None + params_filename(str|None): The name of file to load all parameters. + It is only used for the case that all + parameters were saved in a single binary + file. If parameters were saved in separate + files, set it as 'None'. + + Returns: + tuple: The return of this function is a tuple with three elements: + (program, feed_target_names, fetch_targets). The `program` is a + Program, it's the program for inference. The `feed_target_names` is + a list of str, it contains Names of variables that need to feed + data in the inference program. The `fetch_targets` is a list of + Variable. It contains variables from which we can get inference + results. + + Raises: + ValueError: If `dirname` is not a existing directory. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./infer_model" + [inference_program, feed_target_names, fetch_targets] = + fluid.io.load_inference_model(dirname=path, executor=exe) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + # In this exsample, the inference program was saved in the + # "./infer_model/__model__" and parameters were saved in + # separate files in ""./infer_model". + # After getting inference program, feed target names and + # fetch targets, we can use an Executor to run the inference + # program to get the inference result. - :return: [program, feed_target_names, fetch_targets] - program: program especially for inference. - feed_target_names: Names of variables that need to feed data - fetch_targets: Variables from which we can get inference results. """ if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) @@ -423,12 +733,25 @@ def load_inference_model(dirname, def get_parameter_value(para, executor): """ - Get the LoDTensor for the parameter + Get the LoDTensor value of the given parameter. + + Args: + para(Parameter): The parameter to get value from. + executor(Executor): The executor to run for retrieving the value. + + Returns: + numpy.array: The given parameter's values. + + Raises: + AssertionError: If the `para` is not an instance of Parameter. + + Examples: + .. code-block:: python - :param executor: executor for retrieving the value - :param para: the given parameter + exe = fluid.Executor(fluid.CPUPlace()) + param = fluid.default_main_program().global_block().var('fc.w') + p = fluid.io.get_parameter_value(param, exe) - :return: the LoDTensor for the parameter """ assert is_parameter(para) @@ -440,14 +763,30 @@ def get_parameter_value(para, executor): def get_parameter_value_by_name(name, executor, program=None): """ - Get the LoDTensor for paramter with the given name + Get the LoDTensor value of a certain parameter by its name. - :param executor: executor for retrieving the value - :param name: the name of the parameter - :param program: the program where the variable is found - Default default_main_program(). + Args: + name(str): The parameter's name. + executor(Executor): The executor to run for retrieving the value. + program(Program | None): The program where to find the parameter. + If it's set to be None, the function will + try to find the parameter in the default + main program. + + Returns: + numpy.array: The parameter's values. + + Raises: + TypeError: If given `name` is not an instance of basestring. + TypeError: If the parameter with the given name doesn't exist. + AssertionError: If there is a varibale named `name` in the + given program but it is not a Parameter. - :return: the LoDTensor for the variable + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + p = fluid.io.get_parameter_value('fc.w', exe) """ if program is None: program = default_main_program() @@ -457,95 +796,434 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +LOOKUP_TABLE_DIR = "__lookup_table__" +TRAINER_PREFIX = "trainer" CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, - checkpoint_dir=None, + checkpoint_dir, + trainer_id, + trainer_args=None, + main_program=None, max_num_checkpoints=3, - save_interval_secs=600, - main_program=None): + lookup_table=None, + ps_endpoint_list=None): """ - Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, - the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy - to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, - The interval between two saved checkpoints must greater than save_interval_secs. + This function filters out all checkpoint variables from the give + main_program and then saves these variables to the `checkpoint_dir` + directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there might be a lot of checkpoints in the + `checkpoint_dir`. To avoid them taking too much disk space, the + `max_num_checkpoints` are introduced to limit the total number of + checkpoints. If the number of existing checkpints is greater than + the `max_num_checkpoints`, oldest ones will be scroll deleted. + + A variable is a checkpoint variable and will be saved if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for save checkpoint. + checkpoint_dir(str): The folder where to save checkpoints. + trainer_id(int): currect trainer id, if id is equal to 0, the trainer + is chief. + trainer_args(dict|None): Current training arguments. Such as 'epoch_id' + and 'step_id'. + Defaut: None + main_program(Program|None): The program whose checkpoint variables will + be saved. If it is None, the default main program will be used. + max_num_checkpoints(int): The max number of total number of existing + checkpoints. + Default: 3 + lookup_table(string|None): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list|None): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. - :param executor - :param checkpoint_dir - :param max_num_checkpoints - :param save_interval_secs - :param main_program + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + AssertionError: If `trainer_args` is not a dict. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + fluid.io.save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3, + lookup_table=table_name, + ps_endpoint_list = ps_endpoints) """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("'checkpoint_dir' should not be None") + assert checkpoint_dir - if not os.path.isdir(checkpoint_dir): - os.makedirs(checkpoint_dir) + if trainer_args: + assert isinstance(trainer_args, dict) - serial = _get_lastest_checkpoint_dir(checkpoint_dir) - if serial >= 0 and not _interval_secs_exceed( - _get_serial_dir(serial, checkpoint_dir), save_interval_secs): - return + is_chief = trainer_id == 0 - serial += 1 - cur_dir = _get_serial_dir(serial, checkpoint_dir) + _make_chekcpoint_dirs(checkpoint_dir) + serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) - save_vars( + save_trainer_args(cur_dir, trainer_id, trainer_args) + + if is_chief: + save_persist_vars_without_grad(executor, cur_dir, main_program) + + if is_chief and lookup_table and ps_endpoint_list: + save_pserver_vars_by_notify(executor, cur_dir, lookup_table, + ps_endpoint_list) + + _scroll_delete(checkpoint_dir, max_num_checkpoints) + + +def load_checkpoint(executor, checkpoint_dir, serial, main_program): + """ + This function filters out all checkpoint variables from the give + main_program and then try to load these variables from the + `checkpoint_dir` directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there are more than one checkpoint in the + `checkpoint_dir` (each checkpoint has its own sub folder), use + `serial` to specify which serial of checkpoint you would like to + load. + + A variable is a checkpoint variable and will be loaded if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading checkpoint. + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + main_program(Program): The program whose checkpoint variables will + be loaded. + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + ValueError: If `serial` is None or `serial` is less than 0. + ValueError: If `main_program` is None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + + # In this example, `load_checkpoint` function + # will first filters out all checkpoint variables in the default + # main program, and then try to load these variables form the + # folder "./checkpoints/checkpoint_9/__model__". + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + if serial is None or serial < 0: + raise ValueError("'serial' should not be None or <0 ") + + if main_program is None: + raise ValueError('main_program should not be None.') + + cur_dir = _get_serial_dir(checkpoint_dir, serial) + load_persist_vars_without_grad(executor, cur_dir, main_program, True) + + +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. + + : param checkpoint_dir + : param delete_dir + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + +def load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): + """ + This function filters out all checkpoint variables from the give + program and then trys to load these variables from the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be loaded. + has_model_dir(bool): if True, the function loads variables + from a sub directory named '__model__'. + Default: False + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog, has_model_dir=True) + + # In this example, `load_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then trys to load these variables form the + # folder "./my_paddle_model/__model__". + """ + + if has_model_dir: + dirname = _get_model_dir(dirname) + + load_vars( executor, - dirname=cur_dir, - main_program=main_program, - vars=None, + dirname=dirname, + main_program=program, predicate=_is_checkpoint_var, filename=None) - _write_success(cur_dir) - _lru_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, checkpoint_dir=None, main_program=None): +def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): """ - Load checkpoint from a directory by executor, - it will find the most recent saved checkpoint file and load it auto. + The parameter server will load lookup table's local file in + selectedrows variable. - :param executor - :param checkpoint_dir - :param main_program + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + dirname = "./checkpoints/checkpoint_9/__model__" + prog = fluid.default_main_program() + pserver_id = 1 + table_name = "share_w" + fluid.io.load_lookup_table_vars(executor=exe, + dirname=dirname, program=prog, pserver_id=pserver_id, + table_name=table_name) """ - if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + for var in program.list_vars(): + if var.name == table_name: + lookup_table_var = var + break - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + assert lookup_table_var is not None - if serial < 0: - return + lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) - cur_dir = _get_serial_dir(serial, checkpoint_dir) + load_prog = Program() + load_block = load_prog.global_block() - load_vars( + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [lookup_table_var]}, + attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + + executor.run(load_prog) + + +def save_persist_vars_without_grad(executor, dirname, program): + """ + This function filters out all checkpoint variables from the give + program and then save these variables to a sub-folder '__model__' of + the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be saved. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog) + + # In this example, `save_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then saves these variables to the folder + # "./my_paddle_model/__model__". + """ + cur_dir = _get_model_dir(dirname) + save_vars( executor, dirname=cur_dir, - main_program=main_program, + main_program=program, + vars=None, predicate=_is_checkpoint_var, filename=None) + _write_success(cur_dir) -def clean_checkpoint(checkpoint_dir, delete_dir=False): +def save_pserver_vars_by_notify(executor, dirname, lookup_table, + ps_endpoint_list): """ - clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. - delete_dir only works when the directory is empty, otherwise, OSError is raised. + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save checkpoints. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + fluid.io.save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + ps_endpoint_list=ps_endpoints) """ - if checkpoint_dir is None: - checkpoint_dir = os.getcwd() - _lru_delete(checkpoint_dir, max_num_checkpoints=0) + cur_dir = _get_lookuptable_dir(dirname) - if delete_dir and not os.listdir(checkpoint_dir): - os.rmdir(checkpoint_dir) + checkpoint_notify_program = Program() + checkpoint_notify_block = checkpoint_notify_program.global_block() + attrs = {} + attrs['epmap'] = ps_endpoint_list + attrs['dir'] = cur_dir + attrs['lookup_table'] = lookup_table -def _get_serial_dir(serial, checkpoint_dir): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - return os.path.join(checkpoint_dir, serial_folder) + checkpoint_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(checkpoint_notify_program) + + +def save_trainer_args(dirname, trainer_id, trainer_args): + assert isinstance(trainer_args, dict) + + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in trainer_args.iteritems(): + args_file = os.path.join(cur_dir, name) + with open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + """ + trainer will load some args from it's independent directory, + such as epoch_id and step_id. + + Args: + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + trainer_id(int): current trainer id. + trainer_args(list): list about load trainer args + Return: + None + + Examples: + .. code-block:: python + + param_path = "./checkpoint/" + serial = 7 + trainer_id = 2 + trainer_args = ["epoch_id", "step_id"] + + fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial, + trainer_id=trainer_id, trainer_args=trainer_args) + """ + assert isinstance(trainer_args, list) + + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values def _is_checkpoint_var(var): @@ -553,50 +1231,107 @@ def _is_checkpoint_var(var): the checkpoint will not save or load all the variables. var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - :param var + : param var(Variable) """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False - if var.name.endswith("@GRAD"): + # .block is named for distribute train variables, checkpoint will not save it. + if ".block" in var.name: return False return var.persistable -def _interval_secs_exceed(dirname, save_interval_secs): - dir_time = os.path.getmtime(dirname) - if save_interval_secs > (time.time() - dir_time): - return False - return True +def _make_chekcpoint_dirs(dirs): + """ + _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. + """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + serial_dir = os.path.join(dirname, serial_folder) + _make_chekcpoint_dirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir + + +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + _make_chekcpoint_dirs(trainer_dir) + return trainer_dir -def _lru_delete(dirname, max_num_checkpoints=3): + +def _scroll_delete(dirname, max_num_checkpoints=3): dirs = os.listdir(dirname) - serials = [] + serial_map = {} for serial in dirs: - try: - serials.append(int(serial)) - except ValueError: - continue + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial - if len(serials) <= max_num_checkpoints: + if len(serial_map.keys()) <= max_num_checkpoints: return + serials = serial_map.keys() serials.sort(reverse=True) serials = serials[max_num_checkpoints:] for serial in serials: - cur_dir = os.path.join(dirname, str(serial)) - shutil.rmtree(cur_dir) + cur_dir = _get_serial_dir(dirname, serial) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err def _write_success(dirname): """ write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. - :param dirname + : param dirname """ success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) with open(success_file, 'a') as f: @@ -604,33 +1339,30 @@ def _write_success(dirname): f.write(now) -def _get_lastest_checkpoint_dir(checkpoint_dir): +def get_latest_checkpoint_serial(checkpoint_dir): """ get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory - :param checkpoint_dir + : param checkpoint_dir """ - if not checkpoint_dir.strip(): + if not checkpoint_dir: return -1 def has_success(checkpoint_dir, cur_dir): """ is _SUCCESS in this dir """ - _, serial = cur_dir.split(CHECKPOINT_SEPARATOR) - - try: - int(serial) - except ValueError: - return -1 - if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + serial = _get_dir_serial(cur_dir) + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): return -1 success_path = os.path.join( - _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): - return int(serial) + return serial if not os.path.isdir(checkpoint_dir): return -1 diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index a568f61dcb2da976baa7847ae26281a34d6f88dd..cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -28,8 +28,8 @@ import math_op_patch from math_op_patch import * import detection from detection import * -import metric -from metric import * +import metric_op +from metric_op import * from learning_rate_scheduler import * __all__ = [] @@ -41,5 +41,5 @@ __all__ += control_flow.__all__ __all__ += ops.__all__ __all__ += device.__all__ __all__ += detection.__all__ -__all__ += metric.__all__ +__all__ += metric_op.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d1ea9f148566d20988a43f4c9d421c4452697ef1..849474dc58461ac3772f439da7bf5d57592daa8c 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -13,20 +13,20 @@ # limitations under the License. import contextlib -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from tensor import assign, fill_constant from .. import core from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu from ops import logical_and, logical_not, logical_or +import numpy __all__ = [ 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'BlockGuardWithCompletion', - 'StaticRNNMemoryLink', 'WhileGuard', 'While', 'Switch', @@ -55,34 +55,36 @@ __all__ = [ def split_lod_tensor(input, mask, level=0): """ - **split_lod_tensor** - This function takes in an input that contains the complete lod information, and takes in a mask which is used to mask certain parts of the input. The output is the true branch and the false branch with the mask applied to - the input at a certain level in the tensor. + the input at a certain level in the tensor. Mainly used in IfElse to split + data into two parts. Args: input(tuple|list|None): The input tensor that contains complete lod information needed to construct the output. mask(list): A bool column vector which masks the input. - level(int): The specific lod level to rank. + level(int): The specific lod level to split. Returns: - Variable: The true branch of tensor as per the mask applied to input. - Variable: The false branch of tensor as per the mask applied to input. + tuple(Variable, Variable): + The true branch of tensor as per the mask applied to input. + + The false branch of tensor as per the mask applied to input. Examples: .. code-block:: python - x = layers.data(name='x', shape=[1]) + x = fluid.layers.data(name='x', shape=[1]) x.persistable = True - y = layers.data(name='y', shape=[1]) + y = fluid.layers.data(name='y', shape=[1]) y.persistable = True - out_true, out_false = layers.split_lod_tensor( + out_true, out_false = fluid.layers.split_lod_tensor( input=x, mask=y, level=level) + """ helper = LayerHelper('split_lod_tensor', **locals()) out_true = helper.create_tmp_variable(dtype=input.dtype) @@ -105,8 +107,9 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): This function takes in an input :math:`x`, the True branch, the False branch and a binary :math:`mask`. Using this information, this function - merges the True and False branches of the tensor into a single Output - at a certain lod level indiacted by :math:`level`. + merges the True and False branches of the tensor into a single tensor as + output at a certain lod level indicated by :math:`level`. Used in IfElse + to merge the output if True block and False Block. Args: in_true(tuple|list|None): The True branch to be merged. @@ -114,7 +117,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): x(tuple|list|None): The input tensor that contains complete lod information needed to construct the output. mask(list): A bool column vector which masks the input. - level(int): The specific lod level to rank. + level(int): The specific lod level to merge. Returns: Variable: The merged output tensor. @@ -182,12 +185,14 @@ def Print(input, Returns: Variable: Output tensor, same data with input tensor. + Examples: + .. code-block:: python - value = some_layer(...) - Print(value, summarize=10, - message="The content of some_layer: ") + value = some_layer(...) + Print(value, summarize=10, + message="The content of some_layer: ") ''' helper = LayerHelper('print', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -233,9 +238,56 @@ class BlockGuard(object): class ParallelDo(object): """ - ParallelDo class. + ParallelDo is used to represent multi-thread data parallel processing. + + Its vanilla implementation can be shown as the following (:math:`|` means + single thread and :math:`||||` means multiple threads) + + .. code-block:: text + + In the forward pass + | Split input onto different devices + | Copy parameter onto different devices + |||| Compute forward pass in parallel + | Merge output from different devices - ParallelDo class is used to create a ParallelDo. + In the backward pass + | Split output@grad onto different devices + |||| Compute backward pass in parallel + | accumulate param@grad from different devices to the first device + | Merge input@grad from different devices + | Copy param@grad to the place of parallel_do_op + + Examples: + + .. code-block:: python + + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # ParallelDo version & Single-thread version + if thread_num > 1: + places = fluid.layers.get_places(thread_num) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + images = pd.read_input(images) + label = pd.read_input(label) + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + + avg_cost = fluid.layers.mean(x=cost) + pd.write_output(avg_cost) + + avg_cost = pd() + avg_cost = fluid.layers.mean(avg_cost) + else: + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + .. warning:: + + It will be soon deprecated, please use ParallelExecutor instead. """ def __init__(self, places, use_nccl=False, name=None): @@ -362,16 +414,17 @@ class StaticRNNMemoryLink(object): """ StaticRNNMemoryLink class. - Args: - init: the initial variable for Memory - init: Variable - pre_mem: the memory variable in previous time step - pre_mem: Variable - mem: the memory variable in current time step - mem: Variable - StaticRNNMemoryLink class is used to create a link between two memory cells of a StaticRNN. + + + NOTE: This is a internal data structure of a very low-level API. + Please use StaticRNN instead. + + Args: + init(Variable): the initial variable for Memory. + pre_mem(Variable): the memory variable in previous time step. + mem(Variable): the memory variable in current time step. """ def __init__(self, init, pre_mem, mem=None): @@ -606,6 +659,29 @@ class WhileGuard(BlockGuard): class While(object): + """ + while loop control flow. + + Args: + cond (Variable): condition used to compare. + name (str): The name of this layer. + + Examples: + .. code-block:: python + + d0 = layers.data("d0", shape=[10], dtype='float32') + data_array = layers.array_write(x=d0, i=i) + array_len = layers.fill_constant(shape=[1],dtype='int64', value=3) + + cond = layers.less_than(x=i, y=array_len) + while_op = layers.While(cond=cond) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=d) + layers.less_than(x=i, y=array_len, cond=cond) + """ + BEFORE_WHILE_BLOCK = 0 IN_WHILE_BLOCK = 1 AFTER_WHILE_BLOCK = 2 @@ -675,8 +751,8 @@ def lod_rank_table(x, level=0): .. code-block:: text x is a LoDTensor: - x.lod = [[0, 2, 3], - [0, 5, 6, 7]] + x.lod = [[2, 1], + [5, 1, 1]] x.data = [a, b, c, d, e, f, g] 1. set level to 0: @@ -706,7 +782,7 @@ def lod_rank_table(x, level=0): .. code-block:: python x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) + dtype='float32', lod_level=1) out = layers.lod_rank_table(x=x, level=0) """ helper = LayerHelper("lod_rank_table", **locals()) @@ -721,26 +797,22 @@ def lod_rank_table(x, level=0): return table +@templatedoc() def max_sequence_len(rank_table): - """Max Sequence Len Operator. Given a LoDRankTable object, this layer - returns the max length of a batch of sequences. In fact, a LoDRankTable - object contains a list of tuples() and - the list is already sorted by sequence length in descending order, so the - operator just returns the sequence length of the first tuple element. + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32', + >>> lod_level=1) + >>> rank_table = layers.lod_rank_table(x=x, level=0) + >>> max_seq_len = layers.max_sequence_len(rank_table) Args: - rank_table (Variable): Input variable which is a LoDRankTable object. + rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - Variable: The max length of sequence. - - Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) - rank_table = layers.lod_rank_table(x=x, level=0) - max_seq_len = layers.max_sequence_len(rank_table) + ${out_comment}. """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") @@ -752,17 +824,25 @@ def max_sequence_len(rank_table): def lod_tensor_to_array(x, table): - """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY. + """ + Convert a LoDTensor to a LoDTensorArray. + + This function split a LoDTesnor to a LoDTensorArray according to its LoD + information. LoDTensorArray is an alias of C++ std::vector in + PaddlePaddle. The generated LoDTensorArray of this function can be further read + or written by `read_from_array()` and `write_to_array()` operators. However, + this function is generally an internal component of PaddlePaddle `DynamicRNN`. + Users should not use it directly. Args: - x (Variable|list): The LOD tensor to be converted to a LOD tensor array. + x (Variable|list): The LoDTensor to be converted to a LoDTensorArray. table (ParamAttr|list): The variable that stores the level of lod which is ordered by sequence length in - descending order. + descending order. It is generally generated + by `layers.lod_rank_table()` API. Returns: - Variable: The variable of type array that has been converted from a - tensor. + Variable: The LoDTensorArray that has been converted from the input tensor. Examples: .. code-block:: python @@ -827,8 +907,7 @@ def increment(x, value=1.0, in_place=True): in_place (bool): If the increment should be performed in-place. Returns: - Variable: The tensor variable storing the transformation of - element-wise increment of each value in the input. + Variable: The elementwise-incremented object. Examples: .. code-block:: python @@ -870,7 +949,7 @@ def array_write(x, i, array=None): Variable: The output LOD_TENSOR_ARRAY where the input tensor is written. Examples: - .. code-block::python + .. code-block:: python tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -891,14 +970,17 @@ def array_write(x, i, array=None): def create_array(dtype): - """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the - LayerHelper. + """ + **Create LoDTensorArray** + + This function creates an array of LOD_TENSOR_ARRAY . It is mainly used to + implement RNN with array_write, array_read and While. Args: - dtype (int|float): The data type of the elements in the array. + dtype (int|float): The data type of the elements in the lod_tensor_array. Returns: - Variable: The tensor variable storing the elements of data type. + Variable: The lod_tensor_array variable storing the elements of data type. Examples: .. code-block:: python @@ -913,37 +995,40 @@ def create_array(dtype): dtype=dtype) -def less_than(x, y, force_cpu=True, cond=None, **ignored): +@templatedoc() +def less_than(x, y, force_cpu=None, cond=None, **ignored): """ - **Less than** + ${comment} - This layer returns the truth value of :math:`x < y` elementwise. + >>> import paddle.fluid as fluid + >>> less = fluid.layers.less_than(x=label, y=limit) Args: - x(Variable): First operand of *less_than* - y(Variable): Second operand of *less_than* - force_cpu(Bool|True): The output data will be on CPU if set true. + x(${x_type}): ${x_comment}. + y(${y_type}): ${y_comment}. + force_cpu(${force_cpu_type}): ${force_cpu_comment}. cond(Variable|None): Optional output variable to store the result of *less_than* Returns: - Variable: The tensor variable storing the output of *less_than*. - - Examples: - .. code-block:: python - - less = fluid.layers.less_than(x=label, y=limit) + ${out_comment}. """ helper = LayerHelper("less_than", **locals()) if cond is None: cond = helper.create_tmp_variable(dtype='bool') cond.stop_gradient = True + attrs = dict() + if force_cpu is not None: + attrs['force_cpu'] = force_cpu + elif force_init_on_cpu(): + attrs['force_cpu'] = force_init_on_cpu() + helper.append_op( type='less_than', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [cond]}, - attrs={'force_cpu': force_cpu or force_init_on_cpu()}) + attrs=attrs) return cond @@ -978,16 +1063,34 @@ def equal(x, y, cond=None, **ignored): def array_read(array, i): - """This function performs the operation to read the data in as an + """ + This function performs the operation to read the data in as an LOD_TENSOR_ARRAY. + + .. code-block:: text + + Given: + + array = [0.6, 0.1, 0.3, 0.1] + + And: + + i = 2 + + Then: + + output = 0.3 + Args: - array (Variable|list): The input tensor that will be written to an array. - i (Variable|list): The subscript index in tensor array, that points the - place where data will be written to. + array (Variable|list): The input tensor that store data to be read. + i (Variable|list): The index of the data to be read from input array. + Returns: Variable: The tensor type variable that has the data written to it. + Examples: - .. code-block::python + .. code-block:: python + tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) arr = layers.array_read(tmp, i=i) @@ -1008,8 +1111,28 @@ def array_read(array, i): def shrink_memory(x, i, table): """ - This function creates an operator to shrink_rnn_memory using the RankTable + This function creates an operator to shrink rnn memory using the RankTable as mentioned in the input parameter. + + NOTE: This API is very low-level API. It is used by DynamicRNN only. + + Since the Dynamic RNN uses no-padding way to implement RNN. The sequence + will be sorted by order, and the length of valid memory will be shrink after + each time step. + + Args: + x(Variable): The memory object in the previous time step. + i(Variable): The step count variable. A int scalar as LoDTensor. + table(Variable): The RNNRankTable object. + + Returns: + the memory variable after shrink. + + Examples: + + Since this API is very low level API. The example is not provided. + Please reference the implementation of class DynamicRNN for detail + usage. """ helper = LayerHelper('shrink_memory', **locals()) out = helper.create_tmp_variable(dtype=x.dtype) @@ -1024,9 +1147,14 @@ def shrink_memory(x, i, table): def array_length(array): - """This function performs the operation to find the length of the input + """ + **Get the Length of Input LoDTensorArray** + + This function performs the operation to find the length of the input LOD_TENSOR_ARRAY. + Related API: array_read, array_write, While. + Args: array (LOD_TENSOR_ARRAY): The input array that will be used to compute the length. @@ -1035,12 +1163,13 @@ def array_length(array): Variable: The length of the input LoDTensorArray. Examples: - .. code-block::python + .. code-block:: python tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) arr = fluid.layers.array_write(tmp, i=i) arr_len = fluid.layers.array_length(arr) + """ helper = LayerHelper('array_length', **locals()) tmp = helper.create_tmp_variable(dtype='int64') @@ -1051,6 +1180,13 @@ def array_length(array): class ConditionalBlockGuard(BlockGuard): + """ + ConditionalBlockGuard is derived from BlockGuard. It is dedicated for + holding a ConditionalBlock, and helping users entering and exiting the + ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard + is generally an internal component of IfElse, users should not use it directly. + """ + def __init__(self, block): if not isinstance(block, ConditionalBlock): raise TypeError("block should be conditional block") @@ -1067,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard): class ConditionalBlock(object): + ''' + **ConditionalBlock** + + ConditionalBlock is an operator that bind a block to a specific condition, + if the condition matches, the corresponding block will be executed. + + Args: + inputs (Variable): bool conditions. + is_scalar_condition (bool): whether the branch is controled by a scalar. + name(str): name of this ConditionalBlock. + + Examples: + .. code-block:: python + + cond = layers.less_than(x=label, y=limit) + true_image, false_image = layers.split_lod_tensor( + input=image, mask=cond) + true_cond = layers.ConditionalBlock([true_image]) + + with true_cond.block(): + ... + with false_cond.block(): + ... + ''' + def __init__(self, inputs, is_scalar_condition=False, name=None): for each_input in inputs: if not isinstance(each_input, Variable): @@ -1124,6 +1285,42 @@ class ConditionalBlock(object): class Switch(object): + """ + Switch class works just like a `if-elif-else`. Can be used in learning rate scheduler + to modify learning rate + + The Semantics: + + 1. A `switch` control-flow checks cases one-by-one. + + 2. The condition of each case is a boolean value, which is a scalar Variable. + + 3. It runs the first matched case, or the default case if there is one. + + 4. Once it matches a case, it runs the corresponding branch and only that branch. + + Examples: + .. code-block:: python + + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + one_var = tensor.fill_constant( + shape=[1], dtype='float32', value=1.0) + two_var = tensor.fill_constant( + shape=[1], dtype='float32', value=2.0) + + with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + fluid.layers.tensor.assign(input=one_var, output=lr) + with switch.default(): + fluid.layers.tensor.assign(input=two_var, output=lr) + + """ + def __init__(self, name=None): self.helper = LayerHelper('switch', name=name) self.inside_scope = False @@ -1153,7 +1350,8 @@ class Switch(object): return ConditionalBlockGuard(cond_block) def default(self): - """create a default case for this switch + """ + create a default case for this switch """ pre_cond_num = len(self.pre_not_conditions) if pre_cond_num == 0: @@ -1213,6 +1411,34 @@ class IfElseBlockGuard(object): class IfElse(object): + """ + if-else control flow. + + Args: + cond (Variable): condition used to compare. + name (str, default None): The name of this layer. + + Examples: + .. code-block:: python + + limit = fluid.layers.fill_constant_batch_size_like( + input=label, dtype='int64', shape=[1], value=5.0) + cond = fluid.layers.less_than(x=label, y=limit) + ie = fluid.layers.IfElse(cond) + with ie.true_block(): + true_image = ie.input(image) + hidden = fluid.layers.fc(input=true_image, size=100, act='tanh') + prob = fluid.layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + + with ie.false_block(): + false_image = ie.input(image) + hidden = fluid.layers.fc( + input=false_image, size=200, act='tanh') + prob = fluid.layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + prob = ie() + """ OUT_IF_ELSE_BLOCKS = 0 IN_IF_ELSE_TRUE_BLOCKS = 1 IN_IF_ELSE_FALSE_BLOCKS = 2 @@ -1315,6 +1541,38 @@ class IfElse(object): class DynamicRNN(object): + """ + The dynamic RNN can process a batch of sequence data. The length of each + sample sequence can be different. This API automatically process them in + batch. + + The input lod must be set. Please reference `lod_tensor` + + >>> import paddle.fluid as fluid + >>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1) + >>> embedding = fluid.layers.embedding(input=data, size=[65535, 32], + >>> is_sparse=True) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(embedding) + >>> prev = drnn.memory(shape=[200]) + >>> hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu') + >>> drnn.update_memory(prev, hidden) # set prev to hidden + >>> drnn.output(hidden) + >>> + >>> # last is the last time step of rnn. It is the encoding result. + >>> last = fluid.layers.sequence_last_step(drnn()) + + The dynamic RNN will unfold sequence into timesteps. Users need to define + how to process each time step during the :code:`with` block. + + The `memory` is used staging data cross time step. The initial value of + memory can be zero or another variable. + + The dynamic RNN can mark multiple variables as its output. Use `drnn()` to + get the output sequence. + """ BEFORE_RNN = 0 IN_RNN = 1 AFTER_RNN = 2 @@ -1337,6 +1595,15 @@ class DynamicRNN(object): self.mem_link = [] def step_input(self, x): + """ + Mark a sequence as a dynamic RNN input. + Args: + x(Variable): The input sequence. + + Returns: + The current timestep in the input sequence. + + """ self._assert_in_rnn_block_("step_input") if not isinstance(x, Variable): raise TypeError( @@ -1380,6 +1647,15 @@ class DynamicRNN(object): return array_read(array=input_array, i=self.step_idx) def static_input(self, x): + """ + Mark a variable as a RNN input. The input will not be scattered into + time steps. + Args: + x(Variable): The input variable. + + Returns: + The input variable that can access in RNN. + """ self._assert_in_rnn_block_("static_input") if not isinstance(x, Variable): raise TypeError( @@ -1401,6 +1677,10 @@ class DynamicRNN(object): @contextlib.contextmanager def block(self): + """ + The block for user to define operators in RNN. See the class docstring + for more details. + """ if self.status != DynamicRNN.BEFORE_RNN: raise ValueError("rnn.block() can only be invoke once") self.step_idx = fill_constant( @@ -1427,6 +1707,9 @@ class DynamicRNN(object): x=each_array, table=self.lod_rank_table)) def __call__(self, *args, **kwargs): + """ + Get the output of RNN. This API should only be invoked after RNN.block() + """ if self.status != DynamicRNN.AFTER_RNN: raise ValueError(("Output of the dynamic RNN can only be visited " "outside the rnn block.")) @@ -1441,6 +1724,70 @@ class DynamicRNN(object): value=0.0, need_reorder=False, dtype='float32'): + """ + Create a memory variable for dynamic rnn. + + If the :code:`init` is not None, :code:`memory` will be initialized by + this variable. The :code:`need_reorder` is used to reorder the memory as + the input variable. It should be set to true when the initialized memory + depends on the input sample. + + For example, + + >>> import paddle.fluid as fluid + >>> sentence = fluid.layers.data( + >>> name='sentence', dtype='float32', shape=[32]) + >>> boot_memory = fluid.layers.data( + >>> name='boot', dtype='float32', shape=[10]) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(sentence) + >>> memory = drnn.memory(init=boot_memory, need_reorder=True) + >>> hidden = fluid.layers.fc( + >>> input=[word, memory], size=10, act='tanh') + >>> drnn.update_memory(ex_mem=memory, new_mem=hidden) + >>> drnn.output(hidden) + >>> rnn_output = drnn() + + + Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the + :code:`memory` will be initialized by this :code:`value`. + + For example, + + >>> import paddle.fluid as fluid + >>> sentence = fluid.layers.data( + >>> name='sentence', dtype='float32', shape=[32]) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(sentence) + >>> memory = drnn.memory(shape=[10], dtype='float32', value=0) + >>> hidden = fluid.layers.fc( + >>> input=[word, memory], size=10, act='tanh') + >>> drnn.update_memory(ex_mem=memory, new_mem=hidden) + >>> drnn.output(hidden) + >>> rnn_output = drnn() + + + Args: + init(Variable|None): The initialized variable. + + shape(list|tuple): The memory shape. NOTE the shape does not contain + batch_size. + + value(float): the initalized value. + + need_reorder(bool): True if the initialized memory depends on the + input sample. + + dtype(str|numpy.dtype): The data type of the initialized memory. + + Returns: + the memory variable. + + """ self._assert_in_rnn_block_('memory') if init is not None: if not isinstance(init, Variable): @@ -1508,6 +1855,16 @@ class DynamicRNN(object): return self.memory(init=init) def update_memory(self, ex_mem, new_mem): + """ + Update the memory from ex_mem to new_mem. NOTE that the shape and data + type of :code:`ex_mem` and :code:`new_mem` must be same. + Args: + ex_mem(Variable): the memory variable. + new_mem(Variable): the plain variable generated in RNN block. + + Returns: + None + """ self._assert_in_rnn_block_('update_memory') if not isinstance(ex_mem, Variable): raise TypeError("The input arg `ex_mem` of update_memory() must " @@ -1525,6 +1882,15 @@ class DynamicRNN(object): self.mem_link.append((new_mem, mem_array)) def output(self, *outputs): + """ + mark the RNN output variables. + + Args: + outputs: The output variables. + + Returns: + None + """ self._assert_in_rnn_block_('output') parent_block = self._parent_block_() for each in outputs: @@ -1567,26 +1933,26 @@ def reorder_lod_tensor_by_rank(x, rank_table): def is_empty(x, cond=None, **ignored): """ - **Is Empty** - - This layer returns the truth value of whether the variable is empty. + Test whether a Variable is empty. Args: - x(Variable): Operand of *is_empty* - cond(Variable|None): Optional output variable to store the result - of *is_empty* + x (Variable): The Variable to be tested. + cond (Variable|None): Output parameter. Returns the test result + of given 'x'. Default: None Returns: - Variable: The tensor variable storing the output of *is_empty*. + Variable: A bool scalar. True if 'x' is an empty Variable. Raises: TypeError: If input cond is not a variable, or cond's dtype is - not bool + not bool. Examples: .. code-block:: python - less = fluid.layers.is_empty(x=input) + res = fluid.layers.is_empty(x=input) + # or: + fluid.layers.is_empty(x=input, cond=res) """ helper = LayerHelper("is_empty", **locals()) if cond is None: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3a83db12fd13651578deeac6b562bac2f1e4e4b6..200db87f1793a41e8327b59677252c19eab567de 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -16,7 +16,7 @@ All layers just related to the detection neural network. """ from layer_function_generator import generate_layer_fn -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper import tensor import nn @@ -97,7 +97,9 @@ def detection_output(loc, nms_eta(float): The parameter for adaptive NMS. Returns: - Variable: The detection outputs is a LoDTensor with shape [No, 6]. + Variable: + + The detection outputs is a LoDTensor with shape [No, 6]. Each row has six values: [label, confidence, xmin, ymin, xmax, ymax]. `No` is the total number of detections in this mini-batch. For each instance, the offsets in first dimension are called LoD, the offset @@ -110,15 +112,15 @@ def detection_output(loc, Examples: .. code-block:: python - pb = layers.data(name='prior_box', shape=[10, 4], + pb = layers.data(name='prior_box', shape=[10, 4], append_batch_size=False, dtype='float32') - pbv = layers.data(name='prior_box_var', shape=[10, 4], + pbv = layers.data(name='prior_box_var', shape=[10, 4], append_batch_size=False, dtype='float32') - loc = layers.data(name='target_box', shape=[2, 21, 4], + loc = layers.data(name='target_box', shape=[2, 21, 4], append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[2, 21, 10], + scores = layers.data(name='scores', shape=[2, 21, 10], append_batch_size=False, dtype='float32') - nmsed_outs = fluid.layers.detection_output(scores=scores, + nmsed_outs = fluid.layers.detection_output(scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv) @@ -153,7 +155,7 @@ def detection_output(loc, return nmsed_outs -@autodoc() +@templatedoc() def detection_map(detect_res, label, class_num, @@ -164,6 +166,47 @@ def detection_map(detect_res, input_states=None, out_states=None, ap_version='integral'): + """ + ${comment} + + Args: + detect_res: ${detect_res_comment} + label: ${label_comment} + class_num: ${class_num_comment} + background_label: ${background_label_comment} + overlap_threshold: ${overlap_threshold_comment} + evaluate_difficult: ${evaluate_difficult_comment} + has_state: ${has_state_comment} + input_states: If not None, It contains 3 elements: + 1. pos_count ${pos_count_comment}. + 2. true_pos ${true_pos_comment}. + 3. false_pos ${false_pos_comment}. + out_states: If not None, it contains 3 elements. + 1. accum_pos_count ${accum_pos_count_comment}. + 2. accum_true_pos ${accum_true_pos_comment}. + 3. accum_false_pos ${accum_false_pos_comment}. + ap_version: ${ap_type_comment} + + Returns: + ${map_comment} + + + Examples: + .. code-block:: python + + detect_res = fluid.layers.data( + name='detect_res', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + label = fluid.layers.data( + name='label', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + + map_out = fluid.layers.detection_map(detect_res, label, 21) + """ helper = LayerHelper("detection_map", **locals()) def __create_var(type): @@ -210,53 +253,68 @@ def bipartite_match(dist_matrix, dist_threshold=None, name=None): """ - **Bipartite matchint operator** - - This operator is a greedy bipartite matching algorithm, which is used to - obtain the matching with the maximum distance based on the input + This operator implements a greedy bipartite matching algorithm, which is + used to obtain the matching with the maximum distance based on the input distance matrix. For input 2D matrix, the bipartite matching algorithm can - find the matched column for each row, also can find the matched row for - each column. And this operator only calculate matched indices from column - to row. For each instance, the number of matched indices is the number of - of columns of the input ditance matrix. - - There are two outputs to save matched indices and distance. - A simple description, this algothrim matched the best (maximum distance) + find the matched column for each row (matched means the largest distance), + also can find the matched row for each column. And this operator only + calculate matched indices from column to row. For each instance, + the number of matched indices is the column number of the input distance + matrix. + + There are two outputs, matched indices and distance. + A simple description, this algorithm matched the best (maximum distance) row entity to the column entity and the matched indices are not duplicated in each row of ColToRowMatchIndices. If the column entity is not matched any row entity, set -1 in ColToRowMatchIndices. - Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. + NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor. If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. If Tensor, the height of ColToRowMatchIndices is 1. + NOTE: This API is a very low level API. It is used by :code:`ssd_loss` + layer. Please consider to use :code:`ssd_loss` instead. + Args: dist_matrix(Variable): This input is a 2-D LoDTensor with shape [K, M]. It is pair-wise distance matrix between the entities represented by each row and each column. For example, assumed one entity is A with shape [K], another entity is B with shape [M]. The - dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger - the distance is, the better macthing the pairs are. Please note, - This tensor can contain LoD information to represent a batch of - inputs. One instance of this batch can contain different numbers of - entities. + dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger + the distance is, the better matching the pairs are. + + NOTE: This tensor can contain LoD information to represent a batch + of inputs. One instance of this batch can contain different numbers + of entities. match_type(string|None): The type of matching method, should be - 'bipartite' or 'per_prediction', 'bipartite' by defalut. + 'bipartite' or 'per_prediction'. [default 'bipartite']. dist_threshold(float|None): If `match_type` is 'per_prediction', this threshold is to determine the extra matching bboxes based - on the maximum distance, 0.5 by defalut. + on the maximum distance, 0.5 by default. Returns: - match_indices(Variable): A 2-D Tensor with shape [N, M] in int type. - N is the batch size. If match_indices[i][j] is -1, it - means B[j] does not match any entity in i-th instance. - Otherwise, it means B[j] is matched to row - match_indices[i][j] in i-th instance. The row number of - i-th instance is saved in match_indices[i][j]. - match_distance(Variable): A 2-D Tensor with shape [N, M] in float type. - N is batch size. If match_indices[i][j] is -1, - match_distance[i][j] is also -1.0. Otherwise, assumed - match_distance[i][j] = d, and the row offsets of each instance - are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j]. + tuple: a tuple with two elements is returned. The first is + matched_indices, the second is matched_distance. + + The matched_indices is a 2-D Tensor with shape [N, M] in int type. + N is the batch size. If match_indices[i][j] is -1, it + means B[j] does not match any entity in i-th instance. + Otherwise, it means B[j] is matched to row + match_indices[i][j] in i-th instance. The row number of + i-th instance is saved in match_indices[i][j]. + + The matched_distance is a 2-D Tensor with shape [N, M] in float type + . N is batch size. If match_indices[i][j] is -1, + match_distance[i][j] is also -1.0. Otherwise, assumed + match_distance[i][j] = d, and the row offsets of each instance + are called LoD. Then match_distance[i][j] = + dist_matrix[d+LoD[i]][j]. + + Examples: + + >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32') + >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32') + >>> iou = fluid.layers.iou_similarity(x=x, y=y) + >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou) """ helper = LayerHelper('bipartite_match', **locals()) match_indices = helper.create_tmp_variable(dtype='int32') @@ -281,8 +339,6 @@ def target_assign(input, mismatch_value=None, name=None): """ - **Target assigner operator** - This operator can be, for given the target bounding boxes or labels, to assign classification and regression targets to each prediction as well as weights to prediction. The weights is used to specify which prediction would @@ -296,20 +352,24 @@ def target_assign(input, 1. Assigning all outpts based on `match_indices`: - If id = match_indices[i][j] > 0, + .. code-block:: text + + If id = match_indices[i][j] > 0, - out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] - out_weight[i][j] = 1. + out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] + out_weight[i][j] = 1. - Otherwise, + Otherwise, - out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} - out_weight[i][j] = 0. + out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} + out_weight[i][j] = 0. 2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided: Assumed that the row offset for each instance in `neg_indices` is called neg_lod, for i-th instance and each `id` of neg_indices in this instance: + + .. code-block:: text out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} out_weight[i][id] = 1.0 @@ -326,10 +386,23 @@ def target_assign(input, mismatch_value (float32): Fill this value to the mismatched location. Returns: - out (Variable): The output is a 3D Tensor with shape [N, P, K], - N and P is the same as they are in `neg_indices`, K is the - same as it in input of X. If `match_indices[i][j]`. - out_weight (Variable): The weight for output with the shape of [N, P, 1]. + tuple: + + A tuple(out, out_weight) is returned. out is a 3D Tensor with + shape [N, P, K], N and P is the same as they are in + `neg_indices`, K is the same as it in input of X. If + `match_indices[i][j]`. out_weight is the weight for output with + the shape of [N, P, 1]. + + Examples: + + .. code-block:: python + + matched_indices, matched_dist = fluid.layers.bipartite_match(iou) + gt = layers.data( + name='gt', shape=[1, 1], dtype='int32', lod_level=1) + trg, trg_weight = layers.target_assign( + gt, matched_indices, mismatch_value=0) """ helper = LayerHelper('target_assign', **locals()) out = helper.create_tmp_variable(dtype=input.dtype) @@ -364,7 +437,7 @@ def ssd_loss(location, normalize=True, sample_size=None): """ - **Multi-box loss layer for object dection algorithm of SSD** + **Multi-box loss layer for object detection algorithm of SSD** This layer is to compute dection loss for SSD given the location offset predictions, confidence predictions, prior boxes and ground-truth boudding @@ -372,21 +445,35 @@ def ssd_loss(location, is a weighted sum of the localization loss (or regression loss) and confidence loss (or classification loss) by performing the following steps: - 1. Find matched boundding box by bipartite matching algorithm. + 1. Find matched bounding box by bipartite matching algorithm. + 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. + 1.2 Compute matched boundding box by bipartite matching algorithm. + 2. Compute confidence for mining hard examples + 2.1. Get the target label based on matched indices. + 2.2. Compute confidence loss. + 3. Apply hard example mining to get the negative example indices and update the matched indices. + 4. Assign classification and regression targets + 4.1. Encoded bbox according to the prior boxes. + 4.2. Assign regression targets. + 4.3. Assign classification targets. + 5. Compute the overall objective loss. + 5.1 Compute confidence loss. + 5.1 Compute localization loss. + 5.3 Compute the overall weighted loss. Args: @@ -421,39 +508,36 @@ def ssd_loss(location, mining_type (str): The hard example mining type, should be 'hard_example' or 'max_negative', now only support `max_negative`. normalize (bool): Whether to normalize the SSD loss by the total number - of output locations, True by defalut. + of output locations, True by default. sample_size (int): The max sample size of negative box, used only when mining_type is 'hard_example'. Returns: - Variable: The weighted sum of the localization loss and confidence loss, - with shape [N * Np, 1], N and Np are the same as they are - in `location`. + The weighted sum of the localization loss and confidence loss, with \ + shape [N * Np, 1], N and Np are the same as they are in `location`. Raises: - ValueError: If mining_type is 'hard_example', now only support - mining type of `max_negative`. + ValueError: If mining_type is 'hard_example', now only support mining \ + type of `max_negative`. Examples: - .. code-block:: python - - pb = layers.data( - name='prior_box', - shape=[10, 4], - append_batch_size=False, - dtype='float32') - pbv = layers.data( - name='prior_box_var', - shape=[10, 4], - append_batch_size=False, - dtype='float32') - loc = layers.data(name='target_box', shape=[10, 4], dtype='float32') - scores = layers.data(name='scores', shape=[10, 21], dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - gt_label = layers.data( - name='gt_label', shape=[1], lod_level=1, dtype='float32') - loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) + >>> pb = fluid.layers.data( + >>> name='prior_box', + >>> shape=[10, 4], + >>> append_batch_size=False, + >>> dtype='float32') + >>> pbv = fluid.layers.data( + >>> name='prior_box_var', + >>> shape=[10, 4], + >>> append_batch_size=False, + >>> dtype='float32') + >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32') + >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32') + >>> gt_box = fluid.layers.data( + >>> name='gt_box', shape=[4], lod_level=1, dtype='float32') + >>> gt_label = fluid.layers.data( + >>> name='gt_label', shape=[1], lod_level=1, dtype='float32') + >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) """ helper = LayerHelper('ssd_loss', **locals()) @@ -577,7 +661,7 @@ def prior_box(input, offset=0.5, name=None): """ - **Prior box operator** + **Prior Box Operator** Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is determined by @@ -606,26 +690,30 @@ def prior_box(input, name(str): Name of the prior box op. Default: None. Returns: - boxes(Variable): the output prior boxes of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total - box count of each position of input. - Variances(Variable): the expanded variances of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_priors is the total - box count of each position of input + tuple: A tuple with two Variable (boxes, variances) + + boxes: the output prior boxes of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total + box count of each position of input. + + variances: the expanded variances of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total + box count of each position of input Examples: .. code-block:: python - box, var = prior_box( - input=conv1, - image=images, - min_sizes=[100.], - flip=True, - clip=True) + + box, var = fluid.layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.], + flip=True, + clip=True) """ helper = LayerHelper("prior_box", **locals()) dtype = helper.input_dtype() @@ -695,11 +783,9 @@ def multi_box_head(inputs, stride=1, name=None): """ - **Prior_boxes** - Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. The details of this algorithm, please refer the - section 2.2 of SSD paper (SSD: Single Shot MultiBox Detector) + section 2.2 of SSD paper `SSD: Single Shot MultiBox Detector `_ . Args: @@ -740,24 +826,27 @@ def multi_box_head(inputs, name(str): Name of the prior box layer. Default: None. Returns: - mbox_loc(Variable): The predicted boxes' location of the inputs. - The layout is [N, H*W*Priors, 4]. where Priors - is the number of predicted boxes each position of each input. - mbox_conf(Variable): The predicted boxes' confidence of the inputs. - The layout is [N, H*W*Priors, C]. where Priors - is the number of predicted boxes each position of each input - and C is the number of Classes. - boxes(Variable): the output prior boxes of PriorBox. - The layout is [num_priors, 4]. num_priors is the total - box count of each position of inputs. - Variances(Variable): the expanded variances of PriorBox. - The layout is [num_priors, 4]. num_priors is the total - box count of each position of inputs + tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances) + + mbox_loc: The predicted boxes' location of the inputs. The layout + is [N, H*W*Priors, 4]. where Priors is the number of predicted + boxes each position of each input. + + mbox_conf: The predicted boxes' confidence of the inputs. The layout + is [N, H*W*Priors, C]. where Priors is the number of predicted boxes + each position of each input and C is the number of Classes. + + boxes: the output prior boxes of PriorBox. The layout is [num_priors, 4]. + num_priors is the total box count of each position of inputs. + + variances: the expanded variances of PriorBox. The layout is + [num_priors, 4]. num_priors is the total box count of each position of inputs Examples: .. code-block:: python - mbox_locs, mbox_confs, box, var = layers.multi_box_head( + + mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head( inputs=[conv1, conv2, conv3, conv4, conv5, conv5], image=images, num_classes=21, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8758ac9f94ab91b5be5fc70917c64db38997d1c1..f33ae76aea95ceeca73c5bae6e4e490cdff29bf3 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name from control_flow import BlockGuard from ..layer_helper import LayerHelper from ..executor import global_scope +from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ - 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', - 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', - 'random_data_generator', 'Preprocessor' + 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv', + 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', + 'double_buffer', 'random_data_generator', 'Preprocessor', 'load' ] @@ -108,10 +109,35 @@ class BlockGuardServ(BlockGuard): class ListenAndServ(object): """ - ListenAndServ class. + **ListenAndServ Layer** - ListenAndServ class is used to wrap listen_and_serv op to create a server - which can receive variables from clients and run a block. + ListenAndServ is used to create a rpc server bind and listen + on specific TCP port, this server will run the sub-block when + received variables from clients. + + Args: + endpoint(string): IP:port string which the server will listen on. + inputs(list): a list of variables that the server will get from clients. + fan_in(int): how many client are expected to report to this server, default: 1. + optimizer_mode(bool): whether to run the server as a parameter server, default: True. + + Examples: + .. code-block:: python + + with fluid.program_guard(main): + serv = layers.ListenAndServ( + "127.0.0.1:6170", ["X"], optimizer_mode=False) + with serv.do(): + x = layers.data( + shape=[32, 32], + dtype='float32', + name="X", + append_batch_size=False) + fluid.initializer.Constant(value=1.0)(x, main.global_block()) + layers.scale(x=x, scale=10.0, out=out_var) + + exe = fluid.Executor(place) + exe.run(main) """ def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True): @@ -160,7 +186,6 @@ class ListenAndServ(object): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() - empty_block = Program().global_block() parent_block.append_op( type='listen_and_serv', @@ -169,25 +194,25 @@ class ListenAndServ(object): attrs={ 'endpoint': self.endpoint, 'Fanin': self.fan_in, - 'OptimizeBlock': current_block, - 'PrefetchBlock': empty_block, + 'optimize_blocks': [ + current_block + ], # did not support multiple optimize blocks in layers 'sync_mode': True, # did not support async now in layers 'grad_to_block_id': [""] }) -def Send(endpoints, send_vars, get_vars=None): +def Send(endpoints, send_vars, sync=True): """ - Send layer + Send variables to the server side, and get vars from server + side when server have finished running server side program. Args: - endpoints: comma seperated IP:PORT pairs in the order + endpoints (str): comma seperated IP:PORT pairs in the order of send_vars to send - send_vars: vars to send - get_vars: vars to get from server after send completes. + send_vars (list): variables to send to server + sync (bool): whether to wait the request finish - Send variables to the server side, and get vars from server - side when server have finished running server side program. """ assert (type(send_vars) == list) @@ -195,40 +220,33 @@ def Send(endpoints, send_vars, get_vars=None): endpoints = list(set(epmap)) helper = LayerHelper("Send", **locals()) - if not get_vars: - get_vars = [] - for s in send_vars: - v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True) - get_vars.append(v) rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName() helper.append_op( type="send", inputs={"X": send_vars}, - outputs={"Out": get_vars}, attrs={ "endpoints": endpoints, "epmap": epmap, rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC }) - - return get_vars + if sync: + helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) -def Recv(endpoints, get_vars): +def Recv(endpoints, get_vars, sync=True): """ - Recv layer + Receive variables from server side Args: - endpoints: comma seperated IP:PORT pairs in the order + endpoints (str): comma seperated IP:PORT pairs in the order of send_vars to send - send_vars: vars to send - get_vars: vars to get from server after send completes. + get_vars (list): vars to get from server after send completes. + sync (bool): whether to wait the request finish - Send variables to the server side, and get vars from server - side when server have finished running server side program. + Returns: + list: list of received variables """ - assert (type(send_vars) == list) assert (type(get_vars) == list) epmap = endpoints.split(",") @@ -241,6 +259,9 @@ def Recv(endpoints, get_vars): outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) + if sync: + helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints}) + return get_vars def monkey_patch_reader_methods(reader): @@ -291,6 +312,7 @@ def _copy_reader_create_op_(block, op): return new_op +@templatedoc(op_type='create_recordio_file_reader') def open_recordio_file(filename, shapes, lod_levels, @@ -298,34 +320,30 @@ def open_recordio_file(filename, pass_num=1, for_parallel=True): """ - Open a RecordIO file - - This layer takes a RecordIO file to read from and returns a Reader Variable. - Via the Reader Variable, we can get data from the given RecordIO file. + ${comment} Args: - filename(str): The RecordIO file's name. + filename(${filename_type}): ${filename_comment}. shapes(list): List of tuples which declaring data shapes. - lod_levels(list): List of ints which declaring data lod_level. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. dtypes(list): List of strs which declaring data type. pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. Returns: - Variable: A Reader Variable via which we can get RecordIO file data. + ${out_comment}. Examples: - .. code-block:: python - reader = fluid.layers.io.open_recordio_file( - filename='./data.recordio', - shapes=[(3,224,224), (1)], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - - # Via the reader, we can use 'read_file' layer to get data: - image, label = fluid.layers.io.read_file(reader) + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) """ dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] @@ -385,16 +403,16 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): Variable: A Reader Variable from which we can get random data. Examples: - .. code-block:: python - reader = fluid.layers.io.random_data_generator( - low=0.0, - high=1.0, - shapes=[(3,224,224), (1)], - lod_levels=[0, 0]) + .. code-block:: python - # Via the reader, we can use 'read_file' layer to get data: - image, label = fluid.layers.io.read_file(reader) + reader = fluid.layers.random_data_generator( + low=0.0, + high=1.0, + shapes=[[3,224,224], [1]], + lod_levels=[0, 0]) + # Via the reader, we can use 'read_file' layer to get data: + image, label = fluid.layers.read_file(reader) """ dtypes = [core.VarDesc.VarType.FP32] * len(shapes) shape_concat = [] @@ -434,7 +452,7 @@ def open_files(filenames, shapes, lod_levels, dtypes, - thread_num, + thread_num=1, buffer_size=None, pass_num=1, for_parallel=True): @@ -451,10 +469,13 @@ def open_files(filenames, lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. thread_num(int): The maximal concurrent prefetch thread number. - buffer_size(int): The size of prefetch buffer. + buffer_size(int|None): The size of prefetch buffer. If it is setted None, + buffer size will be thread_num * 3. + Default: None pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. + Default: True Returns: Variable: A Reader Variable via which we can get file data. @@ -474,7 +495,7 @@ def open_files(filenames, image, label = fluid.layers.io.read_file(reader) """ if buffer_size is None: - buffer_size = thread_num + buffer_size = thread_num * 3 if isinstance(filenames, basestring): filenames = [filenames] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] @@ -543,16 +564,77 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): def shuffle(reader, buffer_size): + """ + Shuffle the reader. + """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) def batch(reader, batch_size): + """ + This layer is a reader decorator. It takes a reader and adds + 'batching' decoration on it. When reading with the result + decorated reader, output data will be automatically organized + to the form of batches. + + Args: + reader(Variable): The reader to be decorated with 'batching'. + batch_size(int): The batch size. + + Returns: + Variable: The reader which has been decorated with 'batching'. + + Examples: + .. code-block:: python + + raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio', + './data2.recordio'], + shapes=[(3,224,224), (1)], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=2, + buffer_size=2) + batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5) + + # If we read data with the raw_reader: + # data = fluid.layers.read_file(raw_reader) + # We can only get data instance by instance. + # + # However, if we read data with the batch_reader: + # data = fluid.layers.read_file(batch_reader) + # Each 5 adjacent instances will be automatically combined together + # to become a batch. So what we get('data') is a batch data instead + # of an instance. + """ return __create_unshared_decorated_reader__( 'create_batch_reader', reader, {'batch_size': int(batch_size)}) def double_buffer(reader, place=None, name=None): + """ + Wrap a double buffer reader. The data will copy to target place with a + double buffer queue. If the target place is None, the place that executor + perform on will be used. + + Args: + reader(Variable): the reader variable need to be wrapped. + place(Place): the place of target data. Default is the sample place of + executor perform. + + name(str): Variable name. None if the user does not care. + + Returns: + wrapped reader with double buffer. + + Examples: + + >>> reader = fluid.layers.open_files(filenames=['somefile'], + >>> shapes=[[-1, 784], [-1, 1]], + >>> dtypes=['float32', 'int64']) + >>> reader = fluid.layers.double_buffer(reader) + >>> img, label = fluid.layers.read_file(reader) + """ attrs = dict() if place is not None: attrs['place'] = str(place).upper() @@ -570,15 +652,41 @@ def parallel(reader): {}) -def read_file(file_obj): +def read_file(reader): + """ + Execute the given reader and get data via it. + + A reader is also a Variable. It can be a raw reader generated by + `fluid.layers.open_files()` or a decorated one generated by + `fluid.layers.double_buffer()` and so on. + + Args: + + reader(Variable): The reader to execute. + + Returns: + Tuple[Variable]: Data read via the given reader. + + Examples: + .. code-block:: python + + data_file = fluid.layers.open_files( + filenames=['mnist.recordio'], + shapes=[(-1, 748), (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"]) + data_file = fluid.layers.double_buffer( + fluid.layers.batch(data_file, batch_size=64)) + input, label = fluid.layers.read_file(data_file) + """ helper = LayerHelper('read_file') out = [ helper.create_tmp_variable( stop_gradient=True, dtype='float32') - for _ in range(len(file_obj.desc.shapes())) + for _ in range(len(reader.desc.shapes())) ] helper.append_op( - type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out}) + type='read', inputs={'Reader': [reader]}, outputs={'Out': out}) if len(out) == 1: return out[0] else: @@ -586,6 +694,26 @@ def read_file(file_obj): class Preprocessor(object): + """ + A block for data pre-processing in reader. + + Args: + reader (Variable): A reader variable. + name (str, default None): The name of the reader. + + Examples: + .. code-block:: python + + preprocessor = fluid.layers.io.Preprocessor(reader=reader) + with preprocessor.block(): + img, lbl = preprocessor.inputs() + img_out = img / 2 + lbl_out = lbl + 1 + preprocessor.outputs(img_out, lbl_out) + + data_file = fluid.layers.io.double_buffer(preprocessor()) + + """ BEFORE_SUB_BLOCK = 0 IN_SUB_BLOCK = 1 AFTER_SUB_BLOCK = 2 @@ -662,3 +790,29 @@ class Preprocessor(object): "sink_var_names": self.sink_var_names }) return monkey_patch_reader_methods(self.reader) + + +@templatedoc() +def load(out, file_path, load_as_fp16=None): + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32') + >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin") + + Args: + out(${out_type}): ${out_comment}. + + file_path(${file_path_type}): ${file_path_comment}. + + load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}. + + Returns: + None + """ + helper = LayerHelper("load", **locals()) + attrs = {"file_path": file_path} + if load_as_fp16 is not None: + attrs['load_as_fp16'] = load_as_fp16 + helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 295d1b7190ec39bcc6efdf72aebede14a99807aa..3096389101a5e5b302c78145b8bc9f1d71f6b8cb 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -15,16 +15,13 @@ import re import cStringIO import functools import warnings +import string from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper -__all__ = [ - 'deprecated', - 'generate_layer_fn', - 'autodoc', -] +__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc'] def _convert_(name): @@ -43,6 +40,22 @@ def _convert_(name): return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() +def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + +_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$") +_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$") +_two_bang_pattern_ = re.compile(r"!!([^!]+)!!") + + +def escape_math(text): + return _two_bang_pattern_.sub( + r'$$\1$$', + _single_dollar_pattern_.sub(r':math:`\1`', + _two_dollar_pattern_.sub(r"!!\1!!", text))) + + def _generate_doc_string_(op_proto): """ Generate docstring by OpProto @@ -54,34 +67,33 @@ def _generate_doc_string_(op_proto): str: the document string """ - def _type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - if not isinstance(op_proto, framework_pb2.OpProto): raise TypeError("OpProto should be `framework_pb2.OpProto`") buf = cStringIO.StringIO() - buf.write(op_proto.comment) + buf.write(escape_math(op_proto.comment)) buf.write('\nArgs:\n') for each_input in op_proto.inputs: line_begin = ' {0}: '.format(_convert_(each_input.name)) buf.write(line_begin) - buf.write(each_input.comment) - buf.write('\n') - buf.write(' ' * len(line_begin)) - buf.write('Duplicable: ') - buf.write(str(each_input.duplicable)) - buf.write(' Optional: ') - buf.write(str(each_input.dispensable)) + buf.write(escape_math(each_input.comment)) + if each_input.duplicable: + buf.write(" Duplicatable.") + if each_input.dispensable: + buf.write(" Optional.") buf.write('\n') + skip_attrs = OpProtoHolder.generated_op_attr_names() + for each_attr in op_proto.attrs: + if each_attr.name in skip_attrs: + continue buf.write(' ') buf.write(each_attr.name) buf.write(' (') buf.write(_type_to_str_(each_attr.type)) buf.write('): ') - buf.write(each_attr.comment) + buf.write(escape_math(each_attr.comment)) buf.write('\n') if len(op_proto.outputs) != 0: @@ -90,7 +102,7 @@ def _generate_doc_string_(op_proto): for each_opt in op_proto.outputs: if not each_opt.intermediate: break - buf.write(each_opt.comment) + buf.write(escape_math(each_opt.comment)) return buf.getvalue() @@ -220,3 +232,61 @@ def autodoc(comment=""): return func return __impl__ + + +def templatedoc(op_type=None): + """ + Decorator of layer function. It will use the docstring from the layer + function as the template. The template arguments are: + + * ${comment}: The operator comment written in CPP. + * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, + and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. + * ${{name}_type}: The type of ${name}. + + Returns: + Decorated function. + """ + + def trim_ending_dot(msg): + return msg.rstrip('.') + + def __impl__(func): + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) + tmpl = string.Template(func.__doc__) + + comment_lines = op_proto.comment.split("\n") + comment = "" + for line in comment_lines: + line = line.strip() + if len(line) != 0: + comment += escape_math(line) + comment += " " + elif len(comment) != 0: + comment += "\n \n " + + args = {"comment": trim_ending_dot(comment)} + for each_input in op_proto.inputs: + input_name = _convert_(each_input.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) + args["{0}_type".format(input_name)] = "Variable" + for each_attr in op_proto.attrs: + input_name = _convert_(each_attr.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) + args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) + + for each_opt in op_proto.outputs: + output_name = _convert_(each_opt.name) + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) + args["{0}_type".format(output_name)] = "Variable" + func.__doc__ = tmpl.substitute(args) + return func + + return __impl__ diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index d13c54daa5a985e2e1bf9357630fe29d24a17bb4..6071e3e74218e4db4cddc223818d3a9b7086fd86 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -11,25 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. +""" import control_flow import nn import ops import tensor from ..initializer import init_on_cpu +from ..framework import default_main_program, Parameter __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', - 'polynomial_decay', 'piecewise_decay', 'noam_decay' + 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS' ] -""" -When training a model, it's often useful to decay the -learning rate during training process, this is called -learning_rate_decay. There are many strategies to do -this, this module will provide some classical method. -User can also implement their own learning_rate_decay -strategy according to this module. -""" def _decay_step_counter(begin=0): @@ -41,18 +42,20 @@ def _decay_step_counter(begin=0): def noam_decay(d_model, warmup_steps): - """Apply decay to learning rate. - ```python - lr_value = np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps - ]) - ``` + """ + Noam decay method. The numpy implementation of noam decay as follows. + + >>> import numpy as np + >>> lr_value = np.power(d_model, -0.5) * np.min([ + >>> np.power(current_steps, -0.5), + >>> np.power(warmup_steps, -1.5) * current_steps]) + + Please reference `attention is all you need + `_. Args: d_model(Variable): The dimensionality of input and output of model. - Reference: attention is all you need - https://arxiv.org/pdf/1706.03762.pdf + warmup_steps(Variable): A super parameter. Returns: @@ -68,21 +71,40 @@ def noam_decay(d_model, warmup_steps): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """Applies exponential decay to the learning rate. + """ + Applies exponential decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, the learning rate will be decayed by + 'decay_rate' every 'decay_steps' steps. + + >>> if staircase == True: + >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) + >>> else: + >>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) - ```python - decayed_learning_rate = learning_rate * - decay_rate ^ (global_step / decay_steps) - ``` Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training - decay_steps: A Python `int32` number. - decay_rate: A Python `float` number. - staircase: Boolean. If set true, decay the learning rate every decay_steps. + learning_rate(Variable|float): The initial learning rate. + decay_steps(int): See the decay computation above. + decay_rate(float): The decay rate. See the decay computation above. + staircase(Boolean): If True, decay the learning rate at discrete intervals. + Default: False Returns: - The decayed learning rate + Variable: The decayed learning rate + + Examples: + .. code-block:: python + + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + """ global_step = _decay_step_counter() @@ -126,22 +148,39 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """Applies inverse time decay to the initial learning rate. + """ + Applies inverse time decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, an inverse decay function will be + applied to the initial learning rate. - >>> if staircase: + >>> if staircase == True: >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) >>> else: >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training. - decay_steps: A Python `int32` number. - decay_rate: A Python `float` number. - staircase: Boolean. If set true, decay the learning rate every decay_steps. + learning_rate(Variable|float): The initial learning rate. + decay_steps(int): See the decay computation above. + decay_rate(float): The decay rate. See the decay computation above. + staircase(Boolean): If True, decay the learning rate at discrete intervals. + Default: False Returns: - The decayed learning rate + Variable: The decayed learning rate + + Examples: + .. code-block:: python + + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) """ global_step = _decay_step_counter() @@ -160,25 +199,28 @@ def polynomial_decay(learning_rate, end_learning_rate=0.0001, power=1.0, cycle=False): - """Applies polynomial decay to the initial learning rate. + """ + Applies polynomial decay to the initial learning rate. + + .. code-block:: python + + if cycle: + decay_steps = decay_steps * ceil(global_step / decay_steps) + else: + global_step = min(global_step, decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ power + end_learning_rate - >>> if cycle: - >>> decay_steps = decay_steps * ceil(global_step / decay_steps) - >>> else: - >>> global_step = min(global_step, decay_steps) - >>> decayed_learning_rate = (learning_rate - end_learning_rate) * - >>> (1 - global_step / decay_steps) ^ power + - >>> end_learning_rate Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training - decay_steps: A Python `int32` number. - end_learning_rate: A Python `float` number. - power: A Python `float` number - cycle: Boolean. If set true, decay the learning rate every decay_steps. + learning_rate(Variable|float32): A scalar float32 value or a Variable. This + will be the initial learning rate during training. + decay_steps(int32): A Python `int32` number. + end_learning_rate(float): A Python `float` number. + power(float): A Python `float` number. + cycle(bool): If set true, decay the learning rate every decay_steps. Returns: - The decayed learning rate + Variable: The decayed learning rate """ global_step = _decay_step_counter() @@ -207,15 +249,27 @@ def polynomial_decay(learning_rate, def piecewise_decay(boundaries, values): """Applies piecewise decay to the initial learning rate. - >>> boundaries = [10000, 20000] - >>> values = [1.0, 0.5, 0.1] - >>> - >>> if step < 10000: - >>> learning_rate = 1.0 - >>> elif 10000 <= step < 20000: - >>> learning_rate = 0.5 - >>> else: - >>> learning_rate = 0.1 + The algorithm can be described as the code below. + + .. code-block:: python + + boundaries = [10000, 20000] + values = [1.0, 0.5, 0.1] + if step < 10000: + learning_rate = 1.0 + elif 10000 <= step < 20000: + learning_rate = 0.5 + else: + learning_rate = 0.1 + Args: + boundaries: A list of steps numbers. + values: A list of learning rate values that will be picked during + different step boundaries. + + Returns: + The decayed learning rate. + + """ if len(values) - len(boundaries) != 1: @@ -247,3 +301,41 @@ def piecewise_decay(boundaries, values): tensor.assign(last_value_var, lr) return lr + + +def append_LARS(params_grads, learning_rate, weight_decay): + """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for + each layer. + + ```python + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) + / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) + ``` + + Args: + learning_rate: A learning rate Variable. This + is the global learning rate for LARS. + weight_decay: A Python `float` number. + + Returns: + The decayed learning rate + """ + + def _balanced_weight(param_norm, grad_norm): + if weight_decay == 1.0: + return grad_norm + param_norm + else: + return grad_norm + weight_decay * param_norm + + for param, grad in params_grads: + param_lr = param.optimize_attr['learning_rate'] + param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param))) + grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) + if type(param_lr) == float and param_lr == 1.0: + decayed_lr = learning_rate * param_norm \ + / _balanced_weight(param_norm, grad_norm) + else: + decayed_lr = learning_rate * param_lr * param_norm \ + / _balanced_weight(param_norm, grad_norm) + # set back param local learning rate + param.optimize_attr['learning_rate'] = decayed_lr diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric_op.py similarity index 50% rename from python/paddle/fluid/layers/metric.py rename to python/paddle/fluid/layers/metric_op.py index cab2eb55510542bdd4dd7eca7667601697759181..99e82fdd04282177fae63f1fb94b5e32d41c612e 100644 --- a/python/paddle/fluid/layers/metric.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -27,8 +27,32 @@ __all__ = ['accuracy', 'auc'] def accuracy(input, label, k=1, correct=None, total=None): """ + accuracy layer. + Refer to the https://en.wikipedia.org/wiki/Precision_and_recall + This function computes the accuracy using the input and label. - The output is the top k inputs and their indices. + If the correct label occurs in top k predictions, then correct will increment by one. + Note: the dtype of accuracy is determined by input. the input and label dtype can be different. + + Args: + input(Variable): The input of accuracy layer, which is the predictions of network. + Carry LoD information is supported. + label(Variable): The label of dataset. + k(int): The top k predictions for each class will be checked. + correct(Variable): The correct predictions count. + total(Variable): The total entries count. + + Returns: + Variable: The correct rate. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32") + label = fluid.layers.data(name="data", shape=[-1,1], dtype="int32") + predict = fluid.layers.fc(input=data, size=10) + acc = fluid.layers.accuracy(input=predict, label=label, k=5) + """ helper = LayerHelper("accuracy", **locals()) topk_out, topk_indices = nn.topk(input, k=k) @@ -53,6 +77,43 @@ def accuracy(input, label, k=1, correct=None, total=None): def auc(input, label, curve='ROC', num_thresholds=200): + """ + **Area Under the Curve (AUC) Layer** + + This implementation computes the AUC according to forward output and label. + It is used very widely in binary classification evaluation. + + Note: If input label contains values other than 0 and 1, it will be cast + to `bool`. Find the relevant definitions `here `_. + + There are two types of possible curves: + + 1. ROC: Receiver operating characteristic; + 2. PR: Precision Recall + + Args: + input(Variable): A floating-point 2D Variable, values are in the range + [0, 1]. Each row is sorted in descending order. This + input should be the output of topk. Typically, this + Variable indicates the probability of each label. + label(Variable): A 2D int Variable indicating the label of the training + data. The height is batch size and width is always 1. + curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'. + num_thresholds(int): The number of thresholds to use when discretizing + the roc curve. Default 200. + + Returns: + Variable: A scalar representing the current AUC. + + Examples: + .. code-block:: python + + # network is a binary classification model and label the ground truth + prediction = network(image, is_infer=True) + auc_out=fluid.layers.auc(input=prediction, label=label) + """ + warnings.warn( "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \ but can not aggregate them and get the pass AUC, because pass \ @@ -64,12 +125,8 @@ def auc(input, label, curve='ROC', num_thresholds=200): topk_indices = helper.create_tmp_variable(dtype="int64") topk_out, topk_indices = nn.topk(input, k=k) auc_out = helper.create_tmp_variable(dtype="float32") - if correct is None: - correct = helper.create_tmp_variable(dtype="int64") - if total is None: - total = helper.create_tmp_variable(dtype="int64") helper.append_op( - type="accuracy", + type="auc", inputs={ "Out": [topk_out], "Indices": [topk_indices], diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bd6ed0f30e4d71df7a4e84c6dd3472c391008393..f5700ed5626a0c6ebd432aefa6fd97a75711d48f 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -19,9 +19,11 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from tensor import concat import utils +import random +from .. import unique_name __all__ = [ 'fc', @@ -38,13 +40,16 @@ __all__ = [ 'chunk_eval', 'sequence_conv', 'conv2d', + 'conv3d', 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', + 'pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', + 'conv3d_transpose', 'sequence_expand', 'lstm_unit', 'reduce_sum', @@ -81,9 +86,15 @@ __all__ = [ 'label_smooth', 'roi_pool', 'dice_loss', + 'image_resize', + 'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', + 'mean_iou', + 'relu', + 'log', + 'crop', ] @@ -92,7 +103,6 @@ def fc(input, num_flatten_dims=1, param_attr=None, bias_attr=None, - use_cudnn=False, use_mkldnn=False, act=None, is_test=False, @@ -100,14 +110,15 @@ def fc(input, """ **Fully Connected Layer** - The fully connected layer can take multiple tensors as its inputs. It - creates a variable called weights for each input tensor, which represents - a fully connected weight matrix from each input unit to each output unit. - The fully connected layer multiplies each input tensor with its coresponding - weight to produce an output Tensor. If multiple input tensors are given, - the results of multiple multiplications will be sumed up. If bias_attr is - not None, a bias variable will be created and added to the output. Finally, - if activation is not None, it will be applied to the output as well. + This function creates a fully connected layer in the network. It can take + multiple tensors as its inputs. It creates a variable called weights for + each input tensor, which represents a fully connected weight matrix from + each input unit to each output unit. The fully connected layer multiplies + each input tensor with its coresponding weight to produce an output Tensor. + If multiple input tensors are given, the results of multiple multiplications + will be sumed up. If bias_attr is not None, a bias variable will be created + and added to the output. Finally, if activation is not None, it will be applied + to the output as well. This process can be formulated as follows: @@ -148,7 +159,7 @@ def fc(input, name (str, default None): The name of this layer. Returns: - A tensor variable storing the transformation result. + Variable: The transformation result. Raises: ValueError: If rank of the input tensor is less than 2. @@ -156,8 +167,7 @@ def fc(input, Examples: .. code-block:: python - data = fluid.layers.data( - name="data", shape=[32, 32], dtype="float32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, act="tanh") """ @@ -189,7 +199,10 @@ def fc(input, else: pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}, + attrs={"use_mkldnn": use_mkldnn}) # add bias pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation @@ -219,10 +232,11 @@ def embedding(input, have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively. is_sparse(bool): The flag indicating whether to use sparse update. + is_distributed(bool): Whether to run lookup table from remote parameter server. padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters it in :attr:`input`. If - :math:`padding_idx < 0`, the padding_idx to use in lookup is + :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is :math:`size[0] + dim`. param_attr(ParamAttr): Parameters for this layer dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc @@ -258,9 +272,11 @@ def embedding(input, return tmp -# TODO(qijun): expose H0 and C0 +@templatedoc(op_type="lstm") def dynamic_lstm(input, size, + h_0=None, + c_0=None, param_attr=None, bias_attr=None, use_peepholes=True, @@ -271,56 +287,18 @@ def dynamic_lstm(input, dtype='float32', name=None): """ - **Dynamic LSTM Layer** - - The defalut implementation is diagonal/peephole connection - (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: - - .. math:: - - i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) - - f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) - - \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) - - o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) - - c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - - h_t & = o_t \odot act_h(c_t) - - where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is - the matrix of weights from the input gate to the input), :math:`W_{ic}, \ - W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In - our implementation, we use vectors to reprenset these diagonal weight - matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input - gate bias vector), :math:`\sigma` is the non-linear activations, such as - logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input - gate, forget gate, output gate, and cell activation vectors, respectively, - all of which have the same size as the cell output activation vector :math:`h`. - - The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` - and :math:`act_h` are the cell input and cell output activation functions - and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called - candidate hidden state, which is computed based on the current input and - the previous hidden state. - - Set `use_peepholes` to `False` to disable peephole connection. The formula - is omitted here, please refer to the paper - http://www.bioinf.jku.at/publications/older/2604.pdf for details. - - Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}` - operations on the input :math:`x_{t}` are NOT included in this operator. - Users can choose to use fully-connect layer before LSTM layer. + ${comment} Args: - input(Variable): The input of dynamic_lstm layer, which supports - variable-time length input sequence. The underlying - tensor in this Variable is a matrix with shape - (T X 4D), where T is the total time steps in this - mini-batch, D is the hidden size. - size(int): 4 * hidden size. + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable hidden-hidden weights. @@ -328,33 +306,26 @@ def dynamic_lstm(input, W_{fh}, W_{oh}`} - The shape is (D x 4D), where D is the hidden size. - bias_attr(ParamAttr|None): The bias attribute for the learnable bias + bias_attr (ParamAttr|None): The bias attribute for the learnable bias weights, which contains two parts, input-hidden bias weights and peephole connections weights if setting `use_peepholes` to `True`. 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - use_peepholes(bool): Whether to enable diagonal/peephole connections, - default `True`. - is_reverse(bool): Whether to compute reversed LSTM, default `False`. - gate_activation(str): The activation for input gate, forget gate and - output gate. Choices = ["sigmoid", "tanh", "relu", - "identity"], default "sigmoid". - cell_activation(str): The activation for cell output. Choices = ["sigmoid", - "tanh", "relu", "identity"], default "tanh". - candidate_activation(str): The activation for candidate hidden state. - Choices = ["sigmoid", "tanh", - "relu", "identity"], - default "tanh". - dtype(str): Data type. Choices = ["float32", "float64"], default "float32". - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + - The shape is (1 x 7D). + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: tuple: The hidden state, and cell state of LSTM. The shape of both \ @@ -384,12 +355,20 @@ def dynamic_lstm(input, cell = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype) batch_cell_pre_act = helper.create_tmp_variable(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 helper.append_op( type='lstm', - inputs={'Input': input, - 'Weight': weight, - 'Bias': bias}, + inputs=inputs, outputs={ 'Hidden': hidden, 'Cell': cell, @@ -517,27 +496,31 @@ def dynamic_lstmp(input, cell_activation(str): The activation for cell output. Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". candidate_activation(str): The activation for candidate hidden state. - Choices = ["sigmoid", "tanh", - "relu", "identity"], + Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". proj_activation(str): The activation for projection output. - Choices = ["sigmoid", "tanh", - "relu", "identity"], + Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". dtype(str): Data type. Choices = ["float32", "float64"], default "float32". name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. Returns: - tuple: The projection of hidden state, and cell state of LSTMP. The \ - shape of projection is (T x P), for the cell state which is \ - (T x D), and both LoD is the same with the `input`. + tuple: A tuple of two output variable: the projection of hidden state, \ + and cell state of LSTMP. The shape of projection is (T x P), \ + for the cell state which is (T x D), and both LoD is the same \ + with the `input`. Examples: + .. code-block:: python + dict_dim, emb_dim = 128, 64 + data = fluid.layers.data(name='sequence', shape=[1], + dtype='int32', lod_level=1) + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim, proj_dim = 512, 256 - fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4, act=None, bias_attr=None) proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, size=hidden_dim * 4, @@ -603,10 +586,10 @@ def dynamic_gru(input, candidate_activation='tanh', h_0=None): """ - **Dynamic GRU Layer** + **Gated Recurrent Unit (GRU) Layer** Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on - Sequence Modeling `_ + Sequence Modeling `_ . The formula is as follows: @@ -651,18 +634,27 @@ def dynamic_gru(input, :attr:`False`. gate_activation(str): The activation for update gate and reset gate. Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid". - activation(str): The activation for candidate hidden state. + candidate_activation(str): The activation for candidate hidden state. Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". + h_0 (Variable): This is initial hidden state. If not set, default is + zero. This is a tensor with shape (N x D), where N is the number of + total time steps of input mini-batch feature and D is the hidden + size. Returns: Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ - and lod is the same with the input. + and sequence length is the same with the input. Examples: + .. code-block:: python + dict_dim, emb_dim = 128, 64 + data = fluid.layers.data(name='sequence', shape=[1], + dtype='int32', lod_level=1) + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 - x = fluid.layers.fc(input=data, size=hidden_dim * 3) + x = fluid.layers.fc(input=emb, size=hidden_dim * 3) hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) """ @@ -673,11 +665,13 @@ def dynamic_gru(input, attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) bias = helper.create_parameter( attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) + batch_size = input.shape[0] inputs = {'Input': input, 'Weight': weight, 'Bias': bias} if h_0 != None: assert h_0.shape == ( - size, size), 'The shape of h0 should be(%d, %d)' % (size, size) - inputs['h0'] = h_0 + batch_size, size + ), 'The shape of h0 should be(batch_size, %d)' % size + inputs['H0'] = h_0 hidden = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype) @@ -799,7 +793,25 @@ def gru_unit(input, return updated_hidden, reset_hidden_pre, gate +@templatedoc() def linear_chain_crf(input, label, param_attr=None): + """ + Linear Chain CRF. + + ${comment} + + Args: + input(${emission_type}): ${emission_comment} + input(${transition_type}): ${transition_comment} + label(${label_type}): ${label_comment} + param_attr(ParamAttr): The attribute of the learnable parameter. + + Returns: + output(${emission_exps_type}): ${emission_exps_comment} \n + output(${transition_exps_type}): ${transition_exps_comment} \n + output(${log_likelihood_type}): ${log_likelihood_comment} + + """ helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[1] transition = helper.create_parameter( @@ -825,7 +837,27 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood +@templatedoc() def crf_decoding(input, param_attr, label=None): + """ + ${comment} + + Args: + input(${emission_type}): ${emission_comment} + + param_attr(ParamAttr): The parameter attribute for training. + + label(${label_type}): ${label_comment} + + Returns: + Variable: ${viterbi_path_comment} + + Examples: + .. code-block:: python + + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -839,10 +871,17 @@ def crf_decoding(input, param_attr, label=None): return viterbi_path +@templatedoc() def cos_sim(X, Y): """ - This function performs the cosine similarity between two tensors - X and Y and returns that as the output. + ${comment} + + Args: + X (Variable): ${x_comment}. + Y (Variable): ${y_comment}. + + Returns: + Variable: the output of cosine(X, Y). """ helper = LayerHelper('cos_sim', **locals()) out = helper.create_tmp_variable(dtype=X.dtype) @@ -864,29 +903,30 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): Drop or keep each element of `x` independently. Dropout is a regularization technique for reducing overfitting by preventing neuron co-adaption during - training. The dropout operator randomly set (according to the given dropout + training. The dropout operator randomly sets (according to the given dropout probability) the outputs of some units to zero, while others are remain unchanged. Args: - x(variable): The input tensor. - dropout_prob(float): Probability of setting units to zero. - is_test(bool): A flag indicating whether it is in test phrase or not. - seed(int): A Python integer used to create random seeds. If this - parameter is set to None, a random seed is used. - NOTE: If an integer seed is given, always the same output - units will be dropped. DO NOT use a fixed seed in training. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + x (Variable): The input tensor variable. + dropout_prob (float): Probability of setting units to zero. + is_test (bool): A flag indicating whether it is in test phrase or not. + seed (int): A Python integer used to create random seeds. If this + parameter is set to None, a random seed is used. + NOTE: If an integer seed is given, always the same output + units will be dropped. DO NOT use a fixed seed in training. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: - Variable: A tensor variable. + Variable: A tensor variable is the shape with `x`. Examples: + .. code-block:: python - x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - droped = fluid.layers.dropout(input=x, dropout_rate=0.5) + x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + droped = fluid.layers.dropout(x, dropout_prob=0.5) """ helper = LayerHelper('dropout', **locals()) @@ -999,8 +1039,8 @@ def square_error_cost(input, label): * :math:`Out`: Output value, same shape with :math:`X`. Args: - input(Variable): Input tensor, has predictions. - label(Variable): Label tensor, has target labels. + input (Variable): Input tensor, has predictions. + label (Variable): Label tensor, has target labels. Returns: Variable: The tensor variable storing the element-wise squared error \ @@ -1029,14 +1069,101 @@ def square_error_cost(input, label): return square_out +@templatedoc() def chunk_eval(input, label, chunk_scheme, num_chunk_types, excluded_chunk_types=None): """ + **Chunk Evaluator** + This function computes and outputs the precision, recall and F1-score of chunk detection. + + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + + ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, + and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. + Here is a NER example of labeling for these tagging schemes: + + .. code-block:: python + + ====== ====== ====== ===== == ============ ===== ===== ===== == ========= + Li Ming works at Agricultural Bank of China in Beijing. + ====== ====== ====== ===== == ============ ===== ===== ===== == ========= + IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + ====== ====== ====== ===== == ============ ===== ===== ===== == ========= + + There are three chunk types(named entity types) including PER(person), ORG(organization) + and LOC(LOCATION), and we can see that the labels have the form -. + + Since the calculations actually use label ids rather than labels, extra attention + should be paid when mapping labels to ids to make CheckEvalOp work. The key point + is that the listed equations are satisfied by ids. + + .. code-block:: python + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + + where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` + is the num of chunk types, and `tag_type` get its value from the following table. + + .. code-block:: python + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + + Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, + PER and LOC. To satisfy the above equations, the label map can be like this: + + .. code-block:: python + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + + It's not hard to verify the equations noting that the num of chunk types + is 3 and the num of tag types in IOB scheme is 2. For example, the label + id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of + I-LOC is 2, which consistent with the results from the equations. + + Args: + input (Variable): prediction output of the network. + label (Variable): label of the test data set. + chunk_scheme (str): ${chunk_scheme_comment} + num_chunk_types (int): ${num_chunk_types_comment} + excluded_chunk_types (list): ${excluded_chunk_types_comment} + + Returns: + tuple: tuple containing: precision, recall, f1_score, + num_infer_chunks, num_label_chunks, + num_correct_chunks + + Examples: + .. code-block:: python + + crf = fluid.layers.linear_chain_crf( + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) + crf_decode = fluid.layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + fluid.layers.chunk_eval( + input=crf_decode, + label=label, + chunk_scheme="IOB", + num_chunk_types=(label_dict_len - 1) / 2) """ helper = LayerHelper("chunk_eval", **locals()) @@ -1069,6 +1196,7 @@ def chunk_eval(input, num_correct_chunks) +@templatedoc() def sequence_conv(input, num_filters, filter_size=3, @@ -1081,11 +1209,20 @@ def sequence_conv(input, This function creates the op for sequence_conv, using the inputs and other convolutional configurations for the filters and stride as given in the input parameters to the function. - """ - # FIXME(dzh) : want to unify the argument of python layer - # function. So we ignore some unecessary attributes. - # such as, padding_trainable, context_start. + Args: + input (Variable): ${x_comment} + num_filters (int): number of filters. + filter_size (int): the filter size (H and W). + filter_stride (int): stride of the filter. + padding (bool): if True, add paddings. + bias_attr (ParamAttr|None): attributes for bias + param_attr (ParamAttr|None): attributes for parameter + act (str): the activation type + + Returns: + Variable: output of sequence_conv + """ helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() @@ -1111,6 +1248,41 @@ def sequence_conv(input, def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): + """ + This function computes the softmax activation among all time-steps for each + sequence. The dimension of each time-step should be 1. Thus, the shape of + input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N` + is the sum of the length of all sequences. + + For i-th sequence in a mini-batch: + + .. math:: + + Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))} + + For example, for a mini-batch of 3 sequences with variable-length, + each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], + then softmax will be computed among :math:`X[0:2, :]`, :math:`X[2:5, :]`, + :math:`X[5:7, :]`, and :math:`N` turns out to be 7. + + Args: + input (Variable): The input variable which is a LoDTensor. + bias_attr (ParamAttr|None): attributes for bias + param_attr (ParamAttr|None): attributes for parameter + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ + library is installed. Default: True + + Returns: + Variable: output of sequence_softmax + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + x_sequence_softmax = fluid.layers.sequence_softmax(input=x) + """ helper = LayerHelper('sequence_softmax', **locals()) dtype = helper.input_dtype() softmax_out = helper.create_tmp_variable(dtype) @@ -1123,6 +1295,45 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): + """ + The input of the softmax layer is a 2-D tensor with shape N x K (N is the + batch_size, K is the dimension of input feature). The output tensor has the + same shape as the input tensor. + + For each row of the input tensor, the softmax operator squashes the + K-dimensional vector of arbitrary real values to a K-dimensional vector of real + values in the range [0, 1] that add up to 1. + + It computes the exponential of the given dimension and the sum of exponential + values of all the other dimensions in the K-dimensional vector input. + Then the ratio of the exponential of the given dimension and the sum of + exponential values of all the other dimensions is the output of the softmax + operator. + + For each row :math:`i` and each column :math:`j` in Input(X), we have: + + .. math:: + + Out[i, j] = \\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])} + + Args: + input (Variable): The input variable. + bias_attr (ParamAttr): attributes for bias + param_attr (ParamAttr): attributes for parameter + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ + library is installed. + + Returns: + Variable: output of softmax + + Examples: + + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10) + softmax = fluid.layers.softmax(input=fc) + + """ helper = LayerHelper('softmax', **locals()) dtype = helper.input_dtype() softmax_out = helper.create_tmp_variable(dtype) @@ -1148,14 +1359,17 @@ def conv2d(input, act=None, name=None): """ - **Convlution2D Layer** - The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are in NCHW format. Where N is batch size, C is the number of + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - The details of convolution layer, please refer UFLDL's `convolution, - `_ . + Filter is in MCHW format, where M is the number of output image channels, + C is the number of input image channels, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input image channels divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more detials. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. @@ -1166,62 +1380,64 @@ def conv2d(input, Out = \sigma (W \\ast X + b) - In the above equation: + Where: * :math:`X`: Input value, a tensor with NCHW format. * :math:`W`: Filter value, a tensor with MCHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be - different. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: - Input: - Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` - Output: - Output shape: $(N, C_{out}, H_{out}, W_{out})$ + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` Where .. math:: - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 - - Args: - input(Variable): The input image with [N, C, H, W] format. - num_filters(int): The number of filter. It is as same as the output - image channel. - filter_size(int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - stride(int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - dilation(int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: dilation = 1. - groups(int): The groups number of the Conv2d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1 - param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None - bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None - use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act(str): Activation type. Default: None - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + Args: + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False + act (str): Activation type. Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The tensor variable storing the convolution and \ @@ -1234,13 +1450,9 @@ def conv2d(input, Examples: .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.conv2d( - input=data, num_filters=2, filter_size=3, act="relu") + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ - if stride is None: - stride = [1, 1] num_channels = input.shape[1] @@ -1303,6 +1515,168 @@ def conv2d(input, return helper.append_activation(pre_act) +def conv3d(input, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + use_mkldnn=False, + act=None, + name=None): + """ + **Convlution3D Layer** + + The convolution3D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are in NCDHW format. Where N is batch size C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convlution3D is similar with Convlution2D + but adds one dimension(depth). If bias attribution and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` + + - Output: + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + input (Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv3d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None + bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not. + act (str): Activation type. Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") + """ + + l_type = 'conv3d' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + + num_channels = input.shape[1] + + if groups is None: + num_filter_channels = num_channels + else: + if num_channels % groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels / groups + + filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') + stride = utils.convert_to_list(stride, 3, 'stride') + padding = utils.convert_to_list(padding, 3, 'padding') + dilation = utils.convert_to_list(dilation, 3, 'dilation') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + input_shape = input.shape + filter_shape = [num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + std = (2.0 / (filter_size[0]**3 * num_channels))**0.5 + return Normal(0.0, std, 0) + + filter_param = helper.create_parameter( + attr=helper.param_attr, + shape=filter_shape, + dtype=dtype, + default_initializer=_get_default_param_initializer()) + + pre_bias = helper.create_tmp_variable(dtype) + + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': use_mkldnn + }) + + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + + return helper.append_activation(pre_act) + + def sequence_pool(input, pool_type): """ This function add the operator for sequence pooling. @@ -1319,13 +1693,13 @@ def sequence_pool(input, pool_type): .. code-block:: text x is a 1-level LoDTensor: - x.lod = [[0, 2, 5, 7]] + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: out.dim = [3, 1] - with condition len(x.lod[-1]) - 1 == out.dims[0] + with condition len(x.lod[-1]) == out.dims[0] for different pool_type: average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 @@ -1379,18 +1753,18 @@ def sequence_pool(input, pool_type): def sequence_first_step(input): """ - This funciton get the first step of sequence. + This function gets the first step of sequence. .. code-block:: text x is a 1-level LoDTensor: - x.lod = [[0, 2, 5, 7]] + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: out.dim = [3, 1] - with condition len(x.lod[-1]) - 1 == out.dims[0] + with condition len(x.lod[-1]) == out.dims[0] out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) Args: @@ -1412,18 +1786,18 @@ def sequence_first_step(input): def sequence_last_step(input): """ - This funciton get the last step of sequence. + This function gets the last step of sequence. .. code-block:: text x is a 1-level LoDTensor: - x.lod = [[0, 2, 5, 7]] + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: out.dim = [3, 1] - with condition len(x.lod[-1]) - 1 == out.dims[0] + with condition len(x.lod[-1]) == out.dims[0] out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) Args: @@ -1443,6 +1817,7 @@ def sequence_last_step(input): return sequence_pool(input=input, pool_type="last") +@templatedoc() def pool2d(input, pool_size=-1, pool_type="max", @@ -1454,8 +1829,45 @@ def pool2d(input, use_mkldnn=False, name=None): """ - This function adds the operator for pooling in 2 dimensions, using the - pooling configurations mentioned in input parameters. + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int): The side length of pooling windows. All pooling + windows are squares with pool_size on a side. + pool_type: ${pooling_type_comment} + pool_stride (int): stride of the pooling layer. + pool_padding (int): padding size. + global_pooling: ${global_pooling_comment} + use_cudnn: ${use_cudnn_comment} + ceil_mode: ${ceil_mode_comment} + use_mkldnn: ${use_mkldnn_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: If 'pool_type' is not "max" nor "avg" + ValueError: If 'global_pooling' is False and 'pool_size' is -1 + ValueError: If 'use_cudnn' is not a bool value. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.pool2d( + input=data, + pool_size=2, + pool_type='max', + pool_stride=1, + global_pooling=False) """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -1474,12 +1886,84 @@ def pool2d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - helper = LayerHelper('pool2d', **locals()) + l_type = 'pool2d' + + helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_tmp_variable(dtype) helper.append_op( - type="pool2d", + type=l_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding, + "use_cudnn": use_cudnn, + "ceil_mode": ceil_mode, + "use_mkldnn": use_mkldnn + }) + + return pool_out + + +def pool3d(input, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + use_mkldnn=False, + name=None): + """ + This function adds the operator for pooling in 3-dimensions, using the + pooling configurations mentioned in input parameters. + + Args: + input (Variable): ${input_comment} + pool_size (int): ${ksize_comment} + pool_type (str): ${pooling_type_comment} + pool_stride (int): stride of the pooling layer. + pool_padding (int): padding size. + global_pooling (bool): ${global_pooling_comment} + use_cudnn (bool): ${use_cudnn_comment} + ceil_mode (bool): ${ceil_mode_comment} + use_mkldnn (bool): ${use_mkldnn_comment} + name (str): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: output of pool3d layer. + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. Received pool_size: " + str(pool_size)) + + pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') + pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding') + pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + l_type = "pool3d" + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type=l_type, inputs={"X": input}, outputs={"Out": pool_out}, attrs={ @@ -1511,8 +1995,57 @@ def batch_norm(input, moving_variance_name=None, do_model_average_for_mean_and_var=False): """ - This function helps create an operator to implement - the BatchNorm layer using the configurations from the input parameters. + **Batch Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Args: + input(variable): The input variable which is a LoDTensor. + act(string, Default None): Activation type, linear|relu|prelu|... + is_test(bool, Default False): Used for training or training. + momentum(float, Default 0.9): + epsilon(float, Default 1e-05): + param_attr(ParamAttr): The parameter attribute for Parameter `scale`. + bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. + data_layout(string, default NCHW): NCHW|NHWC + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + use_mkldnn(bool, Default false): ${use_mkldnn_comment} + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. + moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. + do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + + Returns: + Variable: A tensor variable which is the result after applying batch normalization on the input. + + Examples: + + .. code-block:: python + + hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.batch_norm(input=hidden1) """ helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() @@ -1594,6 +2127,7 @@ def batch_norm(input, return helper.append_activation(batch_norm_out) +@templatedoc() def layer_norm(input, scale=True, shift=True, @@ -1604,20 +2138,11 @@ def layer_norm(input, act=None, name=None): """ - **Layer Normalization** - - Assume feature vectors exist on dimensions - :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics - along these dimensions for each feature vector :math:`a` with size - :math:`H`, then normalize each feature vector using the corresponding - statistics. After that, apply learnable gain and bias on the normalized - tensor to scale and shift if :attr:`scale` and :attr:`shift` are set. - - Refer to `Layer Normalization `_ + ${comment} The formula is as follows: - .. math:: + .. math:: \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i @@ -1625,6 +2150,15 @@ def layer_norm(input, h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + * :math:`a`: the vector representation of the summed inputs to the neurons + in that layer. + + * :math:`H`: the number of hidden units in a layers + + * :math:`g`: the trainable scale parameter. + + * :math:`b`: the trainable bias parameter. + Args: input(Variable): The input tensor variable. scale(bool): Whether to learn the adaptive gain :math:`g` after @@ -1640,16 +2174,16 @@ def layer_norm(input, bias_attr(ParamAttr|None): The parameter attribute for the learnable bias :math:`b`. act(str): Activation to be applied to the output of layer normalizaiton. + name (str): The name of this layer. It is optional. Returns: - Variable: A tensor variable with the same shape as the input. + ${y_comment} Examples: - .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ helper = LayerHelper('layer_norm', **locals()) dtype = helper.input_dtype() @@ -1690,23 +2224,6 @@ def layer_norm(input, return helper.append_activation(layer_norm_out) -def beam_search_decode(ids, scores, name=None): - helper = LayerHelper('beam_search_decode', **locals()) - sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) - sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) - - helper.append_op( - type="beam_search_decode", - inputs={"Ids": ids, - "Scores": scores}, - outputs={ - "SentenceIds": sentence_ids, - "SentenceScores": sentence_scores - }) - - return sentence_ids, sentence_scores - - def conv2d_transpose(input, num_filters, output_size=None, @@ -1731,32 +2248,36 @@ def conv2d_transpose(input, represent height and width, respectively. The details of convolution transpose layer, please refer to the following explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. For each input :math:`X`, the equation is: .. math:: - Out = W \\ast X + Out = \sigma (W \\ast X + b) - In the above equation: + Where: * :math:`X`: Input value, a tensor with NCHW format. * :math:`W`: Filter value, a tensor with MCHW format. - * :math:`\\ast` : Convolution transpose operation. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be - different. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: - Input: - Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - Output: - Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -1766,54 +2287,52 @@ def conv2d_transpose(input, W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 Args: - input(Variable): The input image with [N, C, H, W] format. - num_filters(int): The number of the filter. It is as same as the output - image channel. - output_size(int|tuple|None): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). This - parameter only works when filter_size is None. - filter_size(int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. None if use output size to - calculate filter_size. - padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - stride(int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - dilation(int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: dilation = 1. - groups(int): The groups number of the Conv2d transpose layer. Inspired by - grouped convolution in Alex Krizhevsky's Deep CNN paper, in which - when group=2, the first half of the filters is only connected to the - first half of the input channels, while the second half of the - filters is only connected to the second half of the input channels. - Default: groups=1 - param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. - Default: None - bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None - use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act(str): Activation type. Default: None - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - Variable: The tensor variable storing the convolution transpose result. + input(Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain two integers, (image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups(int): The groups number of the Conv2d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1 + param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. + Default: None + bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act(str): Activation type. Default: None + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable storing the convolution transpose result. Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. Examples: .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - conv2d_transpose = fluid.layers.conv2d_transpose( - input=data, num_filters=2, filter_size=3) + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ helper = LayerHelper("conv2d_transpose", **locals()) if not isinstance(input, Variable): @@ -1869,6 +2388,175 @@ def conv2d_transpose(input, return out +def conv3d_transpose(input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None): + """ + **Convlution3D transpose layer** + + The convolution3D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. Input(Input) and output(Output) + are in NCDHW format. Where N is batch size, C is the number of channels, + D is the depth of the feature, H is the height of the feature, and W + is the width of the feature. Parameters(dilations, strides, paddings) are + two elements. These two elements represent height and width, respectively. + The details of convolution transpose layer, please refer to the following + explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ + H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ + W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + + Args: + input(Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain three integers, (image_D, image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups(int): The groups number of the Conv3d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1 + param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer. + Default: None + bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act(str): Activation type. Default: None + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable storing the convolution transpose result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3) + """ + l_type = "conv3d_transpose" + helper = LayerHelper(l_type, **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv3d_transpose must be Variable") + input_channel = input.shape[1] + + padding = utils.convert_to_list(padding, 3, 'padding') + stride = utils.convert_to_list(stride, 3, 'stride') + dilation = utils.convert_to_list(dilation, 3, 'dilation') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + if filter_size is None: + if output_size is None: + raise ValueError("output_size must be set when filter_size is None") + if isinstance(output_size, int): + output_size = [output_size, output_size] + + d_in = input.shape[2] + h_in = input.shape[3] + w_in = input.shape[4] + + filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 * + padding[0] - 1) / dilation[0] + 1 + filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 * + padding[1] - 1) / dilation[1] + 1 + filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 * + padding[2] - 1) / dilation[2] + 1 + filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + filter_size = utils.convert_to_list(filter_size, 3, + 'conv3d_transpose.filter_size') + + groups = 1 if groups is None else groups + filter_shape = [input_channel, num_filters / groups] + filter_size + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) + + pre_bias = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type=l_type, + inputs={'Input': [input], + 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn + }) + + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + out = helper.append_activation(pre_act) + return out + + def sequence_expand(x, y, ref_level=-1, name=None): """Sequence Expand Layer. This layer will expand the input variable **x** according to specified level lod of **y**. Please note that lod level of @@ -1880,18 +2568,18 @@ def sequence_expand(x, y, ref_level=-1, name=None): * Case 1 x is a LoDTensor: - x.lod = [[0, 2, 4]] + x.lod = [[2, 2]] x.data = [[a], [b], [c], [d]] x.dims = [4, 1] y is a LoDTensor: - y.lod = [[0, 2, 4], - [0, 3, 6, 7, 8]] + y.lod = [[2, 2], + [3, 3, 1, 1]] ref_level: 0 then output is a 1-level LoDTensor: - out.lod = [[0, 2, 4, 6, 8]] + out.lod = [[2, 2, 2, 2]] out.data = [[a], [b], [a], [b], [c], [d], [c], [d]] out.dims = [8, 1] @@ -1901,7 +2589,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): x.dims = [3, 1] y is a LoDTensor: - y.lod = [[0, 2, 2, 5]] + y.lod = [[2, 0, 3]] ref_level: -1 @@ -1939,10 +2627,89 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp -def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): - ''' - This function implements the beam search algorithm. - ''' +def beam_search(pre_ids, + pre_scores, + ids, + scores, + beam_size, + end_id, + level=0, + name=None): + """ + Beam search is a classical algorithm for selecting candidate words in a + machine translation task. + + Refer to `Beam search `_ + for more details. + + This layer does the search in beams for one time step. Specifically, it + selects the top-K candidate word ids of current step from :attr:`ids` + according to their :attr:`scores` for all source sentences, where K is + :attr:`beam_size` and :attr:`ids, scores` are predicted results from the + computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are + the output of beam_search at previous step, they are needed for special use + to handle ended candidate translations. + + Note that the :attr:`scores` passed in should be accumulated scores, and + length penalty should be done with extra operators before calculating the + accumulated scores if needed, also suggest finding top-K before it and + using the top-K candidates following. + + Please see the following demo for a fully beam search usage example: + + fluid/tests/book/test_machine_translation.py + + Args: + pre_ids(Variable): The LodTensor variable which is the output of + beam_search at previous step. It should be a LodTensor with shape + :math:`(batch_size, 1)` and lod + :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the + first step. + pre_scores(Variable): The LodTensor variable which is the output of + beam_search at previous step. + ids(Variable): The LodTensor variable containing the candidates ids. + Its shape should be :math:`(batch_size \\times beam_size, K)`, + where :math:`K` supposed to be :attr:`beam_size`. + scores(Variable): The LodTensor variable containing the accumulated + scores corresponding to :attr:`ids` and its shape is the same as + the shape of :attr:`ids`. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + level(int, default 0): It can be ignored and mustn't change currently. + It means the source level of lod, which is explained as following. + The lod level of :attr:`ids` should be 2. The first level is source + level which describes how many prefixes (branchs) for each source + sentece (beam), and the second level is sentence level which + describes how these candidates belong to the prefix. The paths + linking prefixes and selected candidates are organized and reserved + in lod. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the selected ids and the \ + corresponding scores. + + Examples: + .. code-block:: python + + # Suppose `probs` contains predicted results from the computation + # cell and `pre_ids` and `pre_scores` is the output of beam_search + # at previous step. + topk_scores, topk_indices = layers.topk(probs, k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores)), + y=layers.reshape( + pre_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=end_id) + """ helper = LayerHelper('beam_search', **locals()) score_type = scores.dtype id_type = ids.dtype @@ -1954,6 +2721,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): type='beam_search', inputs={ 'pre_ids': pre_ids, + 'pre_scores': pre_scores, 'ids': ids, 'scores': scores, }, @@ -1971,6 +2739,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): return selected_ids, selected_scores +def beam_search_decode(ids, scores, beam_size, end_id, name=None): + """ + Beam Search Decode Layer. This layer constructs the full hypotheses for + each source sentence by walking back along the LoDTensorArray :attr:`ids` + whose lods can be used to restore the path in the beam search tree. + Please see the following demo for a fully beam search usage example: + fluid/tests/book/test_machine_translation.py + + Args: + ids(Variable): The LodTensorArray variable containing the selected ids + of all steps. + scores(Variable): The LodTensorArray variable containing the selected + scores of all steps. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the generated id sequences \ + and the corresponding scores. The shapes and lods of the two \ + LodTensor are same. The lod level is 2 and the two levels \ + separately indicate how many hypotheses each source sentence has \ + and how many ids each hypothesis has. + + Examples: + .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving + # the selected ids and scores of all steps + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=5, end_id=0) + """ + helper = LayerHelper('beam_search_decode', **locals()) + sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) + sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) + + helper.append_op( + type="beam_search_decode", + inputs={"Ids": ids, + "Scores": scores}, + outputs={ + "SentenceIds": sentence_ids, + "SentenceScores": sentence_scores + }, + attrs={"beam_size": beam_size, + "end_id": end_id}) + + return sentence_ids, sentence_scores + + def lstm_unit(x_t, hidden_t_prev, cell_t_prev, @@ -2149,23 +2967,24 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): def reduce_mean(input, dim=None, keep_dim=False, name=None): """ - Computes the mean of tensor elements over the given dimension. + Computes the mean of the input tensor's elements along the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (list|int|None): The dimensions along which the mean is computed. If - :attr:`None`, compute the mean over all elements of :attr:`input` - and return a Tensor variable with a single element, otherwise + dim (list|int|None): The dimension along which the mean is computed. If + `None`, compute the mean over all elements of :attr:`input` + and return a variable with a single element, otherwise it must be in the range :math:`[-rank(input), rank(input))`. If - :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. + :math:`dim[i] < 0`, the dimension to reduce is + :math:`rank(input) + dim[i]`. keep_dim (bool): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. - name(str|None): A name for this layer(optional). If set None, the layer + name(str|None): A name for this layer(optional). If set `None`, the layer will be named automatically. Returns: - Variable: The reduced Tensor variable. + Variable: The reduced mean Variable. Examples: .. code-block:: python @@ -2387,7 +3206,7 @@ def split(input, num_or_sections, dim=-1, name=None): will be named automatically. Returns: - List: The list of segmented tensor variables. + list(Variable): The list of segmented tensor variables. Examples: .. code-block:: python @@ -2437,77 +3256,51 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): The l2 normalize layer normalizes `x` along dimension `axis` using an L2 norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes - output = x / sqrt(max(sum(x**2), epsilon)) + .. math:: + + y = \\frac{x}{ \sqrt{\sum {x^2} + epsion }} For `x` with more dimensions, this layer independently normalizes each 1-D slice along dimension `axis`. Args: - x(Variable|list): The input tensor to l2_normalize layer. - axis(int): Dimension along which to normalize the input. - epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will - be used as the divisor if the l2 norm of `x` is less than - sqrt(epsilon). - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - + x(Variable|list): The input tensor to l2_normalize layer. + axis(int): The axis on which to apply normalization. If `axis < 0`, \ + the dimension to normalization is rank(X) + axis. -1 is the + last dimension. + epsilon(float): The epsilon value is used to avoid division by zero, \ + the defalut value is 1e-10. + name(str|None): A name for this layer(optional). If set None, the layer \ + will be named automatically. Returns: - Variable: The output tensor variable. + Variable: The output tensor variable is the same shape with `x`. Examples: + .. code-block:: python - data = fluid.layers.data(name="data", - shape=(3, 17, 13), - dtype="float32") - normed = fluid.layers.l2_normalize(x=data, axis=1) + data = fluid.layers.data(name="data", + shape=(3, 17, 13), + dtype="float32") + normed = fluid.layers.l2_normalize(x=data, axis=1) """ if len(x.shape) == 1: axis = 0 helper = LayerHelper("l2_normalize", **locals()) - square = helper.create_tmp_variable(dtype=x.dtype) - helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square}) - - reduced_sum = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_tmp_variable(dtype=x.dtype) + norm = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( - type="reduce_sum", - inputs={"X": square}, - outputs={"Out": reduced_sum}, + type="norm", + inputs={"X": x}, + outputs={"Out": out, + "Norm": norm}, attrs={ - "dim": [1] if axis is None else [axis], - "keep_dim": True, - "reduce_all": False + "axis": 1 if axis is None else axis, + "epsilon": epsilon, }) - - # TODO(caoying) A lower bound value epsilon for the norm is needed to - # imporve the numeric stability of reciprocal. This requires a maximum_op. - rsquare = helper.create_tmp_variable(dtype=x.dtype) - helper.append_op( - type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare}) - - # TODO(caoying) the current elementwise_mul operator does not support a - # general broadcast rule which broadcasts input(Y) to have the same - # dimension with Input(X) starting from a specified dimension. So this - # exanpsion is requred. Once a general broadcast rule is spported, this - # expanding canbe removed. - rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype) - expand_times = [1] * len(x.shape) - expand_times[axis] = int(x.shape[axis]) - helper.append_op( - type="expand", - inputs={"X": rsquare}, - outputs={"Out": rsquare_expanded}, - attrs={"expand_times": expand_times}) - - out = helper.create_tmp_variable(dtype=x.dtype) - helper.append_op( - type="elementwise_mul", - inputs={"X": x, - "Y": rsquare_expanded}, - outputs={"Out": out}) return out @@ -2622,25 +3415,51 @@ def topk(input, k, name=None): This operator is used to find values and indices of the k largest entries for the last dimension. - If the input is a vector (rank=1), finds the k largest entries in the vector + If the input is a vector (1-D Tensor), finds the k largest entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. If the input is a Tensor with higher rank, this operator computes the top k entries along the last dimension. + For example: + + .. code-block:: text + + If: + input = [[5, 4, 2, 3], + [9, 7, 10, 25], + [6, 2, 10, 1]] + k = 2 + + Then: + The first output: + values = [[5, 4], + [10, 25], + [6, 10]] + + The second output: + indices = [[0, 1], + [2, 3], + [0, 2]] + Args: input(Variable): The input variable which can be a vector or Tensor with higher rank. - k(int): An integer value to specify the top k largest elements. + k(int): The number of top elements to look for along the last dimension + of input. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + Default: None Returns: - values(Variable): The k largest elements along each last dimensional - slice. - indices(Variable): The indices of values within the last dimension of - input. + Tuple[Variable]: A tuple with two elements. Each element is a Variable. + The first one is k largest elements along each last + dimensional slice. The second one is indices of values + within the last dimension of input. + + Raises: + ValueError: If k < 1 or k is not less than the last dimension of input Examples: .. code-block:: python @@ -2648,7 +3467,7 @@ def topk(input, k, name=None): top5_values, top5_indices = layers.topk(input, k=5) """ shape = input.shape - if k < 1 and k >= shape[-1]: + if k < 1 or k >= shape[-1]: raise ValueError("k must be greater than 0 and less than %d." % (shape[-1])) @@ -2666,8 +3485,7 @@ def topk(input, k, name=None): return values, indices -def edit_distance(input, label, normalized=True, ignored_tokens=None, - name=None): +def edit_distance(input, label, normalized=True, ignored_tokens=None): """ EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. Edit distance, also called @@ -2681,26 +3499,23 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, "kitten" -> "sitten" -> "sittin" -> "sitting" - Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with + The input is a LoDTensor consisting of all the hypothesis strings with the total number denoted by `batch_size`, and the separation is specified by the LoD information. And the `batch_size` reference strings are arranged - in order in the same way in the LoDTensor Input(Refs). + in order in the same way in the input LoDTensor. - Output(Out) contains the `batch_size` results and each stands for the edit + The output contains the `batch_size` results and each stands for the edit distance for a pair of strings respectively. If Attr(normalized) is true, the edit distance will be divided by the length of reference string. Args: - input(Variable): The indices for hypothesis strings. - label(Variable): The indices for reference strings. - - normalized(bool): Indicated whether to normalize the edit distance by + normalized(bool, default True): Indicated whether to normalize the edit distance by the length of reference string. - - ignored_tokens(list of int): Tokens that should be removed before + ignored_tokens(list, default None): Tokens that should be removed before calculating edit distance. + name (str): The name of this layer. It is optional. Returns: Variable: sequence-to-sequence edit distance in shape [batch_size, 1]. @@ -2710,7 +3525,6 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, x = fluid.layers.data(name='x', shape=[8], dtype='float32') y = fluid.layers.data(name='y', shape=[7], dtype='float32') - cost = fluid.layers.edit_distance(input=x,label=y) """ helper = LayerHelper("edit_distance", **locals()) @@ -2751,6 +3565,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, def ctc_greedy_decoder(input, blank, name=None): """ This op is used to decode sequences by greedy policy by below steps: + 1. Get the indexes of max value for each row in input. a.k.a. numpy.argmax(input, axis=0). 2. For each sequence in result of step1, merge repeated tokens between two @@ -2772,7 +3587,7 @@ def ctc_greedy_decoder(input, blank, name=None): [0.2, 0.2, 0.1, 0.5], [0.5, 0.1, 0.3, 0.1]] - input.lod = [[0, 4, 8]] + input.lod = [[4, 4]] Then: @@ -2780,7 +3595,7 @@ def ctc_greedy_decoder(input, blank, name=None): [1], [3]] - output.lod = [[0, 2, 3]] + output.lod = [[2, 1]] Args: @@ -2790,14 +3605,14 @@ def ctc_greedy_decoder(input, blank, name=None): where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). - blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). + name (str): The name of this layer. It is optional. Returns: Variable: CTC greedy decode result. If all the sequences in result were - empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1]. + empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1]. Examples: .. code-block:: python @@ -2830,35 +3645,33 @@ def warpctc(input, label, blank=0, norm_by_times=False): input tensor. Args: - input(Variable): (LodTensor, default: LoDTensor), - the unscaled probabilities of variable-length sequences, + input (Variable): The unscaled probabilities of variable-length sequences, which is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + 1], where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). - label(Variable): (LodTensor, default: LoDTensor), the ground truth - of variable-length sequence, which is a 2-D Tensor with LoD - information. It is of the shape [Lg, 1], where Lg is th sum of - all labels' length. - blank: (int, default: 0), the blank label index of Connectionist + label (Variable): The ground truth of variable-length sequence, + which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1], + where Lg is th sum of all labels' length. + blank (int, default 0): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). - norm_by_times: (bool, default: false), whether to normalize - the gradients by the number of time-step, which is also the - sequence's length. There is no need to normalize the gradients - if warpctc layer was follewed by a mean_op. + norm_by_times(bool, default false): Whether to normalize the gradients + by the number of time-step, which is also the sequence's length. + There is no need to normalize the gradients if warpctc layer was + follewed by a mean_op. Returns: Variable: The Connectionist Temporal Classification (CTC) loss, which is a 2-D Tensor of the shape [batch_size, 1]. Examples: + .. code-block:: python - y = layers.data( - name='y', shape=[11, 8], dtype='float32', lod_level=1) - y_predict = layers.data( - name='y_predict', shape=[11, 1], dtype='float32') - cost = layers.warpctc(input=y_predict, label=y) + + label = fluid.layers.data(shape=[11, 8], dtype='float32', lod_level=1) + predict = fluid.layers.data(shape=[11, 1], dtype='float32') + cost = fluid.layers.warpctc(input=predict, label=label) """ helper = LayerHelper('warpctc', **locals()) @@ -2888,16 +3701,20 @@ def sequence_reshape(input, new_dim): x is a LoDTensor: x.lod = [[0, 2, 6]] - x.data = [[1, 2], [3, 4], - [5, 6], [7, 8], [9, 10], [11, 12]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], + [9, 10], [11, 12]] x.dims = [6, 2] set new_dim = 4 then out is a LoDTensor: + out.lod = [[0, 1, 3]] - out.data = [[1, 2, 3, 4], - [5, 6, 7, 8], [9, 10, 11, 12]] + + out.data = [[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12]] out.dims = [3, 4] Currently, only 1-level LoDTensor is supported and please make sure @@ -2905,19 +3722,19 @@ def sequence_reshape(input, new_dim): no remainder for each sequence. Args: - input (Variable): (LodTensor, default: LoDTensor), a 2-D LoDTensor - with shape being [N, M] where M for dimension. - new_dim (int): New dimension which the input LoDTensor is reshaped to. + + input (Variable): A 2-D LoDTensor with shape being [N, M] where M for dimension. + new_dim (int): New dimension that the input LoDTensor is reshaped to. Returns: + Variable: Reshaped LoDTensor according to new dimension. Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[5, 20], - dtype='float32', lod_level=1) - x_reshaped = layers.sequence_reshape(input=x, new_dim=10) + x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1) + x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10) """ helper = LayerHelper('sequence_reshape', **locals()) out = helper.create_tmp_variable(helper.input_dtype()) @@ -2929,7 +3746,10 @@ def sequence_reshape(input, new_dim): return out -@autodoc() +# FIXME(wuyi): let docstring_checker.py understand @autodoc. +# For now, the comments in c++ use types like Tensor, but in python side +# the type is often "Variable", and arguments may vary. +@templatedoc(op_type="nce") def nce(input, label, num_total_classes, @@ -2937,6 +3757,49 @@ def nce(input, param_attr=None, bias_attr=None, num_neg_samples=None): + """ + ${comment} + + Args: + input (Variable): input variable. + label (Variable): label. + num_total_classes (int):${num_total_classes_comment} + sample_weight (Variable|None): A Variable of shape [batch_size, 1] + storing a weight for each sample. The default weight for each + sample is 1.0. + param_attr (ParamAttr|None): attributes for parameter + bias_attr (ParamAttr|None): attributes for bias + num_neg_samples (int): ${num_neg_samples_comment} + + Returns: + Variable: The output nce loss. + + Examples: + .. code-block:: python + + window_size = 5 + words = [] + for i in xrange(window_size): + words.append(layers.data( + name='word_{0}'.format(i), shape=[1], dtype='int64')) + + dict_size = 10000 + label_word = int(window_size / 2) + 1 + + embs = [] + for i in xrange(window_size): + if i == label_word: + continue + + emb = layers.embedding(input=words[i], size=[dict_size, 32], + param_attr='emb.w', is_sparse=True) + embs.append(emb) + + embs = layers.concat(input=embs, axis=1) + loss = layers.nce(input=embs, label=words[label_word], + num_total_classes=dict_size, param_attr='nce.w', + bias_attr='nce.b') + """ helper = LayerHelper('nce', **locals()) assert isinstance(input, Variable) dim = input.shape[1] @@ -2986,16 +3849,15 @@ def nce(input, def transpose(x, perm, name=None): """ - **transpose Layer** - Permute the dimensions of `input` according to `perm`. The `i`-th dimension of the returned tensor will correspond to the perm[i]-th dimension of `input`. Args: - input (Variable): (Tensor), A Tensor. - perm (list): A permutation of the dimensions of `input`. + x (Variable): The input Tensor. + perm (list): A permutation of the dimensions of `input`. + name (str): The name of this layer. It is optional. Returns: Variable: A transposed Tensor. @@ -3076,8 +3938,6 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): Examples: - As an example: - .. code-block:: text Given: @@ -3119,9 +3979,9 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): output.dims = {8, 9} - output.lod = [[0, 4, 8]] + output.lod = [[4, 4]] - The simple usage is: + Examples: .. code-block:: python @@ -3154,29 +4014,13 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): return out +@templatedoc() def row_conv(input, future_context_size, param_attr=None, act=None): - """Row Conv Operator. This layer will apply lookahead convolution to - **input**. The input variable should be a 2D LoDTensor with shape [T, D]. - Parameters with shape [future_context_size + 1, D] will be created. The math - equation of row convolution is as follows: - - .. math:: - Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j} - - In the above equation: - - * :math:`Out_{i}`: The i-th row of output variable with shape [1, D]. - * :math:`\\tau`: Future context size. - * :math:`X_{j}`: The j-th row of input variable with shape [1, D]. - * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D]. - - More details about row_conv please refer to the paper \ - (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and - the design document \ - (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645). + """ + ${comment} Args: - input (Variable): Input variable, a 2D LoDTensor with shape [T, D]. + input (${x_type}): ${x_comment}. future_context_size (int): Future context size. Please note, the shape of convolution kernel is [future_context_size + 1, D]. param_attr (ParamAttr): Attributes of parameters, including @@ -3184,14 +4028,13 @@ def row_conv(input, future_context_size, param_attr=None, act=None): act (str): Non-linear activation to be applied to output variable. Returns: - Variable: The output tensor with same shape as input tensor. + ${out_comment}. Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[16], - dtype='float32', lod_level=1) - out = fluid.layers.row_conv(input=x, future_context_size=2) + >>> import paddle.fluid as fluid + >>> x = fluid.layers.data(name='x', shape=[16], + >>> dtype='float32', lod_level=1) + >>> out = fluid.layers.row_conv(input=x, future_context_size=2) """ helper = LayerHelper('row_conv', **locals()) dtype = helper.input_dtype() @@ -3207,42 +4050,23 @@ def row_conv(input, future_context_size, param_attr=None, act=None): return helper.append_activation(out) +@templatedoc() def multiplex(inputs, index): """ - **Multiplex Layer** + ${comment} - Referring to the given index variable, this layer selects rows from the - input variables to construct a multiplex variable. Assuming that there are - :math:`m` input variables and :math:`I_i` represents the i-th input - variable and :math:`i` is in [0, :math:`m`). All input variables are - tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. - Please note that rank of the input tensor should be at least 2. Each input - variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`] - where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2` - * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input - variable. The given index variable should be a 2-D tensor with shape - [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable. - Then the output variable will be a tensor with shape [:math:`d_0`, - :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D - matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th - row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`. + >>> import paddle.fluid as fluid + >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') + >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') + >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32') + >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index) Args: - inputs (list): A list of variables to gather from. All variables have the - same shape and the rank is at least 2. - index (Variable): Tensor, index variable which is a 2-D tensor - with shape [M, 1] where M is the batch size. + inputs (list): ${x_comment}. + index (${ids_type}): ${ids_comment}. Returns: - Variable: Multiplex variable gathered from input variables. - - Examples: - .. code-block:: python - - x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') - x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') - index = fluid.layers.data(name='index', shape=[1], dtype='int32') - out = fluid.layers.multiplex(inputs=[x1, x2], index=index) + ${out_comment}. """ helper = LayerHelper('multiplex', **locals()) @@ -3328,31 +4152,30 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): """ - **Smooth L1 Loss Operator. ** - - This operator computes the smooth L1 loss for X and Y. - The operator takes the first dimension of X and Y as batch size. + This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. + It takes the first dimension of :attr:`x` and :attr:`y` as batch size. For each instance, it computes the smooth L1 loss element by element first - and then sums all the losses. So the shape of Out is [batch_size, 1]. + and then sums all the losses. So the shape of ouput Variable is + [batch_size, 1]. Args: x (Variable): A tensor with rank at least 2. The input value of smooth L1 loss op with shape [batch_size, dim1, ..., dimN]. y (Variable): A tensor with rank at least 2. The target value of smooth - L1 loss op with same shape as x. + L1 loss op with same shape as :attr:`x`. inside_weight (Variable|None): A tensor with rank at least 2. This - input is optional and should have same shape with x. If provided, - the result of (x - y) will be multiplied by this tensor element by - element. + input is optional and should have same shape with :attr:`x`. If + provided, the result of (:attr:`x` - :attr:`y`) will be multiplied + by this tensor element by element. outside_weight (Variable|None): A tensor with rank at least 2. This - input is optional and should have same shape with x. If provided, - the out smooth L1 loss will be multiplied by this tensor element - by element. - sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar - with default value 1.0. + input is optional and should have same shape with :attr:`x`. If + provided, the out smooth L1 loss will be multiplied by this tensor + element by element. + sigma (float|None): Hyper parameter of smooth L1 loss layer. A float + scalar with default value 1.0. + Returns: - Variable: A tensor with rank be 2. The output smooth L1 loss with - shape [batch_size, 1]. + Variable: The output smooth L1 loss with shape [batch_size, 1]. Examples: .. code-block:: python @@ -3363,6 +4186,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): fc = fluid.layers.fc(input=data, size=100) out = fluid.layers.smooth_l1(x=fc, y=label) """ + helper = LayerHelper('smooth_l1_loss', **locals()) diff = helper.create_tmp_variable(dtype=x.dtype) loss = helper.create_tmp_variable(dtype=x.dtype) @@ -3382,32 +4206,20 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): def one_hot(input, depth): """ - One Hot Operator. This operator creates the one-hot representations for input - index values. The following example will help to explain the function of this - operator. + This layer creates the one-hot representations for input indices. Args: - input(variable): A Tensor/LodTensor of indices, last dimension must be 1. - depth(scalar): an interger defining the depth of the one hot dimension. + input(Variable): Input indices, last dimension must be 1. + depth(scalar): An interger defining the depth of the one-hot dimension. Returns: - The one-hot tensor or LodTensor, same as input. + Variable: The one-hot representations of input. Examples: .. code-block:: python - X is a LoDTensor: - X.lod = [[0, 1, 4]] - X.shape = [4, 1] - X.data = [[1], [1], [3], [0]] - set depth = 4 - Out is a LoDTensor: - Out.lod = [[0, 1, 4]] - Out.shape = [4, 4] - Out.data = [[0., 1., 0., 0.], - [0., 1., 0., 0.], - [0., 0., 0., 1.], - [1., 0., 0., 0.]] + label = layers.data(name="label", shape=[1], dtype="float32") + one_hot_label = layers.one_hot(input=label, depth=10) """ helper = LayerHelper("one_hot", **locals()) one_hot_out = helper.create_tmp_variable(dtype='float32') @@ -3421,15 +4233,23 @@ def one_hot(input, depth): def autoincreased_step_counter(counter_name=None, begin=1, step=1): """ - NOTE: The counter will be automatically increased by 1 every mini-batch - Return the run counter of the main program, which is started with 1. + Create an auto-increase variable + which will be automatically increased by 1 every mini-batch + Return the run counter of the main program, default is started from 1. Args: counter_name(str): The counter name, default is '@STEP_COUNTER@'. begin(int): The first value of this counter. step(int): The increment step between each execution. - Returns(Variable): The global run counter. + Returns: + Variable: The global run counter. + + Examples: + .. code-block:: python + + global_step = fluid.layers.autoincreased_step_counter( + counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1) """ helper = LayerHelper('global_step_counter') if counter_name is None: @@ -3490,7 +4310,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): the corresponding dimension of x. Args: - input(variable): The input tensor. + x(variable): The input tensor. shape(list): The new shape. At most one dimension of the new shape can be -1. actual_shape(variable): An optional input. If provided, reshape @@ -3499,11 +4319,17 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): say :attr:`actual_shape` has a higher priority than :attr:`shape`. act (str): The non-linear activation to be applied to output variable. - inplace(bool): If this flag is set true, a new output tensor is created - whose data is copied from input x, otherwise the output - shares data with input without copying. + inplace(bool): If this flag is set true, the output + shares data with input without copying, otherwise + a new output tensor is created + whose data is copied from input x. + name (str): The name of this layer. It is optional. + + Returns: + Variable: The output tensor. - Returns(variable): The output tensor. + Raises: + TypeError: if actual_shape is neither Variable nor None. Examples: .. code-block:: python @@ -3516,6 +4342,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): if not (isinstance(shape, list) or isinstance(shape, tuple)): raise ValueError("Input shape must be a python lsit or tuple.") + inputs = {"X": x} + if isinstance(actual_shape, Variable): + inputs["Shape"] = actual_shape + elif actual_shape is not None: + raise TypeError("actual_shape should either be Variable or None") # Validate the shape unk_dim_idx = -1 @@ -3536,9 +4367,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): reshaped = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", - inputs={"X": x, - "Shape": actual_shape} - if isinstance(actual_shape, Variable) else {"X": x}, + inputs=inputs, attrs={"shape": shape, "inplace": inplace}, outputs={"Out": reshaped}) @@ -3548,73 +4377,74 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): def lod_reset(x, y=None, target_lod=None): """ - LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or - **target_lod**. When **y** provided, **y.lod** would be considered as target - LoD first, otherwise **y.data** would be considered as target LoD. If **y** - is not provided, target LoD should be specified by **target_lod**. - If target LoD is specified by **Y.data** or **target_lod**, only one level - LoD is supported. + Set LoD of :attr:`x` to a new one specified by :attr:`y` or + :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be + considered as target LoD first, otherwise :attr:`y.data` would be + considered as target LoD. If :attr:`y` is not provided, target LoD should + be specified by :attr:`target_lod`. If target LoD is specified by + :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported. .. code-block:: text * Example 1: Given a 1-level LoDTensor x: - x.lod = [[ 0, 2, 5 6 ]] + x.lod = [[ 2, 3, 1 ]] x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] x.dims = [6, 1] - target_lod: [0, 4, 6] + target_lod: [4, 2] then we get a 1-level LoDTensor: - out.lod = [[ 0, 4, 6 ]] + out.lod = [[4, 2]] out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] out.dims = [6, 1] * Example 2: Given a 1-level LoDTensor x: - x.lod = [[ 0, 2, 5 6 ]] + x.lod = [[2, 3, 1]] x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] x.dims = [6, 1] y is a Tensor: - y.data = [[0, 2, 6]] + y.data = [[2, 4]] y.dims = [1, 3] then we get a 1-level LoDTensor: - out.lod = [[ 0, 2, 6 ]] + out.lod = [[2, 4]] out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] out.dims = [6, 1] * Example 3: Given a 1-level LoDTensor x: - x.lod = [[ 0, 2, 5 6 ]] + x.lod = [[2, 3, 1]] x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] x.dims = [6, 1] y is a 2-level LoDTensor: - y.lod = [[0, 2, 4], [0, 2, 5, 6]] + y.lod = [[2, 2], [2, 2, 1, 1]] y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]] y.dims = [6, 1] then we get a 2-level LoDTensor: - out.lod = [[0, 2, 4], [0, 2, 5, 6]] + out.lod = [[2, 2], [2, 2, 1, 1]] out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] out.dims = [6, 1] Args: x (Variable): Input variable which could be a Tensor or LodTensor. - y (Variable|None): If provided, output's LoD would be derived from y. + y (Variable|None): If provided, output's LoD would be derived + from :attr:`y`. target_lod (list|tuple|None): One level LoD which should be considered - as target LoD when y not provided. + as target LoD when :attr:`y` not provided. Returns: - Variable: Output variable with LoD specified by this operator. + Variable: Output variable with LoD specified by this layer. Raises: - ValueError: If y and target_lod are both None. + ValueError: If :attr:`y` and :attr:`target_lod` are both None. Examples: .. code-block:: python @@ -3650,9 +4480,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): .. math:: - Output(i, x, y) = Input(i, x, y) / \left( - k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} - (Input(j, x, y))^2 \right)^{\beta} + Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C, c + n/2)}_{j = \\max(0, c - n/2)}(Input(j, x, y))^2\\right)^{\\beta} In the above equation: @@ -3836,34 +4664,20 @@ def label_smooth(label, return smooth_label +@templatedoc() def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): """ - Region of interest pooling (also known as RoI pooling) is to perform - is to perform max pooling on inputs of nonuniform sizes to obtain - fixed-size feature maps (e.g. 7*7). - The operator has three steps: - 1. Dividing each region proposal into equal-sized sections with - the pooled_width and pooled_height - 2. Finding the largest value in each section - 3. Copying these max values to the output buffer - - Args: - input (Variable): The input for ROI pooling. - rois (Variable): ROIs (Regions of Interest) to pool over. It should - be a 2-D one level LoTensor of shape [num_rois, 4]. - The layout is [x1, y1, x2, y2], where (x1, y1) - is the top left coordinates, and (x2, y2) is the - bottom right coordinates. The num_rois is the - total number of ROIs in this batch data. - pooled_height (integer): The pooled output height. Default: 1 - pooled_width (integer): The pooled output width. Default: 1 - spatial_scale (float): Multiplicative spatial scale factor. To - translate ROI coords from their input scale - to the scale used when pooling. Default: 1.0 - - Returns: - pool_out (Variable): The output is a 4-D tensor of the shape - (num_rois, channels, pooled_h, pooled_w). + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + + Returns: + Variable: ${out_comment}. Examples: .. code-block:: python @@ -3929,22 +4743,26 @@ def dice_loss(input, label, epsilon=0.00001): return reduce_mean(dice_score) -def resize_bilinear(input, out_shape=None, scale=None, name=None): +def image_resize(input, + out_shape=None, + scale=None, + name=None, + resample='BILINEAR'): """ - The mathematical meaning of resize bilinear layer is - Bilinear interpolation. - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this layer) on a rectilinear 2D grid. + **Resize a Batch of Images** - For details, please refer to Wikipedia: - https://en.wikipedia.org/wiki/Bilinear_interpolation + The input must be a tensor of the shape (num_batches, channels, in_h, in_w), + and the resizing only applies on the last two dimensions(hight and width). + + Supporting resample methods: + + 'BILINEAR' : Bilinear interpolation Args: - input (Variable): The input tensor of resize bilinear layer, + input (Variable): The input tensor of image resize layer, This is a 4-D tensor of the shape (num_batches, channels, in_h, in_w). - out_shape(list|tuple|Variable|None): Output shape of resize bilinear + out_shape(list|tuple|Variable|None): Output shape of image resize layer, the shape is (out_h, out_w). Default: None scale(float|None): The multiplier for the input height or width. @@ -3953,16 +4771,22 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None): Default: None name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + resample(str): The resample method. It can only be 'BILINEAR' currently. + Default: 'BILINEAR' Returns: - out (Variable): The output is a 4-D tensor of the shape - (num_batches, channls, out_h, out_w). + Variable: The output is a 4-D tensor of the shape + (num_batches, channls, out_h, out_w). Examples: .. code-block:: python - out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) + out = fluid.layers.image_resize(input, out_shape=[12, 12]) """ + resample_methods = {'BILINEAR': 'bilinear_interp'} + if resample not in resample_methods: + raise ValueError( + "The 'resample' of image_resize can only be 'BILINEAR' currently.") if out_shape is None and scale is None: raise ValueError("One of out_shape and scale must not be None") helper = LayerHelper('bilinear_interp', **locals()) @@ -3990,7 +4814,7 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None): out = helper.create_tmp_variable(dtype) helper.append_op( - type="bilinear_interp", + type=resample_methods[resample], inputs=inputs, outputs={"Out": out}, attrs={"out_h": out_h, @@ -3998,14 +4822,72 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None): return out +@templatedoc(op_type="bilinear_interp") +def resize_bilinear(input, out_shape=None, scale=None, name=None): + """ + ${comment} + + Args: + input(${x_type}): ${x_comment}. + + out_shape(${out_size_type}): ${out_size_comment}. + + scale(float|None): The multiplier for the input height or width. At + least one of out_shape or scale must be set. And out_shape has + a higher priority than scale. Default: None. + + name(str|None): The output variable name. + + Returns: + ${out_comment}. + """ + + return image_resize(input, out_shape, scale, name, 'BILINEAR') + + +def image_resize_short(input, out_short_len, resample='BILINEAR'): + """ + Resize a batch of images. The short edge of input images will be + resized to the given 'out_short_len'. The long edge of input images + will be resized proportionately to make images' length-width ratio + constant. + + Args: + input (Variable): The input tensor of image resize layer, + This is a 4-D tensor of the shape + (num_batches, channels, in_h, in_w). + out_short_len(int): The length of output images' short edge. + resample (str): resample method, default: BILINEAR. + + Returns: + Variable: The output is a 4-D tensor of the shape + (num_batches, channls, out_h, out_w). + """ + in_shape = input.shape + if len(in_shape) != 4: + raise ValueError( + "The rank of input must be 4 (num_batches, channels, in_h, in_w).") + hw = in_shape[2:4] + short_idx = hw.index(min(hw)) + long_idx = 1 - short_idx + out_shape = list(hw) + out_shape[short_idx] = out_short_len + out_shape[long_idx] = int( + float(out_shape[long_idx]) * (float(out_short_len) / float(hw[ + short_idx])) + 0.5) + return image_resize(input=input, out_shape=out_shape, resample=resample) + + def gather(input, index): """ - Output is obtained by gathering entries of the outer-most dimension + **Gather Layer** + + Output is obtained by gathering entries of the outer-most dimension of X indexed by `index` and concatenate them together. .. math:: - Out = X[Index] + Out = X[Index] .. code-block:: text @@ -4013,8 +4895,8 @@ def gather(input, index): Given: - X = [[1, 2], - [3, 4], + X = [[1, 2], + [3, 4], [5, 6]] Index = [1, 2] @@ -4025,13 +4907,14 @@ def gather(input, index): [5, 6]] Args: - input (Variable): The source input with rank>=1. + input (Variable): The source input with rank>=1. index (Variable): The index input with rank=1. Returns: output (Variable): The output is a tensor with the same rank as input. Examples: + .. code-block:: python output = fluid.layers.gather(x, index) @@ -4047,31 +4930,247 @@ def gather(input, index): return out -def random_crop(input, shape, seed=1): +@templatedoc() +def random_crop(x, shape, seed=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + shape(${shape_type}): ${shape_comment} + seed(int|${seed_type}|None): ${seed_comment} By default, the seed will + get from `random.randint(-65536, 65535)`. + + Returns: + ${out_comment} + + Examples: + >>> img = fluid.layers.data("img", [3, 256, 256]) + >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) + """ helper = LayerHelper("random_crop", **locals()) - dtype = helper.input_dtype() + dtype = x.dtype out = helper.create_tmp_variable(dtype) + if seed is None: + seed = random.randint(-65536, 65535) + op_attrs = {"shape": shape} if isinstance(seed, int): - seed_value = seed - seed = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="fill_constant", - inputs={}, - outputs={"Out": seed}, - attrs={ - "dtype": seed.dtype, - "shape": [1], - "value": float(seed_value), - "force_cpu": True - }) + op_attrs["startup_seed"] = seed + seed = helper.create_variable( + name=unique_name.generate("random_crop_seed"), + dtype="int64", + persistable=True) elif not isinstance(seed, Variable): raise ValueError("'seed' must be a Variable or an int.") - seed_out = helper.create_tmp_variable(dtype="int64") helper.append_op( type="random_crop", - inputs={"X": input, + inputs={"X": x, "Seed": seed}, outputs={"Out": out, - "SeedOut": seed_out}, - attrs={"shape": shape}) + "SeedOut": seed}, + attrs=op_attrs) + return out + + +def log(x): + """ + Calculates the natural log of the given input tensor, element-wise. + + .. math:: + + Out = \\ln(x) + + Args: + x (Variable): Input tensor. + + Returns: + Variable: The natural log of the input tensor computed element-wise. + + Examples: + + .. code-block:: python + + output = fluid.layers.log(x) + """ + helper = LayerHelper('log', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_tmp_variable(dtype) + helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) + return out + + +def relu(x): + """ + Relu takes one input data (Tensor) and produces one output data (Tensor) + where the rectified linear function, y = max(0, x), is applied to + the tensor elementwise. + + .. math:: + + Out = \\max(0, x) + + Args: + x (Variable): The input tensor. + + Returns: + Variable: The output tensor with the same shape as input. + + Examples: + + .. code-block:: python + + output = fluid.layers.relu(x) + """ + helper = LayerHelper('relu', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_tmp_variable(dtype) + helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) + return out + + +def mean_iou(input, label, num_classes): + """ + Mean Intersection-Over-Union is a common evaluation metric for + semantic image segmentation, which first computes the IOU for each + semantic class and then computes the average over classes. + IOU is defined as follows: + + .. math:: + + IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. + + The predictions are accumulated in a confusion matrix and mean-IOU + is then calculated from it. + + + Args: + input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64. + label (Variable): A Tensor of ground truth labels with type int32 or int64. + Its shape should be the same as input. + num_classes (int): The possible number of labels. + + Returns: + mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. + out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class. + out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. + + Examples: + + .. code-block:: python + + iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes) + """ + helper = LayerHelper('mean_iou', **locals()) + dtype = helper.input_dtype() + out_mean_iou = helper.create_tmp_variable(dtype='float32') + out_wrong = helper.create_tmp_variable(dtype='int32') + out_correct = helper.create_tmp_variable(dtype='int32') + helper.append_op( + type="mean_iou", + inputs={"predictions": input, + "labels": label}, + outputs={ + "out_mean_iou": out_mean_iou, + "out_wrong": out_wrong, + "out_correct": out_correct + }, + attrs={"num_classes": num_classes}) + return out_mean_iou, out_wrong, out_correct + + +def crop(x, shape=None, offsets=None, name=None): + """ + Crop input into output, as specified by offsets and shape. + + .. code-block:: text + + * Case 1: + Given + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]], + and + shape = [2, 2], + offsets = [0, 1], + output is: + Out = [[1, 2], + [3, 4]]. + * Case 2: + Given + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + and shape is tensor + shape = [[0, 0, 0] + [0, 0, 0]] + and + offsets = [0, 1], + + output is: + Out = [[1, 2, 5], + [3, 4, 6]]. + + Args: + x (Variable): The input tensor variable. + shape (Variable|list/tuple of integer): The output shape is specified + by `shape`, which can a Variable or a list/tupe of integer. + If a tensor Variable, it's rank must be the same as `x`. This way + is suitable for the case that the output shape may be changed each + iteration. If a list/tupe of integer, it's length must be the same + as the rank of `x` + offsets (Variable|list/tuple of integer|None): Specifies the copping + offsets at each dimension. It can be a Variable or or a list/tupe + of integer. If a tensor Variable, it's rank must be the same as `x`. + This way is suitable for the case that the offsets may be changed + each iteration. If a list/tupe of integer, it's length must be the + same as the rank of `x`. If None, the offsets are 0 at each + dimension. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The cropped tensor variable. + + Raises: + ValueError: If shape is not a list, tuple or Variable. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32") + y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32") + crop = fluid.layers.crop(x, shape=y) + + # or + z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32") + crop = fluid.layers.crop(z, shape=[2, 3]) + + """ + helper = LayerHelper('crop', **locals()) + + if not (isinstance(shape, list) or isinstance(shape, tuple) or \ + isinstance(shape, Variable)): + raise ValueError("The shape should be a list, tuple or Variable.") + + if offsets is None: + offsets = [0] * len(x.shape) + + out = helper.create_tmp_variable(x.dtype) + ipts = {'X': x} + attrs = {} + if isinstance(shape, Variable): + ipts['Y'] = shape + else: + attrs['shape'] = shape + if isinstance(offsets, Variable): + ipts['Offsets'] = offsets + else: + attrs['offsets'] = offsets + + helper.append_op( + type='crop', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs) return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 69cfde852dd087bb9192da1f7582f925582dbce4..9e97ec9a6f55680a2eb44ad712ac002df4fecda5 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -17,7 +17,6 @@ __activations__ = [ 'sigmoid', 'logsigmoid', 'exp', - 'relu', 'tanh', 'tanh_shrink', 'softshrink', @@ -29,7 +28,6 @@ __activations__ = [ 'sin', 'round', 'reciprocal', - 'log', 'square', 'softplus', 'softsign', @@ -40,8 +38,6 @@ __activations__ = [ 'relu6', 'pow', 'stanh', - 'hard_shrink', - 'thresholded_relu', 'hard_sigmoid', 'swish', ] @@ -64,16 +60,102 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', - 'uniform_random', 'uniform_random_batch_size_like', 'gaussian_random', 'gaussian_random_batch_size_like', - 'cumsum', 'scatter', 'sum', + 'slice', 'polygon_box_transform', 'shape', + 'iou_similarity', + 'maxout', ] + __activations__ for _OP in set(__all__): globals()[_OP] = generate_layer_fn(_OP) + +__all__ += ["uniform_random"] + +_uniform_random_ = generate_layer_fn('uniform_random') + + +def uniform_random(shape, dtype=None, min=None, max=None, seed=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + return _uniform_random_(**kwargs) + + +uniform_random.__doc__ = _uniform_random_.__doc__ + """ +Examples: + + >>> result = fluid.layers.uniform_random(shape=[32, 784]) +""" + +__all__ += ['hard_shrink'] + +_hard_shrink_ = generate_layer_fn('hard_shrink') + + +def hard_shrink(x, threshold=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + return _hard_shrink_(**kwargs) + + +hard_shrink.__doc__ = _hard_shrink_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[784]) + >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) +""" + +__all__ += ['cumsum'] + +_cum_sum_ = generate_layer_fn('cumsum') + + +def cumsum(x, axis=None, exclusive=None, reverse=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + + return _cum_sum_(**kwargs) + + +cumsum.__doc__ = _cum_sum_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) +""" + +__all__ += ['thresholded_relu'] + +_thresholded_relu_ = generate_layer_fn('thresholded_relu') + + +def thresholded_relu(x, threshold=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + + _thresholded_relu_(**kwargs) + + +thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[1]) + >>> result = fluid.layers.thresholded_relu(data, threshold=0.4) +""" diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index be34cc81a5d5ca0e781e5984b6c3eeaa4e25eb90..ce5f08de623c8e4572599f8088ecae2e4821cce0 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -6,7 +6,7 @@ # # http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software +# Unlessf required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and @@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from layer_function_generator import templatedoc import numpy __all__ = [ @@ -30,12 +31,33 @@ __all__ = [ 'assign', 'fill_constant_batch_size_like', 'fill_constant', + 'argmin', + 'argmax', 'ones', 'zeros', + 'reverse', ] def create_tensor(dtype, name=None, persistable=False): + """ + Create an variable, which will hold a LoDTensor with data type dtype. + + Args: + dtype(string): 'float32'|'int32'|..., the data type of the + created tensor. + name(string): The name of the created tensor, if not set, + the name will be a random unique one. + persistable(bool): Set the persistable flag of the create tensor. + + Returns: + Variable: The tensor variable storing the created tensor. + + Examples: + .. code-block:: python + + tensor = fluid.layers.create_tensor(dtype='float32') + """ helper = LayerHelper("create_tensor", **locals()) return helper.create_variable( name=helper.name, dtype=dtype, persistable=persistable) @@ -48,7 +70,12 @@ def create_parameter(shape, is_bias=False, default_initializer=None): """ - Create a parameter + Create a parameter. The parameter is a learnable variable, which can have + gradient, and can be optimized. + + NOTE: this is a very low-level API. This API is useful when you create + operator by your self. instead of using layers. + Args: shape(list[int]): shape of the parameter dtype(string): element type of the parameter @@ -60,7 +87,12 @@ def create_parameter(shape, default_initializer(Initializer): initializer for the parameter Returns: - Parameter: the created parameter + the created parameter. + + Examples: + >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32') + >>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) + >>> hidden = fluid.layers.matmul(x=data, y=W) """ helper = LayerHelper("create_parameter", **locals()) if attr is None: @@ -76,16 +108,29 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a global variable. such as global_step + Create a new variable in the global block(block 0). + Args: shape(list[int]): shape of the variable - value(float): the value of the variable - dtype(string): element type of the parameter - persistable(bool): if this variable is persistable - force_cpu(bool): force this variable to be on CPU + value(float): the value of the variable. The new created + variable will be filled with it. + dtype(string): data type of the variable + persistable(bool): if this variable is persistable. + Default: False + force_cpu(bool): force this variable to be on CPU. + Default: False + name(str|None): The name of the variable. If set to None the variable + name will be generated automatically. + Default: None Returns: Variable: the created Variable + + Examples: + .. code-block:: python + + var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', + persistable=True, force_cpu=True, name='new_var') """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( @@ -98,8 +143,21 @@ def create_global_var(shape, def cast(x, dtype): """ - This function takes in the input with input_dtype - and casts it to the output_dtype as the output. + This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts + it to the output with :attr:`dtype`. + + Args: + x (Variable): The input Variable for casting. + dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Variable. + + Returns: + Variable: The output Variable after casting. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='x', shape=[13], dtype='float32') + result = fluid.layers.cast(x=data, dtype='float64') """ helper = LayerHelper('cast', **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -130,7 +188,8 @@ def concat(input, axis=0, name=None): Examples: .. code-block:: python - out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) + + out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) """ helper = LayerHelper('concat', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -143,19 +202,21 @@ def concat(input, axis=0, name=None): def sums(input, out=None): - """This function performs the sum operation on the input and returns the + """ + This function performs the sum operation on the input and returns the result as the output. Args: input (Variable|list): The input tensor that has the elements that need to be summed up. + out (Variable|None): Output parameter. The sum result. + Default: None Returns: - Variable: The tensor type variable that has the sum of input - written to it. + Variable: the sum of input. The same as the argument 'out' Examples: - .. code-block::python + .. code-block:: python tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -169,11 +230,15 @@ def sums(input, out=None): helper = LayerHelper('sum', **locals()) if out is None: out = helper.create_tmp_variable(dtype=helper.input_dtype()) - helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out}) + helper.append_op( + type='sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'use_mkldnn': False}) return out -def assign(input, output): +def assign(input, output=None): """ **Assign** @@ -181,18 +246,21 @@ def assign(input, output): Args: input(Variable|numpy.ndarray): The source variable - output(Variable): The destination variable + output(Variable|None): The destination variable Returns: Variable: The destination variable that was supplied as the *output*. Examples: .. code-block:: python + out = fluid.layers.create_tensor(dtype='float32') hidden = fluid.layers.fc(input=data, size=10) fluid.layers.assign(hidden, out) """ helper = LayerHelper('assign', **locals()) + if output is None: + output = helper.create_tmp_variable(dtype=input.dtype) if isinstance(input, Variable): helper.append_op( type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) @@ -266,6 +334,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): return out +@templatedoc() def fill_constant_batch_size_like(input, shape, dtype, @@ -273,30 +342,28 @@ def fill_constant_batch_size_like(input, input_dim_idx=0, output_dim_idx=0): """ - **fill_constant_batch_size_like** - - This function creates a tensor of specified *shape*, *dtype* and batch size, - and initializes this with a constant supplied in *value*. The batch size is - obtained from the `input` tensor. + ${comment} It also sets *stop_gradient* to True. + >>> data = fluid.layers.fill_constant_batch_size_like( + >>> input=like, shape=[1], value=0, dtype='int64') + Args: - input(Variable): Tensor whose dimensions will be used to get batch size - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor - value(float): Constant value to initialize the output tensor - input_dim_idx(int): Index of input's batch size dimension - output_dim_idx(int): Index of output's batch size dimension + input(${input_type}): ${input_comment}. - Returns: - Variable: The tensor variable storing the output + shape(${shape_type}): ${shape_comment}. - Examples: - .. code-block:: python + dtype(${dtype_type}): ${dtype_comment}. + + value(${value_type}): ${value_comment}. - data = fluid.layers.fill_constant_batch_size_like( - input=like, shape=[1], value=0, dtype='int64') + input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}. + + output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. + + Returns: + ${out_comment}. """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -315,6 +382,68 @@ def fill_constant_batch_size_like(input, return out +def argmin(x, axis=0): + """ + **argmin** + + This function computes the indices of the min elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the min elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmin(x=in, axis=0) + out = fluid.layers.argmin(x=in, axis=-1) + """ + helper = LayerHelper("arg_min", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_min', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + +def argmax(x, axis=0): + """ + **argmax** + + This function computes the indices of the max elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the max elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmax(x=in, axis=0) + out = fluid.layers.argmax(x=in, axis=-1) + """ + helper = LayerHelper("arg_max", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_max', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def ones(shape, dtype, force_cpu=False): """ **ones** @@ -349,11 +478,12 @@ def zeros(shape, dtype, force_cpu=False): It also sets *stop_gradient* to True. Args: - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor + shape(tuple|list|None): Shape of output tensor. + dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor. + force_cpu(bool, default False): Whether to make output stay on CPU. Returns: - Variable: The tensor variable storing the output + Variable: The tensor variable storing the output. Examples: .. code-block:: python @@ -363,6 +493,40 @@ def zeros(shape, dtype, force_cpu=False): return fill_constant(value=0.0, **locals()) +def reverse(x, axis): + """ + **reverse** + + This function reverse the input 'x' along given axises. + + Args: + x(Vairbale): the input to be reversed. + axis(int|tuple|list): Axis that along which order of elements + is reversed. If it is a tuple or a list, reversing + will be apply on each axis in the tuple or list. + + Returns: + Variable: The reversed tensor. + + Examples: + .. code-block:: python + + out = fluid.layers.reverse(x=in, axis=0) + # or: + out = fluid.layers.reverse(x=in, axis=[0,1]) + """ + if isinstance(axis, int): + axis = [axis] + helper = LayerHelper("reverse", **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='reverse', + inputs={'Input': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def save(x, file_path, overwrite=True): """ Saves a variable as a file. @@ -370,9 +534,9 @@ def save(x, file_path, overwrite=True): Args: x(variable): The Tensor/LoDTensor to be saved. file_path(str): The file path where the variable will be saved. - overwrite(bool): Whether or not cover the given file when it has already - existed. If it's set 'False' and the file is existed, a runtime - error will be thrown. + overwrite(bool): Whether or not cover the given file when it has already + existed. If it's set 'False' and the file is existed, a runtime + error will be thrown. """ helper = LayerHelper("save", **locals()) helper.append_op( @@ -388,11 +552,27 @@ def save_combine(x, file_path, overwrite=True): Saves a list of variables into a single file. Args: - x(list): A list of Tensor/LoDTensor to be saved together in a single file. + x(list): A list of Tensor/LoDTensor variables to be saved together in + a single file. file_path(str): The file path where variables will be saved. - overwrite(bool): Whether or not cover the given file when it has already - existed. If it's set 'False' and the file is existed, a runtime - error will be thrown. + overwrite(bool): Whether or not cover the given file when it has already + existed. If it's set 'False' and the file is existed, a runtime + error will be thrown. + + Returns: + There is no return value. + + Examples: + + .. code-block:: python + + v1 = fluid.layers.data(name="data", + shape=(4, 6), + dtype="float32") + v2 = fluid.layers.data(name="data", + shape=(6, 8, 4), + dtype="float32") + normed = fluid.layers.save_combine([v1, v2], file_path="output") """ helper = LayerHelper("save_combine", **locals()) helper.append_op( @@ -403,22 +583,6 @@ def save_combine(x, file_path, overwrite=True): "overwrite": overwrite}) -def load(out, file_path): - """ - Loads a variable from a given file. - - Args: - out(variable): The variable to be read from the disk file. - file_path(str): The path of the disk file. - """ - helper = LayerHelper("load", **locals()) - helper.append_op( - type="load", - inputs={}, - output={"Out": out}, - args={"file_path": file_path}) - - def load_combine(out, file_path): """ Loads a list of vairables from a single file. diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index 9946d0a4ff33b2f5040f6d2e31aa20fcf9c609a7..c417ab393fca88d476d2f1fe83d12f99271d6883 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -18,108 +18,42 @@ import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] -def _validate_lod(lod, tensor_height=-1): - """Check whether the input length-based lod info is valid. - - There are several things to check: - 1. lod should be a list of lists. Empty list is fine. - 2. The length of each sublist (a lod level) should be at least one. - 3. Each element in each lod level should be an integer greater than 0. - 4. The sum of one lod level should be equal to the length of the next lod level. - 5. The sum of the last lod level should be equal to the tensor height. - Bypass this check if user does not provide tensor_height as input. - - Args: - lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]]. - tensor_height: the outermost dimension of the tensor with which the input - lod is associated with. - - Returns: - A boolean indicating whether the input lod is valid or not. - """ - assert isinstance(lod, list), "lod should be a list" - # Empty lod is fine - if len(lod) == 0: - return True - - lod_sum = [] - for level in lod: - assert isinstance(level, list), "each item in lod should be a list" - # Each level of lod should have at least one length info - if len(level) < 1: - return False - level_sum = 0 - for lod_len in level: - # Each length in a level should be > 0 - if lod_len <= 0: - return False - level_sum += lod_len - lod_sum.append(level_sum) - - for idx, val in enumerate(lod_sum[:-1]): - # Each level's sum should be equal to - # the number of items in the next level - if val != len(lod[idx + 1]): - return False - - if tensor_height == -1: - return True - else: - # Last level's sum should be equal to the tensor height - return lod_sum[-1] == tensor_height - - -def _convert_lod(lod): - """Convert a length-based lod to a offset-based lod. - - If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]], - then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]]. - - Args: - lod: a length-based lod info. - - Returns: - A list of lists as the offset-based lod converted to from the input lod. - """ - new_lod = [] - for level in lod: - cur_len = 0 - new_level = [cur_len] - for lod_len in level: - cur_len += lod_len - new_level.append(cur_len) - new_lod.append(new_level) - return new_lod - - def create_lod_tensor(data, lod, place): - """Create a lod tensor from a numpy array, a list, or an existing lod tensor. + """ + Create a lod tensor from a numpy array, a list, or an existing lod tensor. Create a lod tensor by doing the following: + 1. Check that the length-based input lod is valid. + 2. Convert the length-based lod to a offset-based LoD. - 3. Copy the data from a numpy array, a list or a existing lod tensor to + + 3. Copy the data from a numpy array, a list or a existing lod tensor to CPU or GPU device (based on input place). + 4. Set the level of detail (LoD) using the offset-based LoD. - Use example: - Suppose we want LoDTensor to hold data for sequences of word, where each word is - represented by an integer. If we want to create a LoDTensor to represent two - sentences, one of 2 words, and one of 3 words. + Examples: + + Suppose we want LoDTensor to hold data for sequences of word, where each + word is represented by an integer. If we want to create a LoDTensor to + represent two sentences, one of 2 words, and one of 3 words. - Then 'data' can be a numpy array of integers with shape (5, 1). - 'lod' will be [[2, 3]], indicating the length(# of words) in each sentence. - This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]] - inside the function call. + Then :code:`data` can be a numpy array of integers with shape (5, 1). + :code:`lod` will be [[2, 3]], indicating the length(# of words) in each + sentence. This length-based input lod [[2, 3]] will be converted to + offset-based lod [[0, 2, 5]] inside the function call. - Please refer to - github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md - for more details regarding LoD. + Please reference :ref:`api_guide_low_level_lod_tensor` for more details + regarding LoD. Args: - data: a numpy array or a LoDTensor or a list holding the data to be copied. - lod: a list of lists indicating the length-based LoD info specified by the user. - place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. + data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a + list holding the data to be copied. + lod(list): a list of lists indicating the length-based LoD info + specified by the user. + place(Place): CPU or GPU place indicating where the data in the new + LoDTensor will be stored. Returns: A fluid LoDTensor object with tensor data and lod info. @@ -139,11 +73,11 @@ def create_lod_tensor(data, lod, place): flattened_data = flattened_data.reshape([len(flattened_data), 1]) return create_lod_tensor(flattened_data, lod, place) elif isinstance(data, np.ndarray): - assert _validate_lod(lod, - data.shape[0]), "the provided lod info is invalid" tensor = core.LoDTensor() tensor.set(data, place) - tensor.set_lod(_convert_lod(lod)) + tensor.set_recursive_sequence_lengths(lod) + assert tensor.has_valid_recursive_sequence_lengths( + ), "the provided lod info is invalid" return tensor else: raise TypeError( @@ -151,39 +85,45 @@ def create_lod_tensor(data, lod, place): def create_random_int_lodtensor(lod, base_shape, place, low, high): - """Create a LoDTensor containing random integers. + """ + Create a LoDTensor containing random integers. - This function is frequently used in the book examples. So we revised it based on - the new create_lod_tensor API and put it here in the lod_tensor module to simplify - the code. + This function is frequently used in the book examples. So we revised it + based on the new create_lod_tensor API and put it here in the lod_tensor + module to simplify the code. The function does the following: - 1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input - and the shape of the basic element in 'base_shape'. + + 1. Calculate the overall shape of the LoDTensor based on the length-based + :code:`lod` input and the shape of the basic element in + :code:`base_shape`. + 2. Create a numpy array of this shape. + 3. Create the LoDTensor using create_lod_tensor API. - Suppose we want LoDTensor to hold data for sequences of word, where each word is - represented by an integer. If we want to create a LoDTensor to represent two - sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input - length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be - [5, 1], holding 5 words for two sentences. + Suppose we want LoDTensor to hold data for sequences of word, where each + word is represented by an integer. If we want to create a LoDTensor to + represent two sentences, one of 2 words, and one of 3 words. Then + 'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall + shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. Args: - data: a numpy array or a LoDTensor holding the data to be copied. - lod: a list of lists indicating the length-based LoD info specified by the user. - base_shape: the shape of the basic element to be held by the LoDTensor. - place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. - low: the lower bound of the random integers. - high: the upper bound of the random integers. + lod(list): a list of lists indicating the length-based LoD info + specified by the user. + base_shape(list): the shape of the basic element to be held by the + LoDTensor. + place(Place): CPU or GPU place indicating where the data in the new + LoDTensor will be stored. + low(int): the lower bound of the random integers. + high(int): the upper bound of the random integers. Returns: A fluid LoDTensor object with tensor data and lod info. """ assert isinstance(base_shape, list), "base_shape should be a list" - converted_lod = _convert_lod(lod) # append the total number of basic elements to the front of its shape - overall_shape = [converted_lod[-1][-1]] + base_shape + overall_shape = [sum(lod[-1])] + base_shape # the range of integer data elements is [low, high] data = np.random.random_integers(low, high, overall_shape).astype("int64") return create_lod_tensor(data, lod, place) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index bb9c6fdc60089fc2b43573a6421a6f9781d2d4a8..17bb0826a6ea86c98a069263dfab84b99e1177ad 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -23,6 +23,8 @@ import warnings __all__ = [ 'MetricBase', 'CompositeMetric', + 'Precision', + 'Recall', 'Accuracy', 'ChunkEvaluator', 'EditDistance', @@ -46,33 +48,34 @@ def _is_number_or_matrix_(var): class MetricBase(object): """ - Base Class for all evaluators + Base Class for all Metrics. + MetricBase define a group of interfaces for the + model evaluation methods. Metrics accumulate metric states between + consecutive minibatches, at every minibatch, use update + interface to add current minibatch value to global states. + Use eval to compute accumative metric value from last reset() + or from scratch on. + If you need to custom a new metric, please inherit from MetricBase and + custom implementation. Args: - name(str): The name of evaluator. such as, "accuracy". Used for generate - temporary variable name. - Interface: - Note(*) : the states is the attributes who not has _ prefix. - - get_config(): print current states and configuration - reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray), - Please override this method. - update(): update states at every minibatch - eval(): get metric evaluation in numpy type. + name(str): The name of metric instance. such as, "accuracy". + It needed if you want to distinct different metrics in a model. + """ - def __init__(self, name, **kwargs): + def __init__(self, name): self._name = str(name) if name != None else self.__class__.__name__ - self._kwargs = kwargs if kwargs != None else dict() - self.reset() def __str__(self): return self._name def reset(self): """ - states is the attributes who not has _ prefix. - reset the states of metrics. + reset clear the states of metrics. By default, the states + are the members who do not has _ prefix, reset set them to inital states. + If you violate the implicit name rule, please also custom the reset + interface. """ states = { attr: value @@ -90,61 +93,231 @@ class MetricBase(object): setattr(self, attr, None) def get_config(self): + """ + Get the metric and current states. + The states are the members who do not has "_" prefix. + + Args: + None + + Returns: + dict: a dict of metric and states + """ states = { attr: value for attr, value in self.__dict__.iteritems() if not attr.startswith("_") } - config = copy.deepcopy(self._kwargs) + config = {} config.update({"name": self._name, "states": copy.deepcopy(states)}) return config - def update(self): - raise NotImplementedError() + def update(self, preds, labels): + """ + Updates the metric states at every minibatch. + One user can compute the minibatch metric via pure Python, or + via a c++ operator. + + Args: + preds(numpy.array): the predictions of current minibatch + labels(numpy.array): the labels of current minibatch, if the label is one-hot + or soft-label, should custom the corresponding update rule. + """ + raise NotImplementedError( + "Should not use it directly, please extend it.") def eval(self): - raise NotImplementedError() + """ + Evalute the current metrics based the accumulated states. + + Returns: + float|list(float)|numpy.array: the metrics via Python. + """ + raise NotImplementedError( + "Should not use it directly, please extend it.") class CompositeMetric(MetricBase): """ - Compute multiple metrics in each minibatch. + Composite multiple metrics in one instance. for example, merge F1, accuracy, recall into one Metric. + + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + comp = fluid.metrics.CompositeMetric() + acc = fluid.metrics.Precision() + recall = fluid.metrics.Recall() + comp.add_metric(acc) + comp.add_metric(recall) + for pass in range(PASSES): + comp.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + comp.update(preds=preds, labels=labels) + numpy_acc, numpy_recall = comp.eval() """ - def __init__(self, name=None, **kwargs): - super(CompositeMetric, self).__init__(name, kwargs) + def __init__(self, name=None): + super(CompositeMetric, self).__init__(name) self._metrics = [] def add_metric(self, metric): + """ + add one metric instance to CompositeMetric. + + Args: + metric: a instance of MetricBase. + """ if not isinstance(metric, MetricBase): raise ValueError("SubMetric should be inherit from MetricBase.") self._metrics.append(metric) + def update(self, preds, labels): + """ + Update every metrics in sequence. + + Args: + preds(numpy.array): the predictions of current minibatch + labels(numpy.array): the labels of current minibatch, if the label is one-hot + or soft-label, should custom the corresponding update rule. + """ + for m in self._metrics: + ans.append(m.update(preds, labels)) + def eval(self): + """ + Evaluate every metrics in sequence. + + Returns: + list(float|numpy.array): a list of metrics value in Python. + """ ans = [] for m in self._metrics: ans.append(m.eval()) return ans +class Precision(MetricBase): + """ + Precision (also called positive predictive value) is the fraction of + relevant instances among the retrieved instances. + https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers + + Note Precision is different with Accuracy in binary classifiers. + accuracy = true positive / total instances + precision = true positive / all positive instance + + Examples: + .. code-block:: python + + metric = fluid.metrics.Precision() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_precision = metric.eval() + """ + + def __init__(self, name=None): + super(Precision, self).__init__(name) + self.tp = 0 # true positive + self.fp = 0 # false positive + + def update(self, preds, labels): + if not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray.") + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + sample_num = labels[0] + for i in range(sample_num): + pred = preds[i].astype("int32") + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + self.fp += 1 + + def eval(self): + ap = self.tp + self.fp + return float(self.tp) / ap if ap != 0 else .0 + + +class Recall(MetricBase): + """ + Recall (also known as sensitivity) is the fraction of + relevant instances that have been retrieved over the + total amount of relevant instances + + https://en.wikipedia.org/wiki/Precision_and_recall + + Examples: + .. code-block:: python + + metric = fluid.metrics.Recall() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_recall = metric.eval() + """ + + def __init__(self, name=None): + super(Recall, self).__init__(name) + self.tp = 0 # true positive + self.fn = 0 # false negtive + + def update(self, preds, labels): + if not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray.") + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + sample_num = labels[0] + for i in range(sample_num): + pred = preds[i].astype("int32") + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + if pred != label: + self.fn += 1 + + def eval(self): + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else .0 + + class Accuracy(MetricBase): """ Accumulate the accuracy from minibatches and compute the average accuracy for every pass. + https://en.wikipedia.org/wiki/Accuracy_and_precision Args: name: the metrics name - Example: - minibatch_accuracy = fluid.layers.accuracy(pred, label) - accuracy_evaluator = fluid.metrics.Accuracy() - for epoch in PASS_NUM: - accuracy_evaluator.reset() - for data in batches: - loss = exe.run(fetch_list=[cost, minibatch_accuracy]) - accuracy_evaluator.update(value=minibatch_accuracy, weight=batches) - accuracy = accuracy_evaluator.eval() + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + minibatch_accuracy = fluid.layers.accuracy(pred, label) + accuracy_evaluator = fluid.metrics.Accuracy() + for pass in range(PASSES): + accuracy_evaluator.reset() + for data in train_reader(): + batch_size = data[0] + loss = exe.run(fetch_list=[cost, minibatch_accuracy]) + accuracy_evaluator.update(value=minibatch_accuracy, weight=batch_size) + numpy_acc = accuracy_evaluator.eval() """ def __init__(self, name=None): @@ -153,6 +326,13 @@ class Accuracy(MetricBase): self.weight = .0 def update(self, value, weight): + """ + Update minibatch states. + + Args: + value(float|numpy.array): accuracy of one minibatch. + weight(int|float): batch size. + """ if not _is_number_or_matrix_(value): raise ValueError( "The 'value' must be a number(int, float) or a numpy ndarray.") @@ -163,9 +343,8 @@ class Accuracy(MetricBase): def eval(self): if self.weight == 0: - raise ValueError( - "There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy." - ) + raise ValueError("There is no data in Accuracy Metrics. \ + Please check layers.accuracy output has added to Accuracy.") return self.value / self.weight @@ -174,6 +353,25 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, + and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. + + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( + input=pred, + label=label) + metric = fluid.metrics.ChunkEvaluator() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks) + numpy_precision, numpy_recall, numpy_f1 = metric.eval() """ def __init__(self, name=None): @@ -183,9 +381,17 @@ class ChunkEvaluator(MetricBase): self.num_correct_chunks = 0 def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): + """ + Update the states based on the layers.chunk_eval() ouputs. + Args: + num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. + num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. + num_correct_chunks(int|float|numpy.array): The number of chunks both in Inference and Label on the + given mini-batch. + """ if not _is_number_or_matrix_(num_infer_chunks): raise ValueError( - "The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray." + "The 'num_infer_chunks' must be a number(int) or a numpy ndarray." ) if not _is_number_or_matrix_(num_label_chunks): raise ValueError( @@ -212,21 +418,28 @@ class ChunkEvaluator(MetricBase): class EditDistance(MetricBase): """ + Edit distance is a way of quantifying how dissimilar two strings + (e.g., words) are to one another by counting the minimum number + of operations required to transform one string into the other. + Refer to https://en.wikipedia.org/wiki/Edit_distance + Accumulate edit distance sum and sequence number from mini-batches and compute the average edit_distance and instance error of all batches. Args: name: the metrics name - Example: - edit_distance_metrics = fluid.layers.edit_distance(input, label) - distance_evaluator = fluid.metrics.EditDistance() - for epoch in PASS_NUM: - distance_evaluator.reset() - for data in batches: - loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) - distance_evaluator.update(*edit_distance_metrics) - distance, instance_error = distance_evaluator.eval() + Examples: + .. code-block:: python + + distances, seq_num = fluid.layers.edit_distance(input, label) + distance_evaluator = fluid.metrics.EditDistance() + for epoch in PASS_NUM: + distance_evaluator.reset() + for data in batches: + loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) + distance_evaluator.update(distances, seq_num) + distance, instance_error = distance_evaluator.eval() In the above example: 'distance' is the average of the edit distance in a pass. @@ -264,16 +477,38 @@ class EditDistance(MetricBase): class DetectionMAP(MetricBase): """ Calculate the detection mean average precision (mAP). - - TODO (Dang Qingqing): update the following doc. - The general steps are as follows: - 1. calculate the true positive and false positive according to the input - of detection and labels. - 2. calculate mAP value, support two versions: '11 point' and 'integral'. - + mAP is the metric to measure the accuracy of object detectors + like Faster R-CNN, SSD, etc. + It is the average of the maximum precisions at different recall values. Please get more information from the following articles: https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 + + The general steps are as follows: + + 1. calculate the true positive and false positive according to the input + of detection and labels. + 2. calculate mAP value, support two versions: '11 point' and 'integral'. + + Examples: + .. code-block:: python + + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + batch_map = layers.detection_map( + input, + label, + class_num, + background_label, + overlap_threshold=overlap_threshold, + evaluate_difficult=evaluate_difficult, + ap_version=ap_version) + metric = fluid.metrics.DetectionMAP() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, batch_map]) + batch_size = data[0] + metric.update(value=batch_map, weight=batch_size) + numpy_map = metric.eval() """ def __init__(self, name=None): @@ -302,17 +537,18 @@ class DetectionMAP(MetricBase): class Auc(MetricBase): """ - Auc Metrics which adapts to binary classification. - Need to note that auc metrics compute the value via Python natively. + Auc metric adapts to the binary classification. + Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + Need to note that auc metric compute the value via Python natively. If you concern the speed, please use the fluid.layers.auc instead. The `auc` function creates four local variables, `true_positives`, - `true_negatives`, `false_positives` and `false_negatives` that are used to - compute the AUC. To discretize the AUC curve, a linearly spaced set of - thresholds is used to compute pairs of recall and precision values. The area - under the ROC-curve is therefore computed using the height of the recall - values by the false positive rate, while the area under the PR-curve is the - computed using the height of the precision values by the recall. + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. Args: name: metric name @@ -322,22 +558,32 @@ class Auc(MetricBase): curve. "NOTE: only implement the ROC curve type via Python now." + + Examples: + .. code-block:: python + + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + metric = fluid.metrics.Auc() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds, labels) + numpy_auc = metric.eval() """ def __init__(self, name, curve='ROC', num_thresholds=200): - super(MetricBase, self).__init__(name, curve, num_thresholds) + super(Auc, self).__init__(name=name) self._curve = curve self._num_thresholds = num_thresholds self._epsilon = 1e-6 - self.tp_list = np.ndarray((num_thresholds, )) - self.fn_list = np.ndarray((num_thresholds, )) - self.tn_list = np.ndarray((num_thresholds, )) - self.fp_list = np.ndarray((num_thresholds, )) + self.tp_list = np.zeros((num_thresholds, )) + self.fn_list = np.zeros((num_thresholds, )) + self.tn_list = np.zeros((num_thresholds, )) + self.fp_list = np.zeros((num_thresholds, )) - def update(self, labels, predictions, axis=1): + def update(self, preds, labels): if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - if not _is_numpy_(predictions): + if not _is_numpy_(preds): raise ValueError("The 'predictions' must be a numpy ndarray.") kepsilon = 1e-7 # to account for floating point imprecisions @@ -350,12 +596,12 @@ class Auc(MetricBase): tp, fn, tn, fp = 0, 0, 0, 0 for i, lbl in enumerate(labels): if lbl: - if predictions[i, 0] >= thresh: + if preds[i, 1] >= thresh: tp += 1 else: fn += 1 else: - if predictions[i, 0] >= thresh: + if preds[i, 1] >= thresh: fp += 1 else: tn += 1 diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index bbedf6fde0872fd32d81c103bf5fe61449b7f57b..9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -26,16 +26,87 @@ def simple_img_conv_pool(input, filter_size, pool_size, pool_stride, - act, - param_attr=None, + pool_padding=0, pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + param_attr=None, + bias_attr=None, + act=None, use_cudnn=True, use_mkldnn=False): + """ + The simple_img_conv_pool is composed with one Convolution2d and one Pool2d. + + Args: + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + feature channel. + filter_size (int|list|tuple): The filter size. If filter_size is a list or + tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise, + the filter_size_H = filter_size_W = filter_size. + pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size + is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W). + Otherwise, the pool_size_H = pool_size_W = pool_size. + pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W). + Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. + pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or + tuple, it must contain two integers, (pool_padding_H, pool_padding_W). + Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0. + pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for + average-pooling. Default :math:`max`. + global_pooling (bool): Whether to use the global pooling. If global_pooling = true, + pool_size and pool_padding while be ignored. Default False + conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a + list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise, + the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1. + conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W). + Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0. + conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is + a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W). + Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1. + conv_groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None + act (str): Activation type for Conv2d. Default: None + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False + + Return: + Variable: The result of input after Convolution2d and Pool2d. + + Examples: + .. code-block:: python + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + conv_pool = fluid.nets.simple_img_conv_pool(input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + """ conv_out = layers.conv2d( input=input, num_filters=num_filters, filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, param_attr=param_attr, + bias_attr=bias_attr, act=act, use_cudnn=use_cudnn, use_mkldnn=use_mkldnn) @@ -45,6 +116,8 @@ def simple_img_conv_pool(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, use_cudnn=use_cudnn, use_mkldnn=use_mkldnn) return pool_out @@ -60,11 +133,65 @@ def img_conv_group(input, conv_with_batchnorm=False, conv_batchnorm_drop_rate=0.0, pool_stride=1, - pool_type=None, + pool_type="max", use_cudnn=True, use_mkldnn=False): """ - Image Convolution Group, Used for vgg net. + The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, + and Pool2d. According to the input arguments, img_conv_group will do serials of + computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last + result to Pool2d. + + Args: + input (Variable): The input image with [N, C, H, W] format. + conv_num_filter(list|tuple): Indicates the numbers of filter of this group. + pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size + is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W). + Otherwise, the pool_size_H = pool_size_W = pool_size. + conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + a list or tuple, its length must be equal to the length of conv_num_filter. + Otherwise the conv_padding of all Conv2d Layers are the same. Default 1. + conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or + tuple, its length must be equal to the length of conv_num_filter. + Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3. + conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm. + Default: None. + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer. + If conv_with_batchnorm is a list, its length must be equal to the length of + conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the + Conv2d Layer follows a BatchNorm. Default False. + conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer + after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be + equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout + Layers is conv_batchnorm_drop_rate. Default 0.0. + pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + is a list or tuple, it must contain two integers, (pooling_stride_H, + pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. + Default 1. + pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for + average-pooling. Default :math:`max`. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False + + Return: + Variable: The final result after serial computation using Convolution2d, + BatchNorm, DropOut, and Pool2d. + + Examples: + .. code-block:: python + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + conv_pool = fluid.nets.img_conv_group(input=img, + num_channels=3, + conv_padding=1, + conv_num_filter=[3, 3], + conv_filter_size=3, + conv_act="relu", + pool_size=2, + pool_stride=2) """ tmp = input assert isinstance(conv_num_filter, list) or \ @@ -74,6 +201,7 @@ def img_conv_group(input, if not hasattr(obj, '__len__'): return [obj] * len(conv_num_filter) else: + assert len(obj) == len(conv_num_filter) return obj conv_padding = __extend_list__(conv_padding) @@ -119,6 +247,39 @@ def sequence_conv_pool(input, param_attr=None, act="sigmoid", pool_type="max"): + """ + The sequence_conv_pool is composed with Sequence Convolution and Pooling. + + Args: + input (Variable): The input of sequence_conv, which supports variable-time + length input sequence. The underlying of input is a matrix with shape + (T, N), where T is the total time steps in this mini-batch and N is + the input_hidden_size + num_filters(int): The number of filter. + filter_size (int): The filter size. + param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None. + act (str): Activation type for Sequence_conv Layer. Default: "sigmoid". + pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for + average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. + Default :math:`max`. + + Return: + Variable: The final result after Sequence Convolution and Pooling. + + Examples: + .. code-block:: python + + input_dim = len(word_dict) + emb_dim = 128 + hid_dim = 512 + data = fluid.layers.data( ame="words", shape=[1], dtype="int64", lod_level=1) + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True) + seq_conv = fluid.nets.sequence_conv_pool(input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + """ conv_out = layers.sequence_conv( input=input, num_filters=num_filters, @@ -132,9 +293,9 @@ def sequence_conv_pool(input, def glu(input, dim=-1): """ - The gated linear unit composed by split, sigmoid activation and elementwise - multiplication. Specifically, Split the input into two equal sized parts - :math:`a` and :math:`b` along the given dimension and then compute as + The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise + multiplication. Specifically, Split the input into two equal sized parts, + :math:`a` and :math:`b`, along the given dimension and then compute as following: .. math:: @@ -147,16 +308,16 @@ def glu(input, dim=-1): Args: input (Variable): The input variable which is a Tensor or LoDTensor. dim (int): The dimension along which to split. If :math:`dim < 0`, the - dimension to split along is :math:`rank(input) + dim`. + dimension to split along is :math:`rank(input) + dim`. Default -1. Returns: - Variable: The Tensor variable with half the size of input. + Variable: Variable with half the size of input. Examples: .. code-block:: python - # x is a Tensor variable with shape [3, 6, 9] - fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9] + data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32") + output = fluid.nets.glu(input=data, dim=1) # shape of output: [3, 3, 9] """ a, b = layers.split(input, num_or_sections=2, dim=dim) @@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries, `_. Args: - queries (Variable): The input variable which should be a 3-D Tensor. keys (Variable): The input variable which should be a 3-D Tensor. values (Variable): The input variable which should be a 3-D Tensor. num_heads (int): Head number to compute the scaled dot product - attention. Default value is 1. + attention. Default: 1. dropout_rate (float): The dropout rate to drop the attention weight. - Default value is 0. + Default: 0.0. Returns: - - Variable: A 3-D Tensor computed by multi-head scaled dot product \ - attention. + Variable: A 3-D Tensor computed by multi-head scaled dot product\ + attention. Raises: - ValueError: If input queries, keys, values are not 3-D Tensors. - NOTE: + NOTES: 1. When num_heads > 1, three linear projections are learned respectively - to map input queries, keys and values into queries', keys' and values'. - queries', keys' and values' have the same shapes with queries, keys - and values. - - 1. When num_heads == 1, scaled_dot_product_attention has no learnable - parameters. + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. Examples: .. code-block:: python - # Suppose q, k, v are Tensors with the following shape: - # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] - - contexts = fluid.nets.scaled_dot_product_attention(q, k, v) + queries = fluid.layers.data(name="queries", + shape=[3, 5, 9], + dtype="float32", + append_batch_size=False) + queries.stop_gradient = False + keys = fluid.layers.data(name="keys", + shape=[3, 6, 9], + dtype="float32", + append_batch_size=False) + keys.stop_gradient = False + values = fluid.layers.data(name="values", + shape=[3, 6, 10], + dtype="float32", + append_batch_size=False) + values.stop_gradient = False + contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values) contexts.shape # [3, 5, 10] """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 115362c6bf33018342699a442c688e7356f3c206..607a68e2565a247612f0e7b307088f85be91825c 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -13,7 +13,7 @@ # limitations under the License. import re from collections import defaultdict -from paddle.fluid.framework import Program +from paddle.fluid.framework import Program, Variable import framework import layers from backward import append_backward @@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager __all__ = [ - 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', + 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', - 'Adadelta', 'ModelAverage', 'Optimizer' + 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer' ] @@ -41,7 +41,10 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, learning_rate, regularization=None): + def __init__(self, + learning_rate, + regularization=None, + LARS_weight_decay=0.0): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") @@ -61,6 +64,7 @@ class Optimizer(object): # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) self.helper = None + self._LARS_weight_decay = LARS_weight_decay def _create_global_learning_rate(self): lr = self.global_learning_rate() @@ -100,10 +104,15 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - if param_lr == 1.0: - return self.global_learning_rate() + if type(param_lr) == Variable: + # param learning rate has been updated (LARS) + print("returns updated param lr ", param_lr) + return param_lr else: - return self.global_learning_rate() * param_lr + if param_lr == 1.0: + return self.global_learning_rate() + else: + return self.global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -183,15 +192,15 @@ class Optimizer(object): """Add optimization operators to update gradients to variables. Args: - loss: the target that this optimization is for. - parameters_and_grads: a list of (variable, gradient) pair to update. + loss(Variable): the target that this optimization is for. + parameters_and_grads(list(tuple(Variable, Variable))): + a list of (variable, gradient) pair to update. Returns: return_op_list: a list of operators that will complete one step of optimization. This will include parameter update ops, global step update ops and any other custom ops required by subclasses to manage their internal state. - :param startup_program: """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that @@ -210,6 +219,10 @@ class Optimizer(object): self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) self._create_global_learning_rate() + if self._LARS_weight_decay > 0.0: + layers.append_LARS(parameters_and_grads, + self.global_learning_rate(), + self._LARS_weight_decay) optimize_ops = [] for param_and_grad in parameters_and_grads: @@ -255,7 +268,22 @@ class Optimizer(object): class SGDOptimizer(Optimizer): - """ Simple SGD optimizer without any state. + """ + Optimizer of the stochastic gradient descent algorithm. + + .. math:: + + param\_out = param - learning\_rate * grad + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + + Examples: + .. code-block:: python + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2) + sgd_optimizer.minimize(cost) """ def __init__(self, learning_rate, **kwargs): @@ -281,7 +309,37 @@ class SGDOptimizer(Optimizer): class MomentumOptimizer(Optimizer): - """Simple Momentum optimizer with velocity state + """ + + Simple Momentum optimizer with velocity state + + This optimizer has a flag for Nestrov Momentum. + + The update equations are as follows: + + .. math:: + + & velocity = mu * velocity + gradient + + & if (use\_nesterov): + + &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate + + & else: + + &\quad param = param - learning\_rate * velocity + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + momentum (float): momentum factor + use_nesterov (bool): enables Nesterov momentum + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(cost) """ _velocity_acc_str = "velocity" @@ -325,7 +383,32 @@ class MomentumOptimizer(Optimizer): class AdagradOptimizer(Optimizer): - """Simple Adagrad optimizer with moment state + """ + **Adaptive Gradient Algorithm (Adagrad)** + + The update is done as follows: + + .. math:: + + moment\_out &= moment + grad * grad + + param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + does not have the epsilon attribute. It is added here in our implementation + as also proposed here: http://cs231n.github.io/neural-networks-3/#ada + for numerical stability to avoid the division by zero error. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) """ _moment_acc_str = "moment" @@ -366,7 +449,40 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): - """Implements the Adam Optimizer + """ + This implements the Adam optimizer from Section 2 of the Adam + paper : https://arxiv.org/abs/1412.6980. + Adam is a first-order gradient-based optimization method based on + adaptive estimates of lower-order moments. + + Adam updates: + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + beta1 (float): The exponential decay rate for the 1st moment estimates. + beta2 (float): The exponential decay rate for the 2nd moment estimates. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adam(learning_rate=0.2) + optimizer.minimize(cost) + """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" @@ -471,7 +587,42 @@ class AdamOptimizer(Optimizer): class AdamaxOptimizer(Optimizer): - """Implements the Adamax Optimizer + """ + We implement the Adamax optimizer from Section 7 of the Adam + paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the + Adam algorithm based on the infinity norm. + + Adamax updates: + + .. math:: + + t & = t + 1 + + moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + + inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + + learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + + + The original paper does not have an epsilon attribute. + However, it is added here for numerical stability to prevent the + division by 0 error. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + beta1 (float): The exponential decay rate for the 1st moment estimates. + beta2 (float): The exponential decay rate for the 2nd moment estimates. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adamax(learning_rate=0.2) + optimizer.minimize(cost) """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" @@ -555,7 +706,34 @@ class AdamaxOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer): - """Simple Decayed Adagrad optimizer with moment state + """ + **Decayed Adagrad Optimizer** + + The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + + The update is done as follows: + + .. math:: + + moment\_out & = decay * moment + (1 - decay) * grad * grad + + param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + does not have an epsilon attribute. It is added here for numerical + stability to avoid the division by zero error. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + decay (float): decay rate. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) + optimizer.minimize(cost) """ _moment_acc_str = "moment" @@ -601,6 +779,7 @@ class DecayedAdagradOptimizer(Optimizer): class AdadeltaOptimizer(Optimizer): """ **Adadelta Optimizer** + Simple Adadelta optimizer with average squared grad state and average squared update state. The details of adadelta please refer to this @@ -615,7 +794,7 @@ class AdadeltaOptimizer(Optimizer): E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2 Args: - learning_rate(float): global leraning rate + learning_rate(float): global learning rate rho(float): rho in equation epsilon(float): epsilon in equation @@ -690,37 +869,37 @@ class RMSPropOptimizer(Optimizer): .. math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) The first equation calculates moving average of the squared gradient for - each weight. Then dividing the gradient by :math: `sqrt{v(w,t)}`. + each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. In some cases, adding a momentum term :math: `\\beta` is beneficial. In our implementation, Nesterov momentum is used: .. math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w) w & = w - v(w, t) - where, :math: `\\rho` is a hyperparameter and typical values are 0.9, 0.95 + where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a smoothing term to avoid division by zero, usually set somewhere in range from 1e-4 to 1e-8. Args: - learning_rate(float): global leraning rate. + learning_rate(float): global learning rate. rho(float): rho is :math: `\\rho` in equation, set 0.95 by default. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, set 1e-6 by default. - momentum(float): :math: `\\beta` in equation is the momentum term, + momentum(float): :math:`\\beta` in equation is the momentum term, set 0.0 by default. Raises: @@ -797,6 +976,113 @@ class RMSPropOptimizer(Optimizer): return rmsprop_op +class FtrlOptimizer(Optimizer): + """ + FTRL (Follow The Regularized Leader) Optimizer. + + The paper that proposed Follow The Regularized Leader (FTRL): + (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + + .. math:: + + &new\_accum = squared\_accum + grad^2 + + &if (lr\_power == -0.5): + + &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param} + + &else: + + &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param} + + + &x = l1 * sign(linear\_accum) - linear\_accum + + &if (lr\_power == -0.5): + + &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2) + + &\quad pre\_shrink = \\frac{x}{y} + + &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) + + &else: + + &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) + + &\quad pre\_shrink = \\frac{x}{y} + + &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) + + &squared\_accum += grad^2 + + Args: + learning_rate (float|Variable): global learning rate. + l1 (float): + l2 (float): + lr_power (float): + + Raises: + ValueError: If learning_rate, rho, epsilon, momentum are None. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Ftrl(0.0001) + _, params_grads = optimizer.minimize(cost) + """ + + _squared_acc_str = "squared" + _linear_acc_str = "linear" + + def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs): + super(FtrlOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) + if learning_rate is None: + raise ValueError("learning_rate is not set.") + + self.type = "ftrl" + self._l1 = l1 + self._l2 = l2 + self._lr_power = lr_power + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._squared_acc_str, p) + self._add_accumulator(self._linear_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + squared_acc = self._get_accumulator(self._squared_acc_str, + param_and_grad[0]) + linear_acc = self._get_accumulator(self._linear_acc_str, + param_and_grad[0]) + ftrl_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "SquaredAccumulator": squared_acc, + "LinearAccumulator": linear_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "SquaredAccumOut": squared_acc, + "LinearAccumOut": linear_acc + }, + attrs={"l1": self._l1, + "l2": self._l1, + "lr_power": self._lr_power}) + + return ftrl_op + + # We short the class name, since users will use the optimizer with the package # name. The sample code: # @@ -813,6 +1099,7 @@ Adamax = AdamaxOptimizer DecayedAdagrad = DecayedAdagradOptimizer Adadelta = AdadeltaOptimizer RMSProp = RMSPropOptimizer +Ftrl = FtrlOptimizer class ModelAverage(Optimizer): @@ -831,7 +1118,9 @@ class ModelAverage(Optimizer): max_average_window: The maximum size of average window. Examples: - ... + + .. code-block:: python + optimizer = fluid.optimizer.Momentum() _, params_grads = optimizer.minimize(cost) model_average = fluid.optimizer.ModelAverage(params_grads, 0.15, diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 3117dfe00c7a3df1035c439dc31b81e67781d0cc..6baf648198585022f992709c519038688af293e1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -18,6 +18,7 @@ import framework import executor import warnings import sys +import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] @@ -26,6 +27,40 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy class ParallelExecutor(object): + """ + ParallelExecutor can run program in parallel. + + Args: + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must set in training. Default None. + main_program (Program): The program that need to run, if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provied, it will share variables + from the specified ParallelExecutor. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple rank of nodes, each node should have same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int: Must use together with num_trainers. trainer_id is the + "rank" of current node starts from 0. Default 0. + + Returns: + ParallelExecutor: The initialized ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) + """ + def __init__(self, use_cuda, loss_name=None, @@ -36,42 +71,6 @@ class ParallelExecutor(object): num_trainers=1, trainer_id=0, **kwargs): - """ - ParallelExecutor can run program in parallel. - - Args: - use_cuda(bool): Whether to use CUDA or not. - loss_name(str, default None): The loss name must set in training. - main_program(Program, default None): The program that need to run, - if not provided, then default_main_program will be used. - share_vars_from(ParallelExecutor, default None): If provied, - it will share variables from the specified ParallelExecutor. - num_trainers(int, default 1): If greater than 1, NCCL will be - initialized with multpile rank of nodes, each node should have - same number of GPUs. Distributed training will be enabled then. - trainer_id(int, default 0): Must use together with num_trainers. - trainer_id is the "rank" of current node starts from 0. - - Returns: - A ParallelExecutor object. - - Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor - object. - - Examples: - .. code-block:: python - - train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor( - use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) - """ if len(kwargs) != 0: err_msg = "" for key in kwargs: @@ -101,7 +100,9 @@ class ParallelExecutor(object): p.set_place(self._act_places[-1]) self._places.append(p) else: - for i in xrange(multiprocessing.cpu_count()): + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in xrange(cpu_num): p = core.Place() self._act_places.append(core.CPUPlace()) p.set_place(self._act_places[-1]) @@ -110,19 +111,17 @@ class ParallelExecutor(object): if exec_strategy is None: exec_strategy = ExecutionStrategy() - if use_cuda: - exec_strategy.use_event = True - else: - exec_strategy.use_event = False + exec_strategy.use_cuda = use_cuda if exec_strategy.num_threads == 0: if use_cuda: # Experiments on se-resnext shows that too many threads hurt # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 2 + exec_strategy.num_threads = len(self._places) * 4 else: - exec_strategy.num_threads = min( - len(self._places) * 2, multiprocessing.cpu_count()) + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num if build_strategy is None: build_strategy = BuildStrategy() @@ -130,10 +129,16 @@ class ParallelExecutor(object): main = main_program main = main if main else framework.default_main_program() scope = executor.global_scope() + # FIXME(Yancey1989): it's a temporary approach to determinate the distribute + # train program, call self.bcast_param() at the end of each mini-batch. + self.is_dist = True if "recv" in [ + op.type for op in main.global_block().ops + ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): raise TypeError("share_vars_from must be ParallelExecutor.") + local_scopes = share_vars_from.executor.local_scopes( ) if share_vars_from else [] @@ -155,7 +160,7 @@ class ParallelExecutor(object): build_strategy, num_trainers, trainer_id) self.scope = scope - def run(self, fetch_list, feed=None, feed_dict=None): + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ Run a parallel executor with fetch_list. @@ -165,12 +170,14 @@ class ParallelExecutor(object): element in the list will be copied to each device directly. For example, if the feed is a dict: + >>> exe = ParallelExecutor() >>> # the image will be splitted into devices. If there is two devices >>> # each device will process an image with shape (24, 1, 28, 28) >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) For example, if the feed is a list: + >>> exe = ParallelExecutor() >>> # each device will process each element in the list. >>> # the 1st device will process an image with shape (48, 1, 28, 28) @@ -181,18 +188,42 @@ class ParallelExecutor(object): >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, >>> ]) - Args: fetch_list(list): The fetched variable names feed(list|dict|None): The feed variables. If the feed is a dict, tensors in that dict will be splitted into each devices. If the feed is a list, each element of the list will be copied - to each device. + to each device. Default None. feed_dict: Alias for feed parameter, for backward compatibility. - This parameter is deprecated. + This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. - Returns: fetched result list. + Returns: + List: The fetched result list. + + Raises: + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. + Examples: + .. code-block:: python + + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) """ if feed is None and feed_dict is not None: feed = feed_dict @@ -237,9 +268,20 @@ class ParallelExecutor(object): fetch_var_name = '@FETCHED_VAR_NAME@' self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if self.is_dist: + self.bcast_params() + + if return_numpy: + return executor.as_numpy(arr) + return [arr[i] for i in range(len(arr))] def bcast_params(self): + """ + Broadcast the parameters to other devices. It is used during + distributed training. + """ self.executor.bcast_params(set(self.persistable_vars)) @property diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 1c6970441bccdc1c1221503256c30c83502bd123..0a42b9fca8dba7a11b414990be6c04c93158864f 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -22,6 +22,35 @@ __all__ = [ class ParamAttr(object): + """ + Parameter attributes object. To fine-tuning network training process, user + can set parameter's attributes to control training details. Such as learning rate, + regularization, trainable, do_model_average and the method to initialize param. + + + Args: + name(str): The parameter's name. Default None. + initializer(Initializer): The method to initial this parameter. Default None. + learning_rate(float): The parameter's learning rate. The learning rate when + optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`. + Default 1.0. + regularizer(WeightDecayRegularizer): Regularization factor. Default None. + trainable(bool): Whether this parameter is trainable. Default True. + gradient_clip(BaseGradientClipAttr): The method to clip this parameter's + gradient. Default None. + do_model_average(bool): Whether this parameter should do model average. + Default False. + + Examples: + .. code-block:: python + + w_param_attrs = fluid.ParamAttr(name="fc_weight", + learning_rate=0.5, + regularizer=fluid.L2Decay(1.0), + trainable=True) + y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) + """ + def __init__(self, name=None, initializer=None, @@ -29,7 +58,7 @@ class ParamAttr(object): regularizer=None, trainable=True, gradient_clip=None, - do_model_average=None): + do_model_average=False): self.name = name self.initializer = initializer self.learning_rate = learning_rate @@ -39,6 +68,16 @@ class ParamAttr(object): self.model_average = do_model_average def set_default_initializer(self, initializer): + """ + Set the default initializer, the initializer should be Constant, + Uniform, Normal, Xavier, MSRA. + + Args: + initializer(Initializer): the initializer to set. + + Returns: + None + """ if initializer is None: if self.initializer is None: raise ValueError("ParamAttr.initializer is not set") @@ -50,13 +89,45 @@ class ParamAttr(object): self.initializer = initializer def set_default_param_initializer(self): + """ + Set the default initializer for the parameter with Xavier. + + Args: + None. + + Returns: + None. + """ self.set_default_initializer(Xavier()) def set_default_bias_initializer(self): + """ + Set the default initializer for the bias with Constant(0.0). + + Args: + None. + + Returns: + None. + """ self.set_default_initializer(Constant(0.0)) @staticmethod def to_attr(arg): + """ + Create ParamAttr[s]. + + Args: + arg: Arguments to initialize ParamAttr[s]. arg's type can be + str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr, + bool, ParamAttr, or a list of above type. + + Returns: + ParamAttr[s]: ParamAttr[s] initialized with arg. + + Raises: + arg can not initialize a ParamAttr. + """ if arg is None: return ParamAttr() elif isinstance(arg, list) or isinstance(arg, tuple): @@ -75,6 +146,15 @@ class ParamAttr(object): raise TypeError("{0} cast to ParamAttr".format(type(arg))) def to_kwargs(self, with_initializer=False): + """ + Returns the attributes of this parameter. + + Args: + with_initializer(bool): Whether to add initializer attr. + + Returns: + Parameter attributes(map): The attributes of this parameter. + """ kwargs = { 'name': self.name, 'optimize_attr': { @@ -92,9 +172,27 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): """ - Used for weight normalization. Any field in ParamAttr can also be set here. - Besides, an extra field dim can be set to indicate the dimension except - which to normalize. + Used for weight Norm. Weight Norm is a reparameterization of the weight vectors + in a neural network that decouples the length of those weight vectors from + their direction. Weight Norm has been implemented as discussed in this + paper: `Weight Normalization: A Simple Reparameterization to Accelerate + Training of Deep Neural Networks + `_. + + Args: + dim(list): The parameter's name. Default None. + kwargs: Any field in ParamAttr. Default None. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, + size=1000, + param_attr=WeightNormParamAttr( + dim=None, + name='weight_norm_param')) + """ # List to record the parameters reparameterized by weight normalization. # If these parameters are treated as Variable rather than Parameter, diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e2bd1d4c9a1ea5ddc0dfd19c769dcb40bfd6d04c..6a321ae024dcb50452bc4d96d7e7e70f590a42c6 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None): counters/options for profiling by `config` argument. The default config is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d', 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. + Then users can use NVIDIA Visual Profiler + (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this + this output file to visualize results. Args: output_file (string) : The output file name, the result will be @@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None): Comma separated values format. It should be 'kvp' or 'csv'. config (list of string) : The profiler options and counters can refer to "Compute Command Line Profiler User Guide". + + Raises: + ValueError: If `output_mode` is not in ['kvp', 'csv']. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle.fluid.profiler as profiler + + epoc = 8 + dshape = [4, 3, 28, 28] + data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + # then use NVIDIA Visual Profiler (nvvp) to load this output file + # to visualize results. """ if output_mode is None: output_mode = 'csv' @@ -69,19 +99,52 @@ def cuda_profiler(output_file, output_mode=None, config=None): def reset_profiler(): - """The profiler clear interface. - reset_profiler will clear the previous time record. + """ + Clear the previous time record. This interface does not work for + `fluid.profiler.cuda_profiler`, it only works for + `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`, + and `fluid.profiler.profiler`. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + with profiler.profiler(state, 'total', '/tmp/profile'): + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # ... """ core.reset_profiler() def start_profiler(state): - """Enable the profiler. + """ + Enable the profiler. Uers can use `fluid.profiler.start_profiler` and + `fluid.profiler.stop_profiler` to insert the code, except the usage of + `fluid.profiler.profiler` interface. Args: state (string) : The profiling state, which should be 'CPU', 'GPU' or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling GPU as well. 'All' also generates timeline. + + Raises: + ValueError: If `state` is not in ['CPU', 'GPU', 'All']. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + + profiler.start_profiler('GPU') + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # except each iteration + profiler.stop_profiler('total', '/tmp/profile') """ if core.is_profiler_enabled(): return @@ -97,7 +160,10 @@ def start_profiler(state): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): - """Stop the profiler. + """ + Stop the profiler. Uers can use `fluid.profiler.start_profiler` and + `fluid.profiler.stop_profiler` to insert the code, except the usage of + `fluid.profiler.profiler` interface. Args: sorted_key (string) : If None, the profiling results will be printed @@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): The `ave` means sorting by the average execution time. profile_path (string) : If state == 'All', it will write a profile proto output file. + + Raises: + ValueError: If `sorted_key` is not in + ['calls', 'total', 'max', 'min', 'ave']. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + + profiler.start_profiler('GPU') + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # except each iteration + profiler.stop_profiler('total', '/tmp/profile') """ if not core.is_profiler_enabled(): return @@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): Different from cuda_profiler, this profiler can be used to profile both CPU and GPU program. By defalut, it records the CPU and GPU operator kernels, if you want to profile other program, you can refer the profiling tutorial - to add more records. + to add more records in C++ code. + + If the state == 'All', a profile proto file will be written to + `profile_path`. This file records timeline information during the execution. + Then users can visualize this file to see the timeline, please refer + https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md Args: state (string) : The profiling state, which should be 'CPU' or 'GPU', @@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): The `ave` means sorting by the average execution time. profile_path (string) : If state == 'All', it will write a profile proto output file. + + Raises: + ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is + not in ['calls', 'total', 'max', 'min', 'ave']. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + + with profiler.profiler('All', 'total', '/tmp/profile') as prof: + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[], + use_program_cache=True) + # ... """ start_profiler(state) yield diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 8d48e9abef0fb9861284c6302b30efb0e3994989..bd57772713057f12b876942de58ee43527e94834 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -36,6 +36,45 @@ def convert_reader_to_recordio_file( compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000, feed_order=None): + """ + Convert a Python Reader to a recordio file. + + Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for + details. + + Examples: + + >>> import paddle.fluid as fluid + >>> import paddle.dataset.mnist as mnist + >>> import paddle + >>> + >>> tmp_program = fluid.Program() + >>> with fluid.program_guard(tmp_program): + >>> img = fluid.layers.data(name='img', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace()) + >>> # mnist.recordio will be generated in current directory + >>> fluid.recordio_writer.convert_reader_to_recordio_file( + >>> filename="mnist.recordio", + >>> reader_creator=paddle.batch(mnist.train(), batch_size=32), + >>> feeder=feeder) + + Args: + filename(str): The recordio filename. + reader_creator(callable): The Python Reader Creator. See + :ref:`api_guide_python_reader`. + feeder(DataFeeder): The DataFeeder instance. Used to convert + :code:`reader_creator` to :code: `lod_tensor` + compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or + fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy` + by default. + max_num_records(int): Maximum number of records in one chuck. Each record + is each return value from reader function + feed_order(list): The order of variable names that the reader returns + + Returns: + int: the number of record that saved. + """ if feed_order is None: feed_order = feeder.feed_names counter = 0 @@ -58,6 +97,17 @@ def convert_reader_to_recordio_files( compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000, feed_order=None): + """ + convert a python reader to many recordio files. + + This API is basically same as :code:`convert_reader_to_recordio_file`, + instead of it will create many recordio files. Each file contains at + most :code:`batch_per_file` records. + + Please reference + :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more + details. + """ if feed_order is None: feed_order = feeder.feed_names f_name, f_ext = os.path.splitext(filename) diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index c4d6829599616cb3ea7791a189e7070974de6ae3..dac474d5ee76590a75311d6bf2c4cb2fe85b6c40 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -16,8 +16,8 @@ import framework from . import core __all__ = [ - 'append_regularization_ops', 'WeightDecayRegularizer', 'L1Decay', 'L2Decay', - 'L1DecayRegularizer', 'L2DecayRegularizer' + 'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer', + 'L2DecayRegularizer' ] @@ -36,7 +36,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None): set. It will be applied with regularizer. Returns: - list of (parameters, gradients) pair with the regularized gradient + list[(Variable, Variable)]: list of (parameters, gradients) \ + pair with the regularized gradient Raises: Exception: Unknown regularization type @@ -100,6 +101,24 @@ class WeightDecayRegularizer(object): class L2DecayRegularizer(WeightDecayRegularizer): """Implements the L2 Weight Decay Regularization + + Small values of L2 can help prevent over fitting the training data. + + .. math:: + + L2WeightDecay = reg\_coeff * parameter + + Args: + regularization_coeff(float): regularization coeff + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)) + optimizer.minimize(avg_cost) """ def __init__(self, regularization_coeff=0.0): @@ -154,6 +173,27 @@ class L2DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer): """Implements the L1 Weight Decay Regularization + + L1 regularization encourages sparsity. + + .. math:: + + L1WeightDecay = reg\_coeff * sign(parameter) + + Args: + regularization_coeff(float): regularization coeff + + Examples: + .. code-block:: python + + program = fluid.framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + regularizer=fluid.regularizer.L1DecayRegularizer(0.5)) """ def __init__(self, regularization_coeff=0.0): diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index b3117cf2e5e0513089e5e1146d49702fcc8b7ba6..ad28c9eff560507e5b326451159be3949353f58f 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -38,7 +38,7 @@ def inference_program(): return y_predict -def linear(): +def train_program(): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = inference_program() @@ -104,7 +104,7 @@ def main(use_cuda): # Directory for saving the trained model params_dirname = "fit_a_line.inference.model" - train(use_cuda, linear, params_dirname) + train(use_cuda, train_program, params_dirname) infer(use_cuda, inference_program, params_dirname) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 2df3da9cca7042222317de626460909f018cb107..8e222d26907e8fe697b596a67e62cc9df84afe0e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): if isinstance(event, fluid.EndStepEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index 224cca417e717bbcc54b58be6ac0219be207dea3..dbc7bc06c93157f271c79e85b6925468e861e57f 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): if isinstance(event, fluid.EndStepEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index c4b37df3a09f93fe965ae28ce783f06f5018020d..f690a0d2337137fa90951b76fc37c5b5f7c9140b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -127,9 +127,19 @@ def decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -138,10 +148,14 @@ def decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 113dda88ca974c9e6241f127091bd96fb2af4a70..8c74be0f08855c20f5aa3ecd75622a51e94a0304 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname): def event_handler(event): if isinstance(event, fluid.EndEpochEvent): test_reader = paddle.batch( - paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + paddle.dataset.imdb.test(word_dict), + batch_size=BATCH_SIZE, + drop_last=False) avg_cost, acc = trainer.test( reader=test_reader, feed_order=['words', 'label']) @@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=25000), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) trainer.train( num_epochs=1, diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index c6687e8ad7fcc45c82d6dcb2256e9055a81cc61c..5d9a47c9ba3db07f240b42732536f1ea37627a11 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -194,16 +194,16 @@ def train(word_dict, if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index b1a6b524d33cae97c8982ffb8f780b1b07761a09..74f96f456a8dc917b715d0f4308bb5ea41947f0b 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 0f3a4c9242a81a3c1fb90268245715a8e59a207a..a2fb186b86c9706ac1aff0de49defbfb06e2eb0f 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index bc8a1aafc82d62501cecfa71be0cc3851c75eae2..e214ced0b5593c60ebd4a69edff1e961bcb4a72a 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -76,8 +76,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') - for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +93,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + fluid.layers.fc(input=input_tmp[0], size=hidden_dim), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim) ]) lstm = fluid.layers.dynamic_lstm( @@ -210,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 23e5900f127a7a3253c551f8f7fbceba08382209..5238203317a6657a1357ebc2ae257cce8c1d993f 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=50) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -137,10 +147,14 @@ def decoder_decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores @@ -200,16 +214,16 @@ def train_main(use_cuda, is_sparse, is_local=True): if is_local: train_loop(framework.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 578b1162fbd7e3a1b1c0cc934406818f2e07e019..5f5c8544bbdb87421f129b201a0ebaf4cb8602a1 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -94,7 +94,7 @@ def train(nn_type, test_program = fluid.default_main_program().clone(for_test=True) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3) optimizer.minimize(avg_loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -151,16 +151,16 @@ def train(nn_type, if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 65d6552acc9b3d31a97a45290e4613a633fffa3c..937d8dd5b065f0c1fdfc052b0342b572e3fbd7ac 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 3118d88701e5f64ae50f7ee774ea8174aa7758eb..75bed06bd7a9b311ff9466589d6ecab2c37471ce 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 8818cf96fa8f08036f9e23aae786f67b5614b2b9..be347cd5315668dde0454d7959dbf9bcfa465b5f 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -56,7 +56,7 @@ BATCH_SIZE = 200 # fix the order of training data train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE) + paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False) # train_reader = paddle.batch( # paddle.reader.shuffle( diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py similarity index 100% rename from python/paddle/fluid/tests/test_concurrency.py rename to python/paddle/fluid/tests/no_test_concurrency.py diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py index ce3ba3ebc50d7b015f379b5e80b179463a7b231a..30b7a634a2b978df85d6432854ef12285460be44 100644 --- a/python/paddle/fluid/tests/test_data_feeder.py +++ b/python/paddle/fluid/tests/test_data_feeder.py @@ -22,12 +22,11 @@ class TestDataFeeder(unittest.TestCase): label = fluid.layers.data(name='label', shape=[1], dtype='int64') feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])]) - print(result) self.assertEqual(result['image'].shape(), [2, 1, 28, 28]) self.assertEqual(result['label'].shape(), [2, 1]) - self.assertEqual(result['image'].lod(), []) - self.assertEqual(result['label'].lod(), []) + self.assertEqual(result['image'].recursive_sequence_lengths(), []) + self.assertEqual(result['label'].recursive_sequence_lengths(), []) def test_lod_level_1_converter(self): # lod_level = 1 @@ -42,12 +41,12 @@ class TestDataFeeder(unittest.TestCase): # label = [1] * len(data) result = feeder.feed( [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])]) - print(result) self.assertEqual(result['sentences'].shape(), [9, 1]) self.assertEqual(result['label'].shape(), [3, 1]) - self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]]) - self.assertEqual(result['label'].lod(), []) + self.assertEqual(result['sentences'].recursive_sequence_lengths(), + [[3, 2, 4]]) + self.assertEqual(result['label'].recursive_sequence_lengths(), []) def test_lod_level_2_converter(self): # lod_level = 2 @@ -62,12 +61,12 @@ class TestDataFeeder(unittest.TestCase): # label = [1] * len(data) result = feeder.feed( [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])]) - print(result) self.assertEqual(result['paragraphs'].shape(), [9, 1]) self.assertEqual(result['label'].shape(), [2, 1]) - self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]]) - self.assertEqual(result['label'].lod(), []) + self.assertEqual(result['paragraphs'].recursive_sequence_lengths(), + [[2, 1], [3, 2, 4]]) + self.assertEqual(result['label'].recursive_sequence_lengths(), []) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index 013d72f418cf7ac11eb31fd221052039e896e203..b7e7f5801fbbe58626eeec5fc77736d04bb3cefb 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -13,44 +13,41 @@ # limitations under the License. import paddle.fluid as fluid -from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod -import numpy +from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor +import numpy as np import unittest class TestLoDTensor(unittest.TestCase): - def test_validate_lod(self): - lod = (1, 2, 1) - self.assertRaises(AssertionError, _validate_lod, lod, -1) - lod = [[1, 2], (2, 3)] - self.assertRaises(AssertionError, _validate_lod, lod, -1) - lod = [1, 2, 3] - self.assertRaises(AssertionError, _validate_lod, lod, -1) - + def test_pybind_lod(self): + tensor = fluid.LoDTensor() lod = [] - self.assertTrue(_validate_lod(lod, -1)) + tensor.set_recursive_sequence_lengths(lod) lod = [[], [1], [3]] - self.assertFalse(_validate_lod(lod, -1)) - lod = [[0], [-1], [3]] - self.assertFalse(_validate_lod(lod, -1)) + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) + lod = [[0], [2], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) - # Each level's sum should be equal to the number of items in the next level - # Moreover, last level's sum should be equal to the tensor height - lod = [[2, 3], [1, 3, 1, 2, 1]] - self.assertTrue(_validate_lod(lod, tensor_height=8)) - lod = [[1, 3], [2, 1, 3]] - self.assertFalse(_validate_lod(lod, tensor_height=6)) - lod = [[1, 3], [2, 1, 3, 4]] - self.assertFalse(_validate_lod(lod, tensor_height=5)) - - def test_convert_lod(self): lod = [[1, 2, 3]] - converted_lod = [[0, 1, 3, 6]] - self.assertEqual(_convert_lod(lod), converted_lod) + tensor.set_recursive_sequence_lengths(lod) + self.assertEqual(tensor.recursive_sequence_lengths(), lod) + tensor.set(np.random.random([6, 1]), fluid.CPUPlace()) + self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) + tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) + self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) + # Each level's sum should be equal to the number of items in the next level + # Moreover, last level's sum should be equal to the tensor height + lod = [[2, 3], [1, 3, 1, 2, 2]] + tensor.set_recursive_sequence_lengths(lod) + self.assertEqual(tensor.recursive_sequence_lengths(), lod) + tensor.set(np.random.random([8, 1]), fluid.CPUPlace()) + self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) lod = [[2, 3], [1, 3, 1, 2, 1]] - converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]] - self.assertEqual(_convert_lod(lod), converted_lod) + tensor.set_recursive_sequence_lengths(lod) + self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) + tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) + self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) def test_create_lod_tensor(self): # Create LoDTensor from a list @@ -60,19 +57,19 @@ class TestLoDTensor(unittest.TestCase): self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod, fluid.CPUPlace()) tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace()) - self.assertEqual(tensor.lod(), [[0, 3, 5]]) + self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod) # Create LoDTensor from numpy array - data = numpy.random.random([10, 1]) + data = np.random.random([10, 1]) lod = [[2, 1], [3, 3, 4]] tensor = create_lod_tensor(data, lod, fluid.CPUPlace()) - self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]]) + self.assertEqual(tensor.recursive_sequence_lengths(), lod) # Create LoDTensor from another LoDTensor, they are differnt instances new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]] new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace()) - self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]]) - self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]]) + self.assertEqual(tensor.recursive_sequence_lengths(), lod) + self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod) def test_create_random_int_lodtensor(self): # The shape of a word, commonly used in speech and NLP problem, is [1] @@ -83,7 +80,7 @@ class TestLoDTensor(unittest.TestCase): high = dict_size - 1 tensor = create_random_int_lodtensor(lod, shape, fluid.CPUPlace(), low, high) - self.assertEqual(tensor.lod(), [[0, 2, 5, 10]]) + self.assertEqual(tensor.recursive_sequence_lengths(), lod) self.assertEqual(tensor.shape(), [10, 1]) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 32176fc7e5c7f5654c6692d3fb123bb5fe445b7a..5f27864c140573086d07415f83caca708889a068 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -15,7 +15,7 @@ if(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 -list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 +list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 @@ -48,8 +48,7 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) -# FIXME(Yancey1989): this test would cost much more time on CUDAPlace -# since load cudnn libraries, so we use a longer timeout to make this -# unit test stability. -set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 60) +py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) +py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..e891ee932f1440001eb25b222f1f4613e97dfcb1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import time +import itertools + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class BenchmarkSuite(OpTest): + def timeit_function(self, callback, iters, *args, **kwargs): + assert iters != 0, "Iters should >= 1" + start = time.time() + for i in range(iters): + callback(*args, **kwargs) + elapse = time.time() - start + return elapse / iters + + def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol): + for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs, + fetch_list): + # the cpu version is baseline, expect gpu version keep same with cpu version. + expect = item_cpu_out + expect_t = np.array(item_cpu_out) + actual = item_gpu_out + actual_t = np.array(item_gpu_out) + var_name = variable if isinstance(variable, + basestring) else variable.name + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + var_name + ") has diff" + str(actual_t) + "\n" + + str(expect_t)) + self.assertListEqual(actual.lod(), + expect.lod(), + "Output (" + var_name + ") has different lod") + + def _get_input_names(self): + inputs = [] + for name, value in self.inputs.iteritems(): + if isinstance(value, list): + inputs.extend([sub_name for sub_name, _ in value]) + inputs.append(name) + return inputs + + def _get_output_names(self): + outputs = [] + for var_name, var in self.outputs.iteritems(): + if isinstance(var, list): + for sub_var_name, sub_var in var: + outputs.append(sub_var_name) + else: + outputs.append(var_name) + if len(outputs) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + outputs.append(str(out_name)) + return outputs + + def check_output_stability(self, atol=1e-8): + places = self._get_places() + if len(places) < 2: + return + cpu_outs, fetch_list = self._calc_output(places[0]) + gpu_outs, _ = self._calc_output(places[1]) + self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol) + + def timeit_output_with_place(self, place, iters): + return self.timeit_function(self.calc_output, iters, place) + + def timeit_output(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_output_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) + + def timeit_grad_with_place(self, place, iters=100): + inputs_to_check = self._get_input_names() + output_names = self._get_output_names() + return self.timeit_function( + self._get_gradient, + iters, + inputs_to_check, + place, + output_names, + no_grad_set=None) + + def timeit_grad(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_grad_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_grad_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..91a5f1bca4441d80489a02eb9283928e38321826 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +from benchmark import BenchmarkSuite +from op_test import OpTest + +# This is a demo op test case for operator benchmarking and high resolution number stability alignment. + + +class TestSumOp(BenchmarkSuite): + def setUp(self): + self.op_type = "sum" + self.customize_testcase() + self.customize_fetch_list() + + def customize_fetch_list(self): + """ + customize fetch list, configure the wanted variables. + >>> self.fetch_list = ["Out"] + """ + self.fetch_list = ["Out"] + # pass + + def customize_testcase(self): + # a test case + x0 = np.random.random((300, 400)).astype('float32') + x1 = np.random.random((300, 400)).astype('float32') + x2 = np.random.random((300, 400)).astype('float32') + + # NOTE: if the output is empty, then it will autofilled by benchmarkSuite. + # only the output dtype is used, the shape, lod and data is computed from input. + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + self.outputs = {"Out": x0 + x1 + x2} + + def test_check_output(self): + """ + compare the output with customized output. In this case, + you should set the correct output by hands. + >>> self.outputs = {"Out": x0 + x1 + x2} + """ + self.check_output(atol=1e-8) + + def test_output_stability(self): + # compare the cpu gpu output in high resolution. + self.check_output_stability() + + def test_timeit_output(self): + """ + perf the op, time cost will be averged in iters. + output example + >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818 + >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596 + """ + self.timeit_output(iters=100) + + def test_timeit_grad(self): + """ + perf the op gradient, time cost will be averged in iters. + output example + >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536 + >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653 + """ + self.timeit_grad(iters=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b611470fa1ff326df960c349b71006f52d586d8e..e056ef9952a519d6c4d580b27f1118a3a91f13af 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,13 +15,17 @@ import unittest import numpy as np import random +import time import itertools -import paddle.fluid.core as core import collections + +import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from paddle.fluid.framework import Program, OpProtoHolder +from paddle.fluid.framework import Program, OpProtoHolder, Variable +from testsuite import create_op, set_input, append_input_output, append_loss_ops def randomize_probability(batch_size, class_num, dtype='float32'): @@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'): return prob -def create_op(scope, op_type, inputs, outputs, attrs): - kwargs = dict() - - op_maker = core.op_proto_and_checker_maker - op_role_attr_name = op_maker.kOpRoleAttrName() - - if op_role_attr_name not in attrs: - attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) - - def __create_var__(name, var_name): - scope.var(var_name).get_tensor() - kwargs[name].append(var_name) - - for in_name, in_dup in Operator.get_op_inputs(op_type): - if in_name in inputs: - kwargs[in_name] = [] - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, _ = item[0], item[1] - __create_var__(in_name, sub_in_name) - else: - __create_var__(in_name, in_name) - - for out_name, out_dup in Operator.get_op_outputs(op_type): - if out_name in outputs: - kwargs[out_name] = [] - if out_dup: - sub_out = outputs[out_name] - for item in sub_out: - sub_out_name, _ = item[0], item[1] - __create_var__(out_name, sub_out_name) - else: - __create_var__(out_name, out_name) - - for attr_name in Operator.get_op_attr_names(op_type): - if attr_name in attrs: - kwargs[attr_name] = attrs[attr_name] - - return Operator(op_type, **kwargs) - - -def set_input(scope, op, inputs, place): - def __set_input__(var_name, var): - if isinstance(var, tuple) or isinstance(var, np.ndarray): - tensor = scope.find_var(var_name).get_tensor() - if isinstance(var, tuple): - tensor.set_lod(var[1]) - var = var[0] - tensor.set_dims(var.shape) - tensor.set(var, place) - elif isinstance(var, float): - scope.find_var(var_name).set_float(var) - elif isinstance(var, int): - scope.find_var(var_name).set_int(var) - - for in_name, in_dup in Operator.get_op_inputs(op.type()): - if in_name in inputs: - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, sub_in_val = item[0], item[1] - __set_input__(sub_in_name, sub_in_val) - else: - __set_input__(in_name, inputs[in_name]) - - def get_numeric_gradient(place, scope, op, @@ -173,54 +110,15 @@ def get_numeric_gradient(place, return gradient_flat.reshape(tensor_to_check.get_dims()) -def append_input_output(block, op_proto, np_list, is_input): - '''Insert VarDesc and generate Python variable instance''' - proto_list = op_proto.inputs if is_input else op_proto.outputs - - def create_var(block, name, np_list, var_proto): - if name not in np_list: - assert var_proto.intermediate, "{} not found".format(name) - shape = None - lod_level = None - else: - np_value = np_list[name] - if isinstance(np_value, tuple): - shape = list(np_value[0].shape) - lod_level = len(np_value[1]) - else: - shape = list(np_value.shape) - lod_level = 0 - return block.create_var( - dtype="float32", shape=shape, lod_level=lod_level, name=name) - - var_dict = {} - for var_proto in proto_list: - var_name = str(var_proto.name) - if is_input: - if (var_name not in np_list) and var_proto.dispensable: - continue - assert (var_name in np_list) or (var_proto.dispensable), \ - "Missing {} as input".format(var_name) - if var_proto.duplicable: - assert isinstance(np_list[var_name], list), \ - "Duplicable {} should be set as list".format(var_name) - var_list = [] - for (name, np_value) in np_list[var_name]: - var_list.append( - create_var(block, name, {name: np_value}, var_proto)) - var_dict[var_name] = var_list - else: - var_dict[var_name] = create_var(block, var_name, np_list, var_proto) - - return var_dict - - class OpTest(unittest.TestCase): @classmethod def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() + cls.call_once = False + cls.dtype = "float32" + cls.outputs = {} np.random.seed(123) random.seed(124) @@ -231,6 +129,31 @@ class OpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + def try_call_once(self, data_type): + if not self.call_once: + self.call_once = True + self.dtype = data_type + + def infer_dtype_from_inputs_outputs(self, inputs, outputs): + def infer_dtype(numpy_dict): + assert isinstance( + numpy_dict, + dict), "self.inputs, self.outputs must be numpy_dict" + for var_name, var_value in numpy_dict.iteritems(): + if isinstance(var_value, (np.ndarray, np.generic)): + self.try_call_once(var_value.dtype) + elif isinstance(var_value, (list, tuple)): + # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + if len(var_value) > 1 and isinstance(var_value[1], ( + np.ndarray, np.generic)): + instance = var_value[1] + self.try_call_once(instance[1].dtype) + else: + self.try_call_once("float32") + + infer_dtype(inputs) + infer_dtype(outputs) + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -239,7 +162,7 @@ class OpTest(unittest.TestCase): tensor = core.LoDTensor() if isinstance(np_value, tuple): tensor.set(np_value[0], place) - tensor.set_lod(np_value[1]) + tensor.set_recursive_sequence_lengths(np_value[1]) else: tensor.set(np_value, place) feed_map[name] = tensor @@ -247,25 +170,22 @@ class OpTest(unittest.TestCase): tensor = core.LoDTensor() if isinstance(self.inputs[var_name], tuple): tensor.set(self.inputs[var_name][0], place) - tensor.set_lod(self.inputs[var_name][1]) + tensor.set_recursive_sequence_lengths(self.inputs[var_name][ + 1]) else: tensor.set(self.inputs[var_name], place) feed_map[var_name] = tensor return feed_map - def calc_output(self, place): - outs, _ = self._calc_output(place) - return outs - - def _calc_output(self, place): + def _append_ops(self, block): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) - - program = Program() - block = program.global_block() - - inputs = append_input_output(block, op_proto, self.inputs, True) - outputs = append_input_output(block, op_proto, self.outputs, False) + "infer datatype from inputs and outputs for this test case" + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + inputs = append_input_output(block, op_proto, self.inputs, True, + self.dtype) + outputs = append_input_output(block, op_proto, self.outputs, False, + self.dtype) op = block.append_op( type=self.op_type, inputs=inputs, @@ -275,22 +195,68 @@ class OpTest(unittest.TestCase): op.desc.infer_var_type(block.desc) op.desc.infer_shape(block.desc) - fetch_list = [] - for var_name, var in outputs.iteritems(): - if var_name in self.outputs: + def _get_io_vars(self, block, numpy_inputs): + inputs = {} + for name, value in numpy_inputs.iteritems(): + if isinstance(value, list): + var_list = [ + block.var(sub_name) for sub_name, sub_value in value + ] + inputs[name] = var_list + else: + inputs[name] = block.var(name) + return inputs + + def _get_inputs(self, block): + return self._get_io_vars(block, self.inputs) + + def _get_outputs(self, block): + return self._get_io_vars(block, self.outputs) + + def calc_output(self, place): + outs, _ = self._calc_output(place) + return outs + + def _calc_output(self, place, parallel=False): + + program = Program() + block = program.global_block() + self._append_ops(block) + + inputs = self._get_inputs(block) + outputs = self._get_outputs(block) + feed_map = self.feed_var(inputs, place) + + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) + + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + if len(fetch_list) == 0: + for var_name, var in outputs.iteritems(): if isinstance(var, list): for v in var: fetch_list.append(v) else: fetch_list.append(var) - - feed_map = self.feed_var(inputs, place) - - exe = Executor(place) - outs = exe.run(program, - feed=feed_map, - fetch_list=fetch_list, - return_numpy=False) + # if the fetch_list still empty, fill the fetch_list by the operator output. + if len(fetch_list) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + fetch_list.append(str(out_name)) + # fetch_list = map(block.var, fetch_list) + if not isinstance(fetch_list[0], Variable): + fetch_list = map(block.var, fetch_list) + outs = executor.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) return outs, fetch_list def check_output_with_place(self, place, atol): @@ -328,7 +294,8 @@ class OpTest(unittest.TestCase): str(place)) if isinstance(expect, tuple): self.assertListEqual( - actual.lod(), expect[1], "Output (" + sub_out_name + + actual.recursive_sequence_lengths(), expect[1], + "Output (" + sub_out_name + ") has different lod at " + str(place)) else: idx = find_actual(out_name, fetch_list) @@ -342,21 +309,23 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has diff at " + str(place) + str(actual_t) + "\n" + str(expect_t)) if isinstance(expect, tuple): - self.assertListEqual(actual.lod(), expect[1], - "Output (" + out_name + + self.assertListEqual(actual.recursive_sequence_lengths(), + expect[1], "Output (" + out_name + ") has different lod at " + str(place)) - def check_output(self, atol=1e-5): - places = [core.CPUPlace()] + def _get_places(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) + return places + + def check_output(self, atol=1e-5): + places = self._get_places() for place in places: self.check_output_with_place(place, atol) def check_output_customized(self, checker): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: outs = self.calc_output(place) outs = [np.array(out) for out in outs] @@ -389,9 +358,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, @@ -438,41 +405,12 @@ class OpTest(unittest.TestCase): max_relative_error, "Gradient Check On %s" % str(place)) - @staticmethod - def _create_var_descs_(block, var_dict): - # FIXME: Try unify with `append_input_output` - for param_name in var_dict: - var = var_dict[param_name] - if not isinstance(var, list) and not isinstance(var, tuple): - var = [(param_name, var, None)] - if not isinstance(var[0], list) and not isinstance(var[0], tuple): - var = [(param_name, var[0], var[1])] - - for i, item in enumerate(var): - if not isinstance(item[0], basestring): - item = [[param_name] + list(item)] - if len(item) == 2: - if isinstance(item[1], tuple): - var[i] = [item[0], item[1][0], item[1][1]] - else: - # only set var name and value, set lod to None - var[i] = list(item) + [None] - var_descs = [(block.create_var( - name=name, shape=each.shape, dtype=each.dtype), each, lod) - for name, each, lod in var] - - yield param_name, var_descs - - @staticmethod - def _merge_list(iterable): - return reduce(lambda a, b: list(a) + list(b), iterable, []) - @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): tensor = core.LoDTensor() tensor.set(np_value, place) if lod is not None: - tensor.set_lod(lod) + tensor.set_recursive_sequence_lengths(lod) return tensor @staticmethod @@ -497,83 +435,31 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input - def _get_gradient(self, input_to_check, place, output_names, no_grad_set): + def _get_gradient(self, + input_to_check, + place, + output_names, + no_grad_set, + parallel=False): prog = Program() block = prog.global_block() - inputs_with_np = { - key: value - for (key, value) in OpTest._create_var_descs_( - block, getattr(self, 'inputs', {})) - } - outputs_with_np = { - key: val - for (key, val) in OpTest._create_var_descs_( - block, getattr(self, 'outputs', {})) - } - inputs = { - k: [item[0] for item in inputs_with_np[k]] - for k in inputs_with_np - } - outputs = { - k: [item[0] for item in outputs_with_np[k]] - for k in outputs_with_np - } - - op = block.append_op( - type=self.op_type, - inputs=inputs, - outputs=outputs, - attrs=getattr(self, 'attrs', {})) - - # infer variable type and infer shape in compile-time - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - - mean_inputs = map(block.var, output_names) - - if len(mean_inputs) == 1: - loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) - op = block.append_op( - inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - else: - avg_sum = [] - for cur_loss in mean_inputs: - cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) - op = block.append_op( - inputs={"X": [cur_loss]}, - outputs={"Out": [cur_avg_loss]}, - type="mean") - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - avg_sum.append(cur_avg_loss) - - loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) - op_sum = block.append_op( - inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') - op_sum.desc.infer_var_type(block.desc) - op_sum.desc.infer_shape(block.desc) - - loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) - op_loss = block.append_op( - inputs={"X": loss_sum}, - outputs={"Out": loss}, - type='scale', - attrs={'scale': 1.0 / float(len(avg_sum))}) - op_loss.desc.infer_var_type(block.desc) - op_loss.desc.infer_shape(block.desc) - + self._append_ops(block) + loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) - feed_dict = { - item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place) - for p_name in inputs_with_np for item in inputs_with_np[p_name] - } + inputs = self._get_inputs(block) + feed_dict = self.feed_var(inputs, place) fetch_list = [g for p, g in param_grad_list] - executor = Executor(place) + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) return map(np.array, executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c9c3c648717814c28c39a401487925824e885946..21f2037ad408b0a92718c0ea2bae5e8bf563c665 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import multiprocessing +import os import unittest import paddle.fluid as fluid import time @@ -23,6 +25,7 @@ __all__ = ['TestParallelExecutorBase'] class TestParallelExecutorBase(unittest.TestCase): def check_network_convergence(self, method, + use_cuda=True, memory_opt=True, iter=50, batch_size=None, @@ -53,7 +56,7 @@ class TestParallelExecutorBase(unittest.TestCase): adam.minimize(loss) if memory_opt: fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() @@ -64,7 +67,7 @@ class TestParallelExecutorBase(unittest.TestCase): if use_parallel_executor: exe = fluid.ParallelExecutor( - True, + use_cuda, loss_name=loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) @@ -72,11 +75,12 @@ class TestParallelExecutorBase(unittest.TestCase): exe = fluid.Executor(place=place) if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count() + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) - first_loss = np.array(first_loss) for i in xrange(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) @@ -89,8 +93,6 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) - last_loss = np.array(last_loss) - print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e04412f809cdd75d07d28a60f0c2f19041a684f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output() + + +class TestCase0(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestCase1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float64' + self.axis = 1 + + +class TestCase2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'int32' + self.axis = 0 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 4216d83653b27ec7f18034e576fbedbecc3f1cfe..01e5749bdb9729c697af1ae87d993a2da66217f8 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -128,7 +128,7 @@ def create_or_get_tensor(scope, var_name, var, place): tensor = scope.var(var_name).get_tensor() if var is not None: assert isinstance(var, np.ndarray) - tensor.set_lod([[]]) + tensor.set_recursive_sequence_lengths([]) tensor.set_dims(var.shape) tensor.set(var, place) return tensor diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 7976dd7c3f14390fb00bc8ab39121b6a686e3039..db5771f7b0ad74c73b81d502209c17dce3ce8457 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -20,6 +20,8 @@ from paddle.fluid.op import Operator class TestBeamSearchDecodeOp(unittest.TestCase): + """unittest of beam_search_decode_op""" + def setUp(self): self.scope = core.Scope() self.place = core.CPUPlace() @@ -32,32 +34,44 @@ class TestBeamSearchDecodeOp(unittest.TestCase): def test_get_set(self): ids = self.scope.var("ids").get_lod_tensor_array() - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="int64")) - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="int64")) - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], - np.array( - [0, 1, 2, 3, 4], dtype="int64")) - scores = self.scope.var("scores").get_lod_tensor_array() - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="float64")) - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="float64")) - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], - np.array( - [0, 1, 2, 3, 4], dtype="float64")) + # Construct sample data with 5 steps and 2 source sentences + # beam_size = 2, end_id = 1 + # start with start_id + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 1, 2]], np.array( + [0, 0], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 2, 4]], + np.array( + [2, 3, 4, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 2, 2, 4, 4]], + np.array( + [3, 1, 5, 4], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 1, 2, 3, 4]], + np.array( + [1, 1, 3, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 0, 0, 2, 2]], + np.array( + [5, 1], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor() @@ -69,16 +83,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase): Scores="scores", # outputs SentenceIds="sentence_ids", - SentenceScores="sentence_scores") + SentenceScores="sentence_scores", + beam_size=2, + end_id=1, ) beam_search_decode_op.run(self.scope, self.place) - expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] + expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]] self.assertEqual(sentence_ids.lod(), expected_lod) self.assertEqual(sentence_scores.lod(), expected_lod) expected_data = np.array( - [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64") + [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) self.assertTrue( np.array_equal(np.array(sentence_scores), expected_data)) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index bc708f3aff54f54d290684d68afa503a50a32dac..167451edd8c46c006c8019678a304a38f18cb946 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): + """unittest of beam_search_op""" + def setUp(self): self.scope = core.Scope() self._create_ids() + self._create_pre_scores() self._create_scores() self._create_pre_ids() self.scope.var('selected_ids') @@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase): def test_run(self): op = Operator( 'beam_search', - pre_ids="pre_ids", + pre_ids='pre_ids', + pre_scores='pre_scores', ids='ids', scores='scores', selected_ids='selected_ids', @@ -47,15 +51,27 @@ class BeamSearchOpTester(unittest.TestCase): end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() - print 'selected_ids', np.array(selected_ids) - print 'lod', selected_ids.lod() + selected_scores = self.scope.find_var("selected_scores").get_tensor() + self.assertTrue( + np.allclose( + np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) + self.assertTrue( + np.allclose( + np.array(selected_scores), + np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) + self.assertEqual(selected_ids.lod(), + [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') - tensor = create_tensor(self.scope, "pre_ids", np_data) + tensor = create_tensor(self.scope, 'pre_ids', np_data) + + def _create_pre_scores(self): + np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32') + tensor = create_tensor(self.scope, 'pre_scores', np_data) def _create_ids(self): - self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]] + self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]] np_data = np.array( [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index 87c11e7880e73b911f21dda77c1cc2b4850b3591..b04f25ef874cc6204211a4f5f5991a0ec8c473dd 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core def bilinear_interp_np(input, out_h, out_w, out_size): @@ -45,9 +46,9 @@ def bilinear_interp_np(input, out_h, out_w, out_size): out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + w1lambda*input[:, :, h, w+wid]) + \ - h1lambda*(w2lambda*input[:, :, h+hid, w] + - w1lambda*input[:, :, h+hid, w+wid]) - return out.astype("float32") + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + return out.astype(input.dtype) class TestBilinearInterpOp(OpTest): @@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp): self.out_size = np.array([65, 129]).astype("int32") +class TestBilinearInterpOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.init_test_case() + self.op_type = "bilinear_interp" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, + self.out_size) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + self.attrs = {'out_h': self.out_h, 'out_w': self.out_w} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + + +class TestCase1Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.input_shape = [2, 3, 128, 64] + self.out_h = 120 + self.out_w = 50 + + +class TestCase2Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.out_size = np.array([6, 15]).astype("int32") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index f7461ee6dab699064153332116449c8e20a0bac0..1a245fd756cb2bcaca720f10fa35fd3d2a45cd4d 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -65,23 +65,25 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): distance (numpy.array) : The distance of two entries with shape [M, N]. lod (list of int): The offsets of each input in this batch. """ - n = len(lod) - 1 + n = len(lod) m = distance.shape[1] match_indices = -1 * np.ones((n, m), dtype=np.int) match_dist = np.zeros((n, m), dtype=np.float32) - for i in range(len(lod) - 1): - bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], - match_dist[i, :]) + cur_offset = 0 + for i in range(n): + bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :], + match_indices[i, :], match_dist[i, :]) if match_type == 'per_prediction': - argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], - match_dist[i, :], dist_threshold) + argmax_match(distance[cur_offset:(cur_offset + lod[i]), :], + match_indices[i, :], match_dist[i, :], dist_threshold) + cur_offset += lod[i] return match_indices, match_dist class TestBipartiteMatchOpWithLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' - lod = [[0, 5, 11, 23]] + lod = [[5, 6, 12]] dist = np.random.random((23, 217)).astype('float32') match_indices, match_dist = batch_bipartite_match(dist, lod[0]) @@ -98,7 +100,7 @@ class TestBipartiteMatchOpWithLoD(OpTest): class TestBipartiteMatchOpWithoutLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' - lod = [[0, 8]] + lod = [[8]] dist = np.random.random((8, 17)).astype('float32') match_indices, match_dist = batch_bipartite_match(dist, lod[0]) @@ -115,7 +117,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): class TestBipartiteMatchOpWithPerPredictionType(OpTest): def setUp(self): self.op_type = 'bipartite_match' - lod = [[0, 5, 11, 23]] + lod = [[5, 6, 12]] dist = np.random.random((23, 237)).astype('float32') match_indices, match_dist = batch_bipartite_match(dist, lod[0], 'per_prediction', 0.5) diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index a31b7ea322ff0a351308bea5608b2af9b60ac582..4ce9a4783e2332b6882164a70e1462c6a6d31bef 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -81,15 +81,19 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type, n = target_box.shape[0] m = prior_box.shape[0] output_box = np.zeros((n, m, 4), dtype=np.float32) - for i in range(len(lod) - 1): + cur_offset = 0 + for i in range(len(lod)): if (code_type == "EncodeCenterSize"): - box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, - prior_box_var, output_box[lod[i]:lod[i + 1], :, :], + box_coder(target_box[cur_offset:(cur_offset + lod[i]), :], + prior_box, prior_box_var, + output_box[cur_offset:(cur_offset + lod[i]), :, :], code_type, box_normalized) elif (code_type == "DecodeCenterSize"): - box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box, - prior_box_var, output_box[lod[i]:lod[i + 1], :, :], + box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :], + prior_box, prior_box_var, + output_box[cur_offset:(cur_offset + lod[i]), :, :], code_type, box_normalized) + cur_offset += lod[i] return output_box @@ -99,7 +103,7 @@ class TestBoxCoderOp(OpTest): def setUp(self): self.op_type = "box_coder" - lod = [[0, 1, 2, 3, 4, 5]] + lod = [[1, 1, 1, 1, 1]] prior_box = np.random.random((10, 4)).astype('float32') prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((5, 10, 4)).astype('float32') @@ -120,13 +124,39 @@ class TestBoxCoderOp(OpTest): self.outputs = {'OutputBox': output_box} +class TestBoxCoderOpWithoutBoxVar(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 1, 2, 3, 4, 5]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.ones((10, 4)).astype('float32') + target_box = np.random.random((5, 10, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized) + + self.inputs = { + 'PriorBox': prior_box, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False + } + self.outputs = {'OutputBox': output_box} + + class TestBoxCoderOpWithLoD(OpTest): def test_check_output(self): self.check_output() def setUp(self): self.op_type = "box_coder" - lod = [[0, 4, 12, 20]] + lod = [[4, 8, 8]] prior_box = np.random.random((10, 4)).astype('float32') prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((20, 4)).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e22400a045ced16c46b0bf005155f621f249d263 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import unittest +import os +import tempfile + + +class TestCheckpoint(unittest.TestCase): + def setUp(self): + self.dirname = tempfile.mktemp() + self.max_num_checkpoints = 3 + self.epoch_interval = 1 + self.step_interval = 1 + self.trainer_id = 0 + self.chief = self.trainer_id == 0 + self.place = fluid.CPUPlace() + self.epoch_id = 100 + self.step_id = 20 + + def test_checkpoint(self): + self.save_checkpoint() + serial = fluid.io.get_latest_checkpoint_serial(self.dirname) + self.assertTrue(serial >= 0) + trainer_args = ["epoch_id", "step_id"] + epoch_id, step_id = fluid.io.load_trainer_args( + self.dirname, serial, self.trainer_id, trainer_args) + self.assertEqual(self.step_id, int(step_id)) + self.assertEqual(self.epoch_id, int(epoch_id)) + + program = fluid.Program() + with fluid.program_guard(program): + exe = fluid.Executor(self.place) + fluid.io.load_checkpoint(exe, self.dirname, serial, program) + + fluid.io.clean_checkpoint(self.dirname, delete_dir=True) + self.assertFalse(os.path.isdir(self.dirname)) + + def save_checkpoint(self): + config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints, + self.epoch_interval, self.step_interval) + + trainer_args = {} + trainer_args["epoch_id"] = self.epoch_id + trainer_args["step_id"] = self.step_id + + program = fluid.Program() + with fluid.program_guard(program): + program.global_block().create_var( + name="scale_0", + psersistable=True, + dtype="float32", + shape=[32, 32]) + + exe = fluid.Executor(self.place) + for i in xrange(10): + fluid.io.save_checkpoint(exe, config.checkpoint_dir, + self.trainer_id, trainer_args, program, + config.max_num_checkpoints) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py index 050df2801c98e8f4167cdd1b4dde858c9f9f07dd..23932194f0ca97954ec9ade3fdcaebd7a32749a0 100644 --- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py +++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py @@ -144,10 +144,10 @@ class TestChunkEvalOp(OpTest): starts = sorted(starts) self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks( infer, label, starts) - self.inputs = { - 'Inference': (infer, [starts]), - 'Label': (label, [starts]) - } + lod = [] + for i in range(len(starts) - 1): + lod.append(starts[i + 1] - starts[i]) + self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])} precision = float( self.num_correct_chunks ) / self.num_infer_chunks if self.num_infer_chunks else 0 diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 1e00d67d5480bfa77a60e1aed52cafac6e8242ca..e9f3c45dc40b3333fe7304f8e4313d156bd5374c 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -43,7 +43,7 @@ class TestConcatOp(OpTest): self.axis = 1 -class TestConcatOp2(OpTest): +class TestConcatOp2(TestConcatOp): def init_test_data(self): self.x0 = np.random.random((2, 3, 4, 5)).astype('float32') self.x1 = np.random.random((2, 3, 4, 5)).astype('float32') @@ -51,5 +51,16 @@ class TestConcatOp2(OpTest): self.axis = 1 +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype('float32') + self.x1 = np.random.random((1, 128, 170, 256)).astype('float32') + self.x2 = np.random.random((1, 128, 170, 256)).astype('float32') + self.axis = 1 + + def test_check_grad(self): + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py index f397f542bb07519886d75618e2a915c2dbf61fce..122b076c2d3e3a69f52a2c335e2bc89707b4fa9b 100644 --- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py +++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py @@ -22,9 +22,9 @@ from op_test import OpTest class CRFDecoding(object): def __init__(self, emission_weights, transition_weights, seq_start_positions): - assert (emission_weights.shape[0] == seq_start_positions[-1]) + assert (emission_weights.shape[0] == sum(seq_start_positions)) self.tag_num = emission_weights.shape[1] - self.seq_num = len(seq_start_positions) - 1 + self.seq_num = len(seq_start_positions) self.seq_start_positions = seq_start_positions self.x = emission_weights @@ -34,9 +34,9 @@ class CRFDecoding(object): self.w = transition_weights[2:, :] self.track = np.zeros( - (seq_start_positions[-1], self.tag_num), dtype="int64") + (sum(seq_start_positions), self.tag_num), dtype="int64") self.decoded_path = np.zeros( - (seq_start_positions[-1], 1), dtype="int64") + (sum(seq_start_positions), 1), dtype="int64") def _decode_one_sequence(self, decoded_path, x): seq_len, tag_num = x.shape @@ -71,9 +71,11 @@ class CRFDecoding(object): decoded_path[i - 1] = max_idx = track[i, max_idx] def decode(self): + cur_pos = 0 for i in range(self.seq_num): - start = self.seq_start_positions[i] - end = self.seq_start_positions[i + 1] + start = cur_pos + cur_pos += self.seq_start_positions[i] + end = cur_pos self._decode_one_sequence(self.decoded_path[start:end, :], self.x[start:end, :]) return self.decoded_path @@ -90,11 +92,13 @@ class TestCRFDecodingOp1(OpTest): TAG_NUM = 17 MAX_SEQ_LEN = 10 - lod = [[0]] + lod = [[]] + total_len = 0 for i in range(SEQ_NUM): - lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN)) + lod[-1].append(random.randint(1, MAX_SEQ_LEN)) + total_len += lod[-1][-1] emission = np.random.uniform(-1, 1, - [lod[-1][-1], TAG_NUM]).astype("float64") + [total_len, TAG_NUM]).astype("float64") transition = np.random.uniform(-0.5, 0.5, [TAG_NUM + 2, TAG_NUM]).astype("float64") @@ -126,7 +130,8 @@ class TestCRFDecodingOp2(OpTest): self.op_type = "crf_decoding" TAG_NUM = 5 - lod = [[0, 1, 3, 6, 10]] + lod = [[1, 2, 3, 4]] + total_len = sum(lod[-1]) transition = np.repeat( np.arange( TAG_NUM, dtype="float64").reshape(1, TAG_NUM), @@ -135,13 +140,13 @@ class TestCRFDecodingOp2(OpTest): emission = np.repeat( np.arange( TAG_NUM, dtype="float64").reshape(1, TAG_NUM), - lod[-1][-1], + total_len, axis=0) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") + low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64") predicted_labels = np.ones( - (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1) + (total_len, 1), dtype="int64") * (TAG_NUM - 1) expected_output = (labels == predicted_labels).astype("int64") self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 20cc3a643f1adfc04faad15e1b7baad3e22d9d29..4016089c01644f0389855ab114360f90c50a1bbe 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -42,9 +42,9 @@ class TestCropOp(OpTest): def setUp(self): self.op_type = "crop" self.crop_by_input = False + self.offset_by_input = False self.attrs = {} self.initTestCase() - self.attrs['offsets'] = self.offsets if self.crop_by_input: self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), @@ -55,6 +55,10 @@ class TestCropOp(OpTest): self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), } + if self.offset_by_input: + self.inputs['Offsets'] = np.array(self.offsets).astype('int32') + else: + self.attrs['offsets'] = self.offsets self.outputs = { 'Out': crop(self.inputs['X'], self.offsets, self.crop_shape) } @@ -101,5 +105,22 @@ class TestCase4(TestCropOp): self.crop_by_input = True +class TestCase5(TestCropOp): + def initTestCase(self): + self.x_shape = (3, 4, 5) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 0, 2] + self.offset_by_input = True + + +class TestCase6(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.crop_by_input = True + self.offset_by_input = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py index f166031a1cbbaa5e312f5c7919b39648d0dad013..131b4076f45ae25b45bb3f64da07a5c3aacc43d5 100644 --- a/python/paddle/fluid/tests/unittests/test_ctc_align.py +++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py @@ -22,14 +22,16 @@ from test_softmax_op import stable_softmax def CTCAlign(input, lod, blank, merge_repeated): lod0 = lod[0] result = [] - for i in range(len(lod0) - 1): + cur_offset = 0 + for i in range(len(lod0)): prev_token = -1 - for j in range(lod0[i], lod0[i + 1]): + for j in range(cur_offset, cur_offset + lod0[i]): token = input[j][0] if (token != blank) and not (merge_repeated and token == prev_token): result.append(token) prev_token = token + cur_offset += lod0[i] result = np.array(result).reshape([len(result), 1]).astype("int32") if len(result) == 0: result = np.array([-1]) @@ -39,7 +41,7 @@ def CTCAlign(input, lod, blank, merge_repeated): class TestCTCAlignOp(OpTest): def config(self): self.op_type = "ctc_align" - self.input_lod = [[0, 11, 18]] + self.input_lod = [[11, 7]] self.blank = 0 self.merge_repeated = False self.input = np.array( @@ -66,7 +68,7 @@ class TestCTCAlignOp(OpTest): class TestCTCAlignOpCase1(TestCTCAlignOp): def config(self): self.op_type = "ctc_align" - self.input_lod = [[0, 11, 19]] + self.input_lod = [[11, 8]] self.blank = 0 self.merge_repeated = True self.input = np.array( @@ -77,7 +79,7 @@ class TestCTCAlignOpCase1(TestCTCAlignOp): class TestCTCAlignOpCase2(TestCTCAlignOp): def config(self): self.op_type = "ctc_align" - self.input_lod = [[0, 4]] + self.input_lod = [[4]] self.blank = 0 self.merge_repeated = True self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32") diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f545ad155ccd28c2d34e424d307eed49b37f20fb..05d3367ad8ec2bc3df794015a7c25e943a26c68c 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -74,13 +74,13 @@ class TestDetectionMAPOp(OpTest): self.evaluate_difficult = True self.ap_type = "integral" - self.label_lod = [[0, 2, 4]] + self.label_lod = [[2, 2]] # label difficult xmin ymin xmax ymax self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8], [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]] # label score xmin ymin xmax ymax difficult - self.detect_lod = [[0, 3, 7]] + self.detect_lod = [[3, 4]] self.detect = [ [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3], [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4], @@ -89,7 +89,7 @@ class TestDetectionMAPOp(OpTest): ] # label score true_pos false_pos - self.tf_pos_lod = [[0, 3, 7]] + self.tf_pos_lod = [[3, 4]] self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]] @@ -112,15 +112,19 @@ class TestDetectionMAPOp(OpTest): for i, count in enumerate(class_pos_count): class_pos_count_dict[i] = count - for i in range(len(true_pos_lod[0]) - 1): - start = true_pos_lod[0][i] - end = true_pos_lod[0][i + 1] + cur_pos = 0 + for i in range(len(true_pos_lod[0])): + start = cur_pos + cur_pos += true_pos_lod[0][i] + end = cur_pos for j in range(start, end): true_pos_dict[i].append(true_pos[j]) - for i in range(len(false_pos_lod[0]) - 1): - start = false_pos_lod[0][i] - end = false_pos_lod[0][i + 1] + cur_pos = 0 + for i in range(len(false_pos_lod[0])): + start = cur_pos + cur_pos += false_pos_lod[0][i] + end = cur_pos for j in range(start, end): false_pos_dict[i].append(false_pos[j]) @@ -130,19 +134,19 @@ class TestDetectionMAPOp(OpTest): label_number = self.class_num out_class_pos_count = [] - out_true_pos_lod = [0] + out_true_pos_lod = [] out_true_pos = [] - out_false_pos_lod = [0] + out_false_pos_lod = [] out_false_pos = [] for i in range(label_number): out_class_pos_count.append([label_count[i]]) true_pos_list = true_pos[i] out_true_pos += true_pos_list - out_true_pos_lod.append(len(out_true_pos)) + out_true_pos_lod.append(len(true_pos_list)) false_pos_list = false_pos[i] out_false_pos += false_pos_list - out_false_pos_lod.append(len(out_false_pos)) + out_false_pos_lod.append(len(false_pos_list)) return out_class_pos_count, out_true_pos, [ out_true_pos_lod @@ -241,7 +245,7 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp): self.evaluate_difficult = False - self.tf_pos_lod = [[0, 2, 6]] + self.tf_pos_lod = [[2, 4]] # label score true_pos false_pos self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]] @@ -267,9 +271,9 @@ class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() self.class_pos_count = [0, 2, 1] - self.true_pos_lod = [[0, 0, 3, 5]] + self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] - self.false_pos_lod = [[0, 0, 3, 5]] + self.false_pos_lod = [[0, 3, 2]] self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 2314bb2ed8a4eeb34752fd5d040f8a8476798aa6..562e66b0625083fe840d64967249f0215cfda1f9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -16,6 +16,7 @@ import os import time import unittest from multiprocessing import Process +import signal import numpy @@ -24,9 +25,6 @@ import paddle.fluid.layers as layers class TestSendOp(unittest.TestCase): - @unittest.skip( - "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest." - ) def test_send(self): # Run init_serv in a thread place = fluid.CPUPlace() @@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase): p.daemon = True p.start() - time.sleep(10) + self.ps_timeout = 5 + self._wait_ps_ready(p.pid) + with open("/tmp/paddle.%d.port" % p.pid, "r") as fn: selected_port = int(fn.readlines()[0]) self.init_client(place, selected_port) @@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase): self.assertTrue(numpy.allclose(self.local_out, self.dist_out)) # FIXME(typhoonzero): find a way to gracefully shutdown the server. - os.system("kill -9 %d" % p.pid) + os.kill(p.pid, signal.SIGKILL) p.join() + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + def init_serv(self, place): main = fluid.Program() @@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase): dtype="float32", persistable=False, shape=[32, 32]) - o = layers.Send("127.0.0.1:%d" % port, [x], [get_var]) + fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) + layers.Send("127.0.0.1:%d" % port, [x]) + o = layers.Recv("127.0.0.1:%d" % port, [get_var]) + exe = fluid.Executor(place) self.dist_out = exe.run(main, fetch_list=o) # o is a list diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 32647f9aa81431a3ecc798df6f1360a14fd978af..b4379ad447e01683325dfcbb6a5b322f0b8eac3d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops @@ -54,10 +55,10 @@ class TestDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send_vars", "send_barrier", "recv", "recv", + "split_byref", "send", "send_barrier", "recv", "recv", "fetch_barrier", "concat" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 22329390754d8d010dced0d1aca35617140cd097..0f289af284773caf8515f9cbdd38e0d4481e4e44 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -30,9 +30,6 @@ class Memory(object): assert val.dtype == self.ex.dtype self.cur = val - def ex(self): - return self.ex - def next(self): self.ex = self.cur self.cur = None @@ -139,16 +136,16 @@ class BaseRNN(object): feed_dict = dict() for iname in self.inputs: - lod = [0] + lod = [] np_flatten = [] for seq_id in xrange(len(self.inputs[iname])): seq_len = len(self.inputs[iname][seq_id]) - lod.append(lod[-1] + seq_len) + lod.append(seq_len) np_flatten.extend(self.inputs[iname][seq_id]) t = fluid.Tensor() t.set(numpy.array(np_flatten), place) - t.set_lod([lod]) + t.set_recursive_sequence_lengths([lod]) feed_dict[iname] = t for pname in self.params: diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index d3f63ee2c414a71309be8f0af6d3e5912078ecdb..92e718662dfd7998be3ede2994f160059679fa8a 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -39,20 +39,20 @@ class TestDyRnnStaticInput(unittest.TestCase): def prepare_x_tensor(self): self.x_tensor_dim = 10 - lod = [[0, 2, 3, 6]] - shape = [lod[0][-1], self.x_tensor_dim] + lod = [[2, 1, 3]] + shape = [sum(lod[0]), self.x_tensor_dim] self.x_tensor_data = np.random.random(shape).astype('float32') self.x_tensor = core.LoDTensor() - self.x_tensor.set_lod(lod) + self.x_tensor.set_recursive_sequence_lengths(lod) self.x_tensor.set(self.x_tensor_data, self.place) def prepare_static_input_tensor(self): self.static_input_tensor_dim = 4 - lod = [[0, 1, 3, 6]] - shape = [lod[0][-1], self.static_input_tensor_dim] + lod = [[1, 2, 3]] + shape = [sum(lod[0]), self.static_input_tensor_dim] self.static_input_data = np.random.random(shape).astype('float32') self.static_input_tensor = core.LoDTensor() - self.static_input_tensor.set_lod(lod) + self.static_input_tensor.set_recursive_sequence_lengths(lod) self.static_input_tensor.set(self.static_input_data, self.place) def fetch_value(self, var): @@ -69,7 +69,7 @@ class TestDyRnnStaticInput(unittest.TestCase): ndarray = np.zeros(shape=dims).astype('float32') for i in xrange(np.product(dims)): ndarray.ravel()[i] = lod_tensor.get_float_element(i) - return ndarray, lod_tensor.lod() + return ndarray, lod_tensor.recursive_sequence_lengths() def build_graph(self, only_forward=False): x_tensor = fluid.layers.data( @@ -131,21 +131,20 @@ class TestDyRnnStaticInput(unittest.TestCase): framework.grad_var_name('static_input_tensor')) return static_input_grad, loss - def get_seq_len_from_lod(self, lod): - return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)] - def get_expected_static_step_outs(self): - x_lod = self.x_tensor.lod() - x_seq_len = self.get_seq_len_from_lod(x_lod) + x_lod = self.x_tensor.recursive_sequence_lengths() + x_seq_len = x_lod[0] x_seq_len_sorted = sorted(x_seq_len) x_sorted_indices = np.argsort(x_seq_len)[::-1] - static_lod = self.static_input_tensor.lod() - static_sliced = [ - self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]] - for i in xrange(len(static_lod[0]) - 1) - ] - static_seq_len = self.get_seq_len_from_lod(static_lod) + static_lod = self.static_input_tensor.recursive_sequence_lengths() + static_sliced = [] + cur_offset = 0 + for i in xrange(len(static_lod[0])): + static_sliced.append(self.static_input_data[cur_offset:( + cur_offset + static_lod[0][i])]) + cur_offset += static_lod[0][i] + static_seq_len = static_lod[0] static_reordered = [] for i in xrange(len(x_sorted_indices)): static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist()) @@ -159,11 +158,13 @@ class TestDyRnnStaticInput(unittest.TestCase): for i in xrange(self._max_sequence_len): end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1) - lod = [0] + lod = [] + total_len = 0 for i in xrange(end): - lod.append(static_seq_len_reordered[i] + lod[-1]) + lod.append(static_seq_len_reordered[i]) + total_len += lod[-1] static_step_lods.append([lod]) - end = lod[-1] + end = total_len static_step_outs.append( np.array(static_reordered[:end]).astype('float32')) @@ -199,7 +200,9 @@ class TestDyRnnStaticInput(unittest.TestCase): self.static_input_tensor.set_float_element(i, origin) numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2 self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001)) - self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod())) + self.assertTrue( + np.allclose(actual_lod, + self.static_input_tensor.recursive_sequence_lengths())) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py index 2957fb50586c8bce74bbf8066e0e9bf24d79cb7d..816562621b4fc749f3c6b0eca8ee3c5850ef1ba9 100644 --- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py +++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py @@ -52,23 +52,29 @@ class TestEditDistanceOp(OpTest): def setUp(self): self.op_type = "edit_distance" normalized = False - x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64") - x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64") + x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64") + x2 = np.array([[12, 4, 7, 8]]).astype("int64") x1 = np.transpose(x1) x2 = np.transpose(x2) - x1_lod = [0, 1, 5] - x2_lod = [0, 3, 4] + x1_lod = [1, 4] + x2_lod = [3, 1] - num_strs = len(x1_lod) - 1 + num_strs = len(x1_lod) distance = np.zeros((num_strs, 1)).astype("float32") sequence_num = np.array(2).astype("int64") + + x1_offset = 0 + x2_offset = 0 for i in range(0, num_strs): distance[i] = Levenshtein( - hyp=x1[x1_lod[i]:x1_lod[i + 1]], - ref=x2[x2_lod[i]:x2_lod[i + 1]]) + hyp=x1[x1_offset:(x1_offset + x1_lod[i])], + ref=x2[x2_offset:(x2_offset + x2_lod[i])]) + x1_offset += x1_lod[i] + x2_offset += x2_lod[i] if normalized is True: - len_ref = x2_lod[i + 1] - x2_lod[i] + len_ref = x2_lod[i] distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} self.outputs = {'Out': distance, 'SequenceNum': sequence_num} @@ -81,23 +87,29 @@ class TestEditDistanceOpNormalized(OpTest): def setUp(self): self.op_type = "edit_distance" normalized = True - x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64") - x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64") + x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64") + x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64") x1 = np.transpose(x1) x2 = np.transpose(x2) - x1_lod = [0, 1, 3, 6] - x2_lod = [0, 2, 3, 5] + x1_lod = [1, 2, 3] + x2_lod = [2, 1, 2] - num_strs = len(x1_lod) - 1 + num_strs = len(x1_lod) distance = np.zeros((num_strs, 1)).astype("float32") sequence_num = np.array(3).astype("int64") + + x1_offset = 0 + x2_offset = 0 for i in range(0, num_strs): distance[i] = Levenshtein( - hyp=x1[x1_lod[i]:x1_lod[i + 1]], - ref=x2[x2_lod[i]:x2_lod[i + 1]]) + hyp=x1[x1_offset:(x1_offset + x1_lod[i])], + ref=x2[x2_offset:(x2_offset + x2_lod[i])]) + x1_offset += x1_lod[i] + x2_offset += x2_lod[i] if normalized is True: - len_ref = x2_lod[i + 1] - x2_lod[i] + len_ref = x2_lod[i] distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} self.outputs = {'Out': distance, 'SequenceNum': sequence_num} diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..bcdbfc8e527d0dc9a95eddaf040f8035207b6c20 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py @@ -0,0 +1,130 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from test_elementwise_add_op import * +''' +Some tests differ from the tests defined in test_elementwise_add_op.py +because MKLDNN does not support tensors of number of dimensions 3. +Such dimensions cause exceptions in MKLDNN reorder primitive. +''' + + +class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + self.out = self.x + self.y.reshape(2, 1, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 3, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2): + def init_input_output(self): + self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 1, 4) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_rowwise_add_0( + TestElementwiseAddOp_rowwise_add_0): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 3, 4, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_rowwise_add_1( + TestElementwiseAddOp_rowwise_add_1): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_channelwise_add( + TestElementwiseAddOp_channelwise_add): + def init_input_output(self): + self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 1f52bd90d0d49bda6c180019e90ebd923c91439c..fb9a496126f0b6efcad73590c78efe5a47b88cd6 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -18,19 +18,23 @@ from op_test import OpTest class TestElementwiseAddOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + def setUp(self): self.op_type = "elementwise_add" self.dtype = np.float32 self.axis = -1 self.init_dtype() self.init_input_output() + self.init_kernel_type() self.init_axis() self.inputs = { 'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {'axis': self.axis} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': self.out} def test_check_output(self): @@ -252,5 +256,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp): self.axis = 1 +class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(3, 10, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py index 9d724a6479f061996359b1efcc5f61f0564331c7..8b9da843115409c65055927d317867d1290c8f0e 100644 --- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py +++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py @@ -24,17 +24,16 @@ class TestFeedFetch(unittest.TestCase): input_array = np.ones((4, 4, 6)).astype("float32") input_array[0, 0, 0] = 3 input_array[3, 3, 5] = 10 - input_tensor = core.LoDTensor([[0, 2, 4]]) + input_tensor = core.LoDTensor([[2, 2]]) input_tensor.set(input_array, place) core.set_feed_variable(scope, input_tensor, "feed", 0) output_tensor = core.get_fetch_variable(scope, "feed", 0) - output_lod = output_tensor.lod() - self.assertEqual(0, output_lod[0][0]) + output_lod = output_tensor.recursive_sequence_lengths() + self.assertEqual(2, output_lod[0][0]) self.assertEqual(2, output_lod[0][1]) - self.assertEqual(4, output_lod[0][2]) output_array = np.array(output_tensor) self.assertEqual(3, output_array[0, 0, 0]) diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py index 533d8ccfac82a2e298af16181ab16bf7aa3db282..0c75cf33f5f208d11081a6802910c25553b8c4ec 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py @@ -55,7 +55,7 @@ class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest): self.op_type = "fill_constant_batch_size_like" self.inputs = { 'Input': (np.random.random((31, 28)).astype("float32"), - [[0, 9, 23, 31]]) + [[9, 14, 8]]) } self.attrs = { 'value': 3.5, diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae877a60818744f852d3af9a02ffebf5e2affc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_gaussian_random_op import TestGaussianRandomOp + + +class TestMKLDNN(TestGaussianRandomOp): + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 272caceaf38699438ccae41691bf26b2eb4d2a22..8481500fd78f0ccf34f09c66bec27e195b9aada3 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -25,7 +25,15 @@ class TestGaussianRandomOp(unittest.TestCase): def setUp(self): self.op_type = "gaussian_random" self.inputs = {} - self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10} + self.use_mkldnn = False + self.init_kernel_type() + self.attrs = { + "shape": [1000, 784], + "mean": .0, + "std": 1., + "seed": 10, + "use_mkldnn": self.use_mkldnn + } self.outputs = ["Out"] @@ -58,6 +66,9 @@ class TestGaussianRandomOp(unittest.TestCase): self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) + def init_kernel_type(self): + pass + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 3a13eb872a8646cede126b667864dfc3784ebd0b..8fbf1560859aa295fc40b36129d0f0d07d55dd9f 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -20,8 +20,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu class TestGRUOp(OpTest): - lod = [[0, 2, 6, 9]] - batch_size = lod[0][-1] + lod = [[2, 4, 3]] + batch_size = sum(lod[0]) frame_size = 5 activate = { 'identity': identity, @@ -33,10 +33,10 @@ class TestGRUOp(OpTest): @staticmethod def seq_to_batch(lod, is_reverse): idx_in_seq_list = [] - seq_starts = lod[0] - seq_lens = [] - for i in range(len(seq_starts) - 1): - seq_lens.append(seq_starts[i + 1] - seq_starts[i]) + seq_lens = lod[0] + seq_starts = [0] + for i in range(len(seq_lens)): + seq_starts.append(seq_starts[-1] + seq_lens[i]) sorted_seqs = sorted( range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x]) num_batch = seq_lens[sorted_seqs[0]] diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 587e2025e1045f63a5825f884d4dcad8b4685e62..15a72cb605911dfe957fb927763174521a30a085 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -364,5 +364,22 @@ class TestMSRAInitializer(unittest.TestCase): self.assertEqual(init_op.attr('seed'), 134) +class TestMSRAInitializer(unittest.TestCase): + def test_bilinear_initializer(self): + """Test the bilinear initializer with supplied arguments + """ + program = framework.Program() + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[8, 1, 3, 3], + lod_level=0, + name="param", + initializer=initializer.BilinearInitializer()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'assign_value') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py index 8f62ac20a5c13257a1519128292e2abc4962bf84..eff4212d91e609a7ef531280bbd3cf3671a59830 100644 --- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py +++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py @@ -58,8 +58,8 @@ class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp): def setUp(self): super(TestIOUSimilarityOpWithLoD, self).setUp() - self.boxes1_lod = [[0, 1, 2]] - self.output_lod = [[0, 1, 2]] + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} self.outputs = {'Out': (self.output, self.output_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 621a450fa6a6a8f47e3f1c1de609614b2359c33b..82074955fae7514d556ba9319c11beb250c4de11 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -387,6 +387,29 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_l2_normalize(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[8, 7, 10], dtype="float32") + output = layers.l2_normalize(x, axis=1) + + def test_maxout(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[8, 6, 6], dtype="float32") + output = layers.maxout(x=data, groups=2) + self.assertIsNotNone(output) + print(str(program)) + + def test_maxout(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 5], dtype="float32") + y = layers.data(name='y', shape=[2, 3], dtype="float32") + output = layers.crop(x, shape=y) + self.assertIsNotNone(output) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py index f49f7635f76c9feb5b5593438cb445df9488c69b..696d0ab4fa81a409a2bf0d6f6f23779ec26eb6d2 100644 --- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py @@ -105,11 +105,13 @@ class TestLinearChainCrfOp(OpTest): MAX_SEQ_LEN = 5 # the linear_chain_crf operator only supports sequence (LoD level = 1) - lod = [[0]] + lod = [[]] + seq_start_pos = [0] for i in range(SEQ_NUM): - lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN)) - emission = np.random.uniform(-1, 1, - [lod[-1][-1], TAG_NUM]).astype("float64") + lod[-1].append(random.randint(1, MAX_SEQ_LEN)) + seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1]) + emission = np.random.uniform( + -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64") emission_row_max = np.amax(emission, axis=1, keepdims=True) emission_exps = np.exp(emission - emission_row_max) @@ -118,14 +120,14 @@ class TestLinearChainCrfOp(OpTest): transition_exps = np.exp(transition) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") + low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64") self.inputs = { "Emission": (emission, lod), "Transition": transition, "Label": (labels, lod) } - crf = LinearChainCrfForward(lod[0], emission, emission_row_max, + crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max, emission_exps, transition, transition_exps, labels) alpha, log_likelihood = crf.crf_forward_compute() diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027ddc9c0b9dce9cedc5d1d20c0708647b6f..1cdc69501043d120b9e3cc8ccda3a1212d205886 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -57,30 +57,32 @@ class TestListenAndServOp(OpTest): def setUp(self): self.ps_timeout = 5 self.ip = "127.0.0.1" - self.port = "6173" + self.port = "0" self.trainers = 1 - self.trainer_id = 1 + self.trainer_id = 0 def _start_pserver(self, use_cuda, sync_mode): p = Process( target=run_pserver, args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, self.trainer_id)) + p.daemon = True p.start() - return p.pid + return p def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + start_left_time = self.ps_timeout + sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. @@ -88,18 +90,20 @@ class TestListenAndServOp(OpTest): def test_handle_signal_in_serv_op(self): # run pserver on CPU in sync mode - pid = self._start_pserver(False, True) - self._wait_ps_ready(pid) + p1 = self._start_pserver(False, True) + self._wait_ps_ready(p1.pid) # raise SIGTERM to pserver - os.kill(pid, signal.SIGTERM) + os.kill(p1.pid, signal.SIGINT) + p1.join() # run pserver on CPU in async mode - pid = self._start_pserver(False, False) - self._wait_ps_ready(pid) + p2 = self._start_pserver(False, False) + self._wait_ps_ready(p2.pid) # raise SIGTERM to pserver - os.kill(pid, signal.SIGTERM) + os.kill(p2.pid, signal.SIGTERM) + p2.join() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py index 093eecb8370b8ae7e4c43ce7ca6f50f5d302bd60..bac5e502318397b43e9867d5fc9e4e8cd33394b8 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py +++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py @@ -30,7 +30,8 @@ class TestLoDRankTable(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.random.random(size=(17, 100)), cpu) - tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) + tensor.set_recursive_sequence_lengths( + [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]]) exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py index 6b6d4c824aeae319dacf224408ce96a0d9c5bb35..77905c4b96499c855fd5c5e704b8051ccdb7a323 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py +++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py @@ -21,11 +21,15 @@ class TestLodResetOpByAttr(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] - target_lod_0 = [0, 7, 10] + lod = [[3, 2, 5]] + # target_offset_lod and target_lod are the same lod info represented + # in offset-based format and length-based format, respectively. + target_offset_lod = [0, 7, 10] + target_lod = [7, 3] self.inputs = {'X': (x, lod)} - self.attrs = {'target_lod': target_lod_0} - self.outputs = {'Out': (x, [target_lod_0])} + # The `target_lod` attribute is still based on offset + self.attrs = {'target_lod': target_offset_lod} + self.outputs = {'Out': (x, [target_lod])} def test_check_output(self): self.check_output() @@ -38,13 +42,16 @@ class TestLodResetOpByInput(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] - target_lod_0 = [0, 4, 7, 10] + lod = [[3, 2, 5]] + # target_offset_lod and target_lod are the same lod info represented + # in offset-based format and length-based format, respectively. + target_offset_lod = [0, 4, 7, 10] + target_lod = [4, 3, 3] self.inputs = { 'X': (x, lod), - 'Y': np.array([target_lod_0]).astype('int32') + 'Y': np.array([target_offset_lod]).astype('int32') } - self.outputs = {'Out': (x, [target_lod_0])} + self.outputs = {'Out': (x, [target_lod])} def test_check_output(self): self.check_output() @@ -57,15 +64,16 @@ class TestLodResetOpBoth(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] - target_lod_0_attr = [0, 7, 10] - target_lod_0_in = [0, 4, 7, 10] + lod = [[3, 2, 5]] + target_offset_lod_attr = [0, 7, 10] + target_offset_lod_in = [0, 4, 7, 10] + target_lod_in = [4, 3, 3] self.inputs = { 'X': (x, lod), - 'Y': np.array(target_lod_0_in).astype('int32') + 'Y': np.array(target_offset_lod_in).astype('int32') } - self.attrs = {'target_lod': target_lod_0_attr} - self.outputs = {'Out': (x, [target_lod_0_in])} + self.attrs = {'target_lod': target_offset_lod_attr} + self.outputs = {'Out': (x, [target_lod_in])} def test_check_output(self): self.check_output() @@ -78,11 +86,11 @@ class TestLodResetOpYIsLoDTensor(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] + lod = [[3, 2, 5]] y = np.random.random((10, 10)).astype("float32") - target_lod_0 = [[0, 4, 7, 10]] - self.inputs = {'X': (x, lod), 'Y': (y, target_lod_0)} - self.outputs = {'Out': (x, target_lod_0)} + target_lod = [[4, 3, 3]] + self.inputs = {'X': (x, lod), 'Y': (y, target_lod)} + self.outputs = {'Out': (x, target_lod)} def test_check_output(self): self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py index 63b17a5ccd62ed79b3d611e039c2b2705a133272..118c22fbb1ff6be5859ae9e4aed6218b0c77deec 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py @@ -27,7 +27,7 @@ class TestLoDTensorArray(unittest.TestCase): for i in xrange(10): t = core.LoDTensor() t.set(numpy.array([i], dtype='float32'), cpu) - t.set_lod([[0, 1]]) + t.set_recursive_sequence_lengths([[1]]) tensor_array.append(t) self.assertEqual(10, len(tensor_array)) @@ -35,17 +35,17 @@ class TestLoDTensorArray(unittest.TestCase): for i in xrange(10): t = tensor_array[i] self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32')) - self.assertEqual([[0, 1]], t.lod()) + self.assertEqual([[1]], t.recursive_sequence_lengths()) t = core.LoDTensor() t.set(numpy.array([i + 10], dtype='float32'), cpu) - t.set_lod([[0, 2]]) + t.set_recursive_sequence_lengths([[1]]) tensor_array[i] = t t = tensor_array[i] self.assertEqual( numpy.array(t), numpy.array( [i + 10], dtype='float32')) - self.assertEqual([[0, 2]], t.lod()) + self.assertEqual([[1]], t.recursive_sequence_lengths()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index 66a03640c148d769787593f41a44cd4d1aaa10b1..cebe6997bb4152519dabbabfc0404d6036bc4e65 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -29,7 +29,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) self.main( @@ -42,7 +42,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 9, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) self.main( @@ -55,7 +55,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(20).reshape(20, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]]) + tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]]) expect = [ numpy.array( @@ -65,7 +65,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): [17, 18, 19], dtype='int32') ] - lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]] + lod = [[[2, 3]], [[6, 6]], [[3]]] self.main( tensor=tensor, expect_array=expect, @@ -77,8 +77,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set( numpy.arange(31).reshape(31, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 5, 9, 11], - [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]]) + tensor.set_recursive_sequence_lengths( + [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]]) expect = [ numpy.array( @@ -88,7 +88,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]] ] - lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]] + lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]] self.main( tensor=tensor, expect_array=expect, @@ -99,8 +99,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(50).reshape(50, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13], - [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]]) + tensor.set_recursive_sequence_lengths( + [[2, 3, 1], [2, 3, 1, 4, 2, 1], + [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]]) expect = [ numpy.array( @@ -108,8 +109,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range( 22, 39) + range(7, 21), range(39, 46)] ] - lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]], - [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]] + lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]], + [[2], [6, 1]]] self.main( tensor=tensor, expect_array=expect, @@ -120,8 +121,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(50).reshape(50, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13], - [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]]) + tensor.set_recursive_sequence_lengths( + [[2, 3, 1], [2, 3, 1, 4, 2, 1], + [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]]) self.main( tensor=tensor, expect_array=None, @@ -162,12 +164,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): exp_tensor, exp_lod = exp exp_tensor = numpy.expand_dims(exp_tensor, axis=1) self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i]))) - self.assertEqual(exp_lod, array[i].lod()) + self.assertEqual(exp_lod, array[i].recursive_sequence_lengths()) def check_tensor_same(self, actual, expect): self.assertTrue( numpy.allclose(numpy.array(actual), numpy.array(expect))) - self.assertEqual(actual.lod(), expect.lod()) + self.assertEqual(actual.recursive_sequence_lengths(), + expect.recursive_sequence_lengths()) class TestCPULoDTensorArrayOpGrad(unittest.TestCase): @@ -188,7 +191,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) g_vars = program.global_block().var(x.name + "@GRAD") diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index f8ff5a3361af66612f08b2aa4eaffa363f04c594..705a24bd8f39a55e0a352944d961f8d33aaf96ff 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -84,15 +84,17 @@ def lstm( h = g_o * act_cell(c) return h, c - def _reverse(x, lod): + def _reverse(x, offset): y = np.zeros_like(x) - for i in range(len(lod) - 1): - b, e = lod[i], lod[i + 1] + for i in range(len(offset) - 1): + b, e = offset[i], offset[i + 1] y[b:e, :] = np.flip(x[b:e, :], 0) return y - offset = lod[0] - batch_size = len(offset) - 1 + offset = [0] + for l in lod[0]: + offset.append(offset[-1] + l) + batch_size = len(lod[0]) hidden = [] cell = [] input = _reverse(input, offset) if is_reverse else input @@ -100,7 +102,7 @@ def lstm( input = input + np.tile(w_b, (offset[-1], 1)) for i in range(batch_size): # compute one sequence - seq_len = offset[i + 1] - offset[i] + seq_len = lod[0][i] x = input[offset[i]:offset[i + 1], :] h_pre = h0[i] # 1 x D c_pre = c0[i] # 1 x D @@ -124,7 +126,7 @@ def lstm( class TestLstmOp(OpTest): def set_argument(self): - self.lod = [[0, 2, 5, 7]] + self.lod = [[2, 3, 2]] self.D = 16 self.act_gate = 'sigmoid' @@ -139,8 +141,8 @@ class TestLstmOp(OpTest): self.set_argument() self.op_type = 'lstm' - T = self.lod[0][-1] - N = len(self.lod[0]) - 1 + T = sum(self.lod[0]) + N = len(self.lod[0]) x = np.random.normal(size=(T, 4 * self.D)).astype('float64') if self.has_initial_state: @@ -186,7 +188,7 @@ class TestLstmOp(OpTest): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( (N, self.D)).astype('float64') @@ -194,107 +196,104 @@ class TestLstmOp(OpTest): ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) -class TestLstmOpHasInitial(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = True - self.is_reverse = True - self.use_peepholes = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], - max_relative_error=5e-4) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Bias')) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Weight')) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Weight', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Input')) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('H0')) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('C0')) - - -class TestLstmOpRerverse(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = True - - -class TestLstmOpNotUsePeepholes(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = False - +# class TestLstmOpHasInitial(TestLstmOp): +# def set_argument(self): +# self.lod = [[2, 3, 2]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = True +# self.is_reverse = True +# self.use_peepholes = True + +# def test_check_grad(self): +# # TODO(qingqing) remove folowing lines after the check_grad is refined. +# N = len(self.lod[0]) +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], +# max_relative_error=5e-4) + +# def test_check_grad_ingore_bias(self): +# N = len(self.lod[0]) +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Bias')) + +# def test_check_grad_ingore_weight(self): +# N = len(self.lod[0]) +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Weight')) + +# def test_check_grad_ingore_input(self): +# N = len(self.lod[0]) +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Weight', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Input')) + +# def test_check_grad_ingore_h0(self): +# N = len(self.lod[0]) +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('H0')) + +# def test_check_grad_ingore_c0(self): +# N = len(self.lod[0]) +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('C0')) + +# class TestLstmOpRerverse(TestLstmOp): +# def set_argument(self): +# self.lod = [[2, 3, 2]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = True + +# class TestLstmOpNotUsePeepholes(TestLstmOp): +# def set_argument(self): +# self.lod = [[2, 3, 2]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = False if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py index afff133f6c6cfe45d1aca4014dc8b92e6562e6b8..ed2262da4bc727657c2e65d69cb1922891e17b09 100644 --- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py @@ -64,15 +64,17 @@ def lstmp( r = act_proj(r) return r, c - def _reverse(x, lod): + def _reverse(x, offset): y = np.zeros_like(x) - for i in range(len(lod) - 1): - b, e = lod[i], lod[i + 1] + for i in range(len(offset) - 1): + b, e = offset[i], offset[i + 1] y[b:e, :] = np.flip(x[b:e, :], 0) return y - offset = lod[0] - batch_size = len(offset) - 1 + offset = [0] + for l in lod[0]: + offset.append(offset[-1] + l) + batch_size = len(lod[0]) # recurrent projection state projection = [] cell = [] @@ -81,7 +83,7 @@ def lstmp( input = input + np.tile(w_b, (offset[-1], 1)) for i in range(batch_size): # compute one sequence - seq_len = offset[i + 1] - offset[i] + seq_len = lod[0][i] x = input[offset[i]:offset[i + 1], :] r_pre = np.dot(h0[i], w_rh) # 1 x P r_pre = act_proj(r_pre) @@ -117,8 +119,8 @@ class TestLstmpOp(LstmTest.TestLstmOp): self.reset_argument() self.op_type = 'lstmp' - T = self.lod[0][-1] - N = len(self.lod[0]) - 1 + T = sum(self.lod[0]) + N = len(self.lod[0]) x = np.random.normal(size=(T, 4 * self.D)).astype('float64') if self.has_initial_state: @@ -166,7 +168,7 @@ class TestLstmpOp(LstmTest.TestLstmOp): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -183,7 +185,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -195,7 +197,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): max_relative_error=1e-2) def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -207,7 +209,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('Bias')) def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -219,7 +221,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('Weight')) def test_check_grad_ingore_proj_weight(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -231,7 +233,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('ProjWeight')) def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -243,7 +245,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('Input')) def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -255,7 +257,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('H0')) def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..64d42b693bf11f3cb0153243909db4c0612bf4e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +import unittest +import numpy as np +from op_test import OpTest + + +def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects, + in_mean_ious): + assert predictions.shape == labels.shape + predictions = predictions.flatten() + labels = labels.flatten() + + out_wrong = np.zeros([num_classes]).astype("int32") + for _, wrong in in_wrongs: + out_wrong += wrong + out_correct = np.zeros([num_classes]).astype("int32") + for _, correct in in_corrects: + out_correct += correct + + for pred, label in zip(predictions, labels): + if pred == label: + out_correct[pred] += 1 + else: + out_wrong[pred] += 1 + out_wrong[label] += 1 + + denominator = out_wrong + out_correct + valid_count = (denominator != 0).sum() + denominator = np.where(denominator > 0, denominator, + np.ones(denominator.shape)) + mean_iou = (out_correct / denominator).sum() / valid_count + + for _, in_mean_iou in in_mean_ious: + mean_iou += in_mean_iou + return mean_iou, out_wrong, out_correct + + +class TestMeanIOUOp(OpTest): + def setUp(self): + self.config() + self.op_type = "mean_iou" + predictions = np.random.randint(0, self.num_classes, + self.image_size).astype("int32") + labels = np.random.randint(0, self.num_classes, + self.image_size).astype("int32") + + in_wrongs = [] + for i in range(self.in_wrong_num): + in_wrongs.append(("in_wrong_%d" % i, np.random.randint( + 0, 10, [self.num_classes]).astype("int32"))) + + in_corrects = [] + for i in range(self.in_correct_num): + in_corrects.append(("in_correct_%d" % i, np.random.randint( + 0, 10, [self.num_classes]).astype("int32"))) + + in_mean_ious = [] + for i in range(self.in_mean_iou_num): + in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform( + 0, 1, [1]).astype("float32"))) + + self.inputs = { + 'Predictions': predictions, + 'Labels': labels, + 'InWrongs': in_wrongs, + 'InCorrects': in_corrects, + 'InMeanIou': in_mean_ious + } + self.attrs = {'num_classes': long(self.num_classes)} + mean_iou, out_wrong, out_correct = compute_mean_iou( + predictions, labels, self.num_classes, in_wrongs, in_corrects, + in_mean_ious) + self.outputs = { + 'OutMeanIou': mean_iou, + 'OutWrong': out_wrong, + 'OutCorrect': out_correct + } + + def config(self): + self.num_classes = 10 + self.image_size = [128, 128] + self.in_wrong_num = 0 + self.in_correct_num = 0 + self.in_mean_iou_num = 0 + + def test_check_output(self): + self.check_output() + + +class TestCase1(TestMeanIOUOp): + def config(self): + self.num_classes = 5 + self.image_size = [100, 128] + self.in_wrong_num = 2 + self.in_correct_num = 2 + self.in_mean_iou_num = 2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f209bdf30faffc0b2c7932b7b10f384d6d61a831 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py @@ -0,0 +1,38 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestMergeIdsOp(OpTest): + def setUp(self): + self.op_type = "merge_ids" + ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64') + x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32') + x1 = np.array([]).astype('float32') + x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6], + [0.5, 0.6]]).astype('float32') + out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3], + [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32') + self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py index c27573c3d69037bc48e0b6a90636b3f027f15a41..54ee85c1a7a539fe9517f32adb35ab99b5ae2a07 100644 --- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py +++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py @@ -70,7 +70,7 @@ class TestMineHardExamplesOp(OpTest): self.updated_match_indices = self.match_indices - self.neg_indices_lod = [[0, 1, 2]] + self.neg_indices_lod = [[1, 1]] self.neg_indices = np.array([[1], [0]]).astype('int32') @@ -92,7 +92,7 @@ class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp): self.updated_match_indices = np.array([[0, -1, -1], [-1, -1, -1]]).astype('int32') - self.neg_indices_lod = [[0, 1, 3]] + self.neg_indices_lod = [[1, 2]] self.neg_indices = np.array([[2], [0], [2]]).astype('int32') diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 862b7f8cb93620da4dd4673028776cfe565eeb0b..bbc782c1bce302df68ab30013f3a7667e51ed479 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -22,8 +22,8 @@ class TestMulOp(OpTest): def setUp(self): self.op_type = "mul" self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") + 'X': np.random.random((2, 5)).astype("float32"), + 'Y': np.random.random((5, 3)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} @@ -46,13 +46,16 @@ class TestMulOp2(OpTest): def setUp(self): self.op_type = "mul" self.inputs = { - 'X': np.random.random((15, 4, 12, 10)).astype("float32"), - 'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32") + 'X': np.random.random((3, 4, 4, 3)).astype("float32"), + 'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32") } - self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2} - result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10), - self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9)) - result = result.reshape(15, 4, 8, 2, 9) + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3), + self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) self.outputs = {'Out': result} def test_check_output(self): @@ -73,9 +76,9 @@ class TestMulOp2(OpTest): class TestFP16MulOp1(OpTest): def setUp(self): self.op_type = "mul" - x = np.random.random((32, 84)).astype("float16") - y = np.random.random((84, 100)).astype("float16") - self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} + x = np.random.random((3, 5)).astype("float16") + y = np.random.random((5, 4)).astype("float16") + self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)} self.outputs = {'Out': np.dot(x, y)} def test_check_output(self): @@ -88,13 +91,15 @@ class TestFP16MulOp1(OpTest): class TestFP16MulOp2(OpTest): def setUp(self): self.op_type = "mul" - x = np.random.random((15, 4, 12, 10)).astype("float16") - y = np.random.random((4, 30, 8, 2, 9)).astype("float16") - self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} - self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2} - result = np.dot( - x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9)) - result = result.reshape(15, 4, 8, 2, 9) + x = np.random.random((3, 4, 4, 3)).astype("float16") + y = np.random.random((2, 6, 1, 2, 3)).astype("float16") + self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)} + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) self.outputs = {'Out': result} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index 6459913c0162374e17d0249627e7107a195babf8..aacd8ae45af10a2b19d2903ab121e9bb4f9de7ff 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -135,12 +135,12 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, batch_size = scores.shape[0] det_outs = [] - lod = [0] + lod = [] for n in range(batch_size): nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background, score_threshold, nms_threshold, nms_top_k, keep_top_k) - lod.append(lod[-1] + nmsed_num) + lod.append(nmsed_num) if nmsed_num == 0: continue for c, indices in nmsed_outs.iteritems(): diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 6feda175fb537db894ac7f19e22297f6062a4d61..108a665f37f5cd652ec83f784a56ca52e6b49fe8 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -17,44 +17,23 @@ import numpy as np from op_test import OpTest -def norm(input, scale, epsilon): - s0, s1, s2, s3 = input.shape - x_square = input * input - for i in xrange(s0): - input_batch = input[i:i + 1, :, :, :] - input_batch = input_batch.reshape(s1, s2 * s3) - x_square_batch = x_square[i:i + 1, :, :, :] - x_square_batch = x_square_batch.reshape(s1, s2 * s3) - square_colsum = x_square_batch.sum(axis=0) + epsilon - tmp = pow(square_colsum, 0.5) - tmp = np.reciprocal(tmp) - tmp_tile = np.tile(tmp, s1) - tmp_tile = tmp_tile.reshape(s1, s2 * s3) - scale_tile = np.tile(scale, (1, s2 * s3)) - scale_tile = scale_tile.reshape(s1, s2 * s3) - out_batch = input_batch * tmp_tile * scale_tile - out_batch = out_batch.reshape(1, s1, s2, s3) - if i == 0: - out = out_batch - else: - out = np.concatenate((out, out_batch), 0) - out.reshape(s0, s1, s2, s3) - return out +def l2_norm(x, axis, epsilon): + x2 = x**2 + s = np.sum(x2, axis=axis, keepdims=True) + r = np.sqrt(s + epsilon) + y = x / np.broadcast_to(r, x.shape) + return y, r class TestNormOp(OpTest): def setUp(self): self.op_type = "norm" self.init_test_case() - input = np.random.random(self.shape).astype("float32") - scale = np.array([10, 10, 10]) - self.inputs = { - 'X': input.astype('float32'), - 'Scale': scale.astype('float32') - } - self.attrs = {'epsilon': self.epsilon} - output = norm(input, scale, self.epsilon) - self.outputs = {'Out': output.astype('float32')} + x = np.random.random(self.shape).astype("float64") + y, norm = l2_norm(x, self.axis, self.epsilon) + self.inputs = {'X': x} + self.attrs = {'epsilon': self.epsilon, 'axis': self.axis} + self.outputs = {'Out': y, 'Norm': norm} def test_check_output(self): self.check_output() @@ -63,8 +42,23 @@ class TestNormOp(OpTest): self.check_grad(['X'], 'Out') def init_test_case(self): - self.shape = [2, 3, 2, 2] - self.epsilon = 1e-6 + self.shape = [2, 3, 4, 4] + self.axis = 1 + self.epsilon = 1e-8 + + +class TestNormOp2(TestNormOp): + def init_test_case(self): + self.shape = [5, 3, 9, 7] + self.axis = 0 + self.epsilon = 1e-8 + + +class TestNormOp3(TestNormOp): + def init_test_case(self): + self.shape = [5, 3, 2, 7] + self.axis = -1 + self.epsilon = 1e-8 if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py index ef34893943d8f6bf91b1eb14378e463c178de84d..198c68866d399023c51c2a43b588aa8ec49c3c9a 100644 --- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py +++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py @@ -70,8 +70,9 @@ class TestNormalization(unittest.TestCase): def l2_normalize(self, data, axis, epsilon): """ Compute the groundtruth. """ - output = data * np.reciprocal( - np.sum(np.square(data), axis=axis, keepdims=True)) + output = data / np.broadcast_to( + np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)), + data.shape) return output def test_l2_normalize(self): diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py index cd78cce8729ab2b5a0bb4817cf3022e53932283a..d13f2b3afde10f9b4e632094fa216d8729069afa 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py @@ -27,9 +27,9 @@ class TestOneHotOp(OpTest): self.op_type = 'one_hot' depth = 10 dimension = 12 - x_lod = [[0, 4, 5, 8, 11]] - x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])] - x = np.array(x).astype('int').reshape([x_lod[0][-1], 1]) + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') @@ -50,9 +50,9 @@ class TestOneHotOp_default_dtype(OpTest): self.op_type = 'one_hot' depth = 10 dimension = 12 - x_lod = [[0, 4, 5, 8, 11]] - x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])] - x = np.array(x).astype('int').reshape([x_lod[0][-1], 1]) + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') @@ -75,11 +75,11 @@ class TestOneHotOp_exception(OpTest): self.place = core.CPUPlace() self.dimension = 12 self.x = core.LoDTensor() - x_lod = [[0, 4, 5, 8, 11]] - data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])] - data = np.array(data).astype('int').reshape([x_lod[0][-1], 1]) + x_lod = [[4, 1, 3, 3]] + data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))] + data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) self.x.set(data, self.place) - self.x.set_lod(x_lod) + self.x.set_recursive_sequence_lengths(x_lod) def test_check_output(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index e775db1d10f4561b6fb90631757a25c9f74cb777..7286c7c450108c4b5ad7136041bc4e989894a2ba 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) +class TestFtrlOptimizer(unittest.TestCase): + class MockFtrl(optimizer.FtrlOptimizer): + def get_accumulators(self): + return self._accumulators + + def get_squared_str(self): + return self._squared_acc_str + + def get_linear_str(self): + return self._linear_acc_str + + def test_ftrl_optimizer(self): + init_program = framework.Program() + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + learning_rate = 0.01 + ftrl_optimizer = self.MockFtrl( + learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5) + params_grads = append_backward(mean_out) + self.assertEqual(len(params_grads), 1) + self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) + opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out, + init_program) + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "ftrl"]) + + # Check accumulators + accumulators = ftrl_optimizer.get_accumulators() + self.assertEqual(len(accumulators), 2) + self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators) + self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators) + squared_acc = accumulators[ftrl_optimizer.get_squared_str()] + linear_acc = accumulators[ftrl_optimizer.get_linear_str()] + self.assertEqual(len(squared_acc), 1) + self.assertEqual(len(linear_acc), 1) + self.assertTrue(mul_x.name in squared_acc) + self.assertTrue(mul_x.name in linear_acc) + + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 3) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 66e138b03f3b170aca4fb2207438eb9af1783c33..63fb58c6927fa387b3b19147b9dc9d24bb8e5132 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -17,6 +17,7 @@ import paddle.fluid as fluid import unittest import paddle import numpy as np +import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) @@ -101,7 +102,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, class TestCRFModel(unittest.TestCase): - def check_network_convergence(self, is_sparse, build_strategy=None): + def check_network_convergence(self, + is_sparse, + build_strategy=None, + use_cuda=True): + os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -145,12 +150,12 @@ class TestCRFModel(unittest.TestCase): paddle.dataset.conll05.test(), buf_size=8192), batch_size=16) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) pe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, loss_name=avg_cost.name, build_strategy=build_strategy) @@ -164,33 +169,44 @@ class TestCRFModel(unittest.TestCase): data = train_data() for i in xrange(10): cur_batch = next(data) - print map(np.array, - pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name]))[0] + print pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])[0] + @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_all_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=False) + @unittest.skip(reason="CI hangs") def test_update_dense_parameter_all_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) + is_sparse=False, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy, use_cuda=False) + @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=False) + @unittest.skip(reason="CI hangs") def test_update_dense_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) + is_sparse=False, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 24f8d28c0304a77a99213374b25d0db728eca265..1f5d2f16773efb7537de85abec88344f8e0daa9f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import unittest import numpy as np import paddle +import os def Lenet(data, class_dim): @@ -35,7 +36,7 @@ def Lenet(data, class_dim): class TestFetchOp(unittest.TestCase): - def parallel_exe(self, train_inputs, seed): + def parallel_exe(self, train_inputs, seed, use_cuda): main = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -59,13 +60,13 @@ class TestFetchOp(unittest.TestCase): # conv2d_1.b_0@GRAD. Those variables should not be pruned. # fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=use_cuda, loss_name=loss.name, main_program=main) fetch_list = [] all_vars = main.global_block().vars @@ -74,7 +75,9 @@ class TestFetchOp(unittest.TestCase): fetch_list.append(k) for data in train_inputs: - ret = pe.run(fetch_list, feed=feeder.feed(data)) + ret = pe.run(fetch_list, + feed=feeder.feed(data), + return_numpy=True) for i in range(len(fetch_list)): assert not math.isnan(np.sum(ret[i])) and \ not math.isinf(np.sum(ret[i])) @@ -88,14 +91,16 @@ class TestFetchOp(unittest.TestCase): for i in range(iters): train_inputs.append(tst_reader_iter.next()) - self.parallel_exe(train_inputs, seed=1) + os.environ['CPU_NUM'] = str(4) + self.parallel_exe(train_inputs, seed=1, use_cuda=True) + self.parallel_exe(train_inputs, seed=1, use_cuda=False) class TestFeedParallel(unittest.TestCase): - def test_main(self): + def parallel_exe(self, use_cuda, seed): main = fluid.Program() startup = fluid.Program() - startup.random_seed = 1 + startup.random_seed = seed with fluid.scope_guard(fluid.core.Scope()): with fluid.program_guard(main, startup): data = fluid.layers.data( @@ -111,22 +116,30 @@ class TestFeedParallel(unittest.TestCase): regularization=fluid.regularizer.L2Decay(1e-4)) opt.minimize(loss) - place = fluid.CUDAPlace(0) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) reader = feeder.decorate_reader( paddle.batch( flowers.train(), batch_size=16), multi_devices=True) + exe = fluid.Executor(place) exe.run(startup) + pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=use_cuda, loss_name=loss.name, main_program=main) for batch_id, data in enumerate(reader()): - loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + loss_np = pe.run(feed=data, fetch_list=[loss.name])[0] print batch_id, loss_np if batch_id == 2: break + def test_feed_op(self): + os.environ['CPU_NUM'] = str(4) + self.parallel_exe(use_cuda=True, seed=1) + self.parallel_exe(use_cuda=False, seed=1) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 015703c3e25f4e11e64ab6a7de99da12bee608f6..a801d99aa1ced35eb7f081fde63ad541f0eb2589 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -18,6 +18,7 @@ import numpy as np import paddle import paddle.dataset.mnist as mnist import unittest +import os MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" @@ -85,6 +86,7 @@ def fc_with_batchnorm(use_feed): class TestMNIST(TestParallelExecutorBase): @classmethod def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) # Convert mnist to recordio file with fluid.program_guard(fluid.Program(), fluid.Program()): reader = paddle.batch(mnist.train(), batch_size=4) @@ -99,9 +101,12 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): - self.check_network_convergence(simple_fc_net) - self.check_network_convergence(simple_fc_net, allow_op_delay=True) + def check_simple_fc_convergence(self, + balance_parameter_opt_between_cards, + use_cuda=True): + self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') @@ -109,17 +114,21 @@ class TestMNIST(TestParallelExecutorBase): simple_fc_net, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_simple_fc(self): - self.check_simple_fc_convergence(False) + self.check_simple_fc_convergence(False, use_cuda=True) + self.check_simple_fc_convergence(False, use_cuda=False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(True, use_cuda=True) + self.check_simple_fc_convergence(True, use_cuda=False) def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards): + balance_parameter_opt_between_cards, + use_cuda=True): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -127,12 +136,14 @@ class TestMNIST(TestParallelExecutorBase): seed=1000, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1000, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, use_parallel_executor=True, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) @@ -143,28 +154,33 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(False, use_cuda=True) + self.check_simple_fc_parallel_accuracy(False, use_cuda=False) def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(True, use_cuda=True) + self.check_simple_fc_parallel_accuracy(True, use_cuda=False) - def check_batchnorm_fc_convergence(self, - balance_parameter_opt_between_cards): - self.check_network_convergence(fc_with_batchnorm) + def check_batchnorm_fc_convergence( + self, balance_parameter_opt_between_cards, use_cuda): + self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') self.check_network_convergence( fc_with_batchnorm, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False) + self.check_batchnorm_fc_convergence(False, use_cuda=True) + self.check_batchnorm_fc_convergence(False, use_cuda=False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(True, use_cuda=True) + self.check_batchnorm_fc_convergence(True, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index a3fa140cbb7994a36d2cbee26d598165f1f771d2..066299e6c6f7f6c159cb0886e86d3404b027b698 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -15,6 +15,7 @@ import paddle.fluid as fluid from parallel_executor_test_base import TestParallelExecutorBase import unittest +import os def squeeze_excitation(input, num_channels, reduction_ratio): @@ -130,22 +131,30 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, balance_parameter_opt_between_cards): + def check_resnet_convergence(self, + balance_parameter_opt_between_cards, + use_cuda=True, + iter=20): + os.environ['CPU_NUM'] = str(4) + import functools batch_size = 2 self.check_network_convergence( functools.partial( SE_ResNeXt50Small, batch_size=batch_size), - iter=20, + iter=iter, batch_size=batch_size, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_resnet(self): - self.check_resnet_convergence(False) + self.check_resnet_convergence(False, use_cuda=True) + self.check_resnet_convergence(False, use_cuda=False, iter=5) def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True) + self.check_resnet_convergence(True, use_cuda=True) + self.check_resnet_convergence(True, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 93a5f767867d68110cf7b8f441cc740ecd843cf9..252793944462244539084a288e5259f216359650 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -15,6 +15,7 @@ import paddle.fluid as fluid import numpy as np import unittest +import os def simple_fc_net(): @@ -35,7 +36,8 @@ def simple_fc_net(): class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def check_network_convergence(self, build_strategy=None): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -49,29 +51,28 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): image = np.random.normal(size=(batch_size, 784)).astype('float32') label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) feed_dict = {'image': image, 'label': label} train_exe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, loss_name=loss.name, main_program=main, build_strategy=build_strategy) test_exe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, main_program=test_program, share_vars_from=train_exe, build_strategy=build_strategy) for i in xrange(5): test_loss, = test_exe.run([loss.name], feed=feed_dict) - test_loss = np.array(test_loss) train_loss, = train_exe.run([loss.name], feed=feed_dict) - train_loss = np.array(train_loss) + self.assertTrue( np.allclose( train_loss, test_loss, atol=1e-8), @@ -81,12 +82,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): def test_parallel_testing(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence(build_strategy) + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) + self.check_network_convergence( + use_cuda=False, build_strategy=build_strategy) def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence(build_strategy) + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) + self.check_network_convergence( + use_cuda=False, build_strategy=build_strategy) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index c81df66d987f3d3856af0e19fc935df7de2edacc..b6215fddb11bb6b3a76b5a6395e7254d21971c13 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -19,6 +19,7 @@ from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle import paddle.dataset.wmt16 as wmt16 +import os WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" @@ -149,6 +150,7 @@ def transformer(use_feed): class TestTransformer(TestParallelExecutorBase): @classmethod def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) reader = paddle.batch( wmt16.train(ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size), @@ -167,7 +169,8 @@ class TestTransformer(TestParallelExecutorBase): @unittest.skip("transformer is buggy in multi gpu") def test_main(self): - self.check_network_convergence(transformer) + self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence(transformer, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py index c75080fbb96d472810e5d6a1d02a77c456006f66..e01af42a58b86042fd0282928d1a78d9c3239fe3 100644 --- a/python/paddle/fluid/tests/unittests/test_print_op.py +++ b/python/paddle/fluid/tests/unittests/test_print_op.py @@ -28,7 +28,7 @@ class TestPrintOpCPU(unittest.TestCase): self.x_tensor = core.LoDTensor() tensor_np = np.random.random(size=(2, 3)).astype('float32') self.x_tensor.set(tensor_np, self.place) - self.x_tensor.set_lod([[0, 1, 1]]) + self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) def build_network(self, only_forward, **kargs): x = layers.data('x', shape=[3], dtype='float32', lod_level=1) @@ -62,7 +62,7 @@ class TestPrintOpGPU(TestPrintOpCPU): self.x_tensor = core.LoDTensor() tensor_np = np.random.random(size=(2, 3)).astype('float32') self.x_tensor.set(tensor_np, self.place) - self.x_tensor.set_lod([[0, 1, 1]]) + self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index 76d0d2f2fe80e409dc1b7fa858d43fbc6ad960ef..a70321bd800bf25eeb9e5d197ea7e08626b9aede 100644 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -70,11 +70,10 @@ class TestReorderLoDTensor(unittest.TestCase): lod_level_i = numpy.random.randint( low=1, high=5, - size=self.num_seq if i == 0 else lod_level_i[-1]) - lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + size=self.num_seq if i == 0 else sum(lod_level_i)).tolist() data_lod.append(lod_level_i) data_value = numpy.random.random( - size=[data_lod[-1][-1] if data_lod else self.num_seq + size=[sum(data_lod[-1]) if data_lod else self.num_seq ] + data_shape).astype('float32') self.data[data_name] = (data_value, data_lod) @@ -84,29 +83,36 @@ class TestReorderLoDTensor(unittest.TestCase): tensor = fluid.Tensor() tensor.set(self.data[desc[0]][0], place) if self.data[desc[0]][1]: - tensor.set_lod(self.data[desc[0]][1]) + tensor.set_recursive_sequence_lengths(self.data[desc[0]][1]) self.inputs[desc[0]] = tensor def reorder(self): - level = 0 + def convert_to_offset(lod): + offset_lod = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset_lod[i].append(offset_lod[i][-1] + seq_len) + return offset_lod + level = 0 # compute the rank_table according to ref_lod ref_lod = self.data[self.data_desc[1][0]][1][level] rank_table = [] # list of (index, length) - for i in range(len(ref_lod) - 1): - rank_table.append((i, ref_lod[i + 1] - ref_lod[i])) + for i in range(len(ref_lod)): + rank_table.append((i, ref_lod[i])) rank_table = sorted(rank_table, lambda x, y: y[1] - x[1]) # compute the input sequence info according to input_lod input_value, input_lod = self.data[self.data_desc[0][0]] + offset_lod = convert_to_offset(input_lod) input_table = [] # list of (offset, length, sub_lod) - if input_lod: - for i in range(len(input_lod[level]) - 1): + if offset_lod: + for i in range(len(offset_lod[level]) - 1): start_idx = i end_idx = i + 1 sub_lod = [] - for lod_level_i in input_lod[level:]: + for lod_level_i in offset_lod[level:]: sub_lod_i = [] for idx in range(start_idx, end_idx): sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[ @@ -132,10 +138,9 @@ class TestReorderLoDTensor(unittest.TestCase): input_seq_sub_lod = input_table[index][2] if len(output_lod) == 0: - output_lod = [[0] for i in input_seq_sub_lod] - for i, sub_lod_i in enumerate(input_seq_sub_lod): - for idx_sub in sub_lod_i: - output_lod[i].append(output_lod[i][-1] + idx_sub) + output_lod = [[] for i in input_seq_sub_lod] + for i, level in enumerate(input_seq_sub_lod): + output_lod[i].extend(level) return output_value, output_lod def test_reorder_lod_tensor(self): @@ -148,7 +153,8 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_output), expect_output, atol=0.001)) - self.assertEqual(expect_output_lod, actual_output.lod()) + self.assertEqual(expect_output_lod, + actual_output.recursive_sequence_lengths()) # check gradient expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) expect_grad_lod = self.data[self.data_desc[0][0]][1] @@ -156,7 +162,8 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_grad), expect_grad, atol=0.001)) - self.assertEqual(expect_grad_lod, actual_grad.lod()) + self.assertEqual(expect_grad_lod, + actual_grad.recursive_sequence_lengths()) def test_reorder_tensor(self): self.data_desc[0][-1] = 0 # input is tensor @@ -168,7 +175,8 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_output), expect_output, atol=0.001)) - self.assertEqual(expect_output_lod, actual_output.lod()) + self.assertEqual(expect_output_lod, + actual_output.recursive_sequence_lengths()) # check gradient expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) expect_grad_lod = self.data[self.data_desc[0][0]][1] @@ -176,14 +184,14 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_grad), expect_grad, atol=0.001)) - self.assertEqual(expect_grad_lod, actual_grad.lod()) + self.assertEqual(expect_grad_lod, + actual_grad.recursive_sequence_lengths()) # compare outputs between LodTensors with explicit and implicit lod # use the same data but set the input lod explicitly - input_lod = [[ - i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1) - ]] - self.inputs[self.data_desc[0][0]].set_lod(input_lod) + input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])] + self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths( + input_lod) # preserve the output of LodTensor with implicit lod to compare expect_output = [ numpy.array(actual_output) for actual_output in self.actual_outputs diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f845575a02869f08299d76b5600074598ca27f6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestReverseOp(OpTest): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [0] + + def setUp(self): + self.initTestCase() + self.op_type = "reverse" + self.inputs = {"X": self.x} + self.attrs = {'axis': self.axis} + out = self.x + for a in self.axis: + out = np.flip(out, axis=a) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestCase0(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [1] + + +class TestCase1(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [0, 1] + + +class TestCase2(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4, 5)).astype('float32') + self.axis = [0, 2] + + +class TestCase3(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4, 5)).astype('float32') + self.axis = [1, 2] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py index 3d754aff3a73e7168e2123483b26e5e3a3585a4e..df5684ab173a4889dd7b693f9246bafd12e0345f 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py @@ -107,7 +107,7 @@ class TestROIPoolOp(OpTest): rois = [] self.rois_lod = [[]] for bno in range(self.batch_size): - self.rois_lod[0].append(len(rois)) + self.rois_lod[0].append(bno + 1) for i in range(bno + 1): x1 = np.random.random_integers( 0, self.width / self.spatial_scale - self.pooled_width) @@ -121,7 +121,6 @@ class TestROIPoolOp(OpTest): roi = [bno, x1, y1, x2, y2] rois.append(roi) - self.rois_lod[0].append(len(rois)) self.rois_num = len(rois) self.rois = np.array(rois).astype("int64") diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py index 30f1efbcbcb11332c85c9d5489f22c17b06c2b36..07dcd108689ae6069e30fe22029258d192215549 100644 --- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py @@ -19,8 +19,10 @@ from op_test import OpTest def row_conv_forward(x, lod, wt): out = np.zeros_like(x) - seq_info = lod[0] - num_sequences = len(seq_info) - 1 + num_sequences = len(lod[0]) + seq_info = [0] + for seq_len in lod[0]: + seq_info.append(seq_info[-1] + seq_len) context_length = wt.shape[0] for i in range(num_sequences): # loop over number of sequences @@ -32,7 +34,6 @@ def row_conv_forward(x, lod, wt): cur_timesteps = end - start for j in range(cur_timesteps): # loop over different timesteps for k in range(context_length): - if j + k >= cur_timesteps: continue curoutput[j, :] += curinput[j + k, :] * wt[k, :] @@ -44,8 +45,8 @@ class TestRowConvOp1(OpTest): def setUp(self): self.op_type = "row_conv" - lod = [[0, 2, 5, 7]] - T = lod[0][-1] + lod = [[2, 3, 2]] + T = sum(lod[0]) D = 16 context_length = 2 @@ -75,8 +76,8 @@ class TestRowConvOp2(OpTest): def setUp(self): self.op_type = "row_conv" - lod = [[0, 20, 50, 100]] - T = lod[0][-1] + lod = [[20, 30, 50]] + T = sum(lod[0]) D = 35 context_length = 35 diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py index 10592d127fafdf202c65fcfa91b5c464cc60e96c..11ffa761a690eb1f9f6dc50c45128a99301741db 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py @@ -18,14 +18,19 @@ import sys from op_test import OpTest -def to_abs_lod(lod): - if len(lod) == 0 or len(lod) == 1: - return lod +def to_abs_offset_lod(lod): + offset_lod = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset_lod[i].append(offset_lod[i][-1] + seq_len) + + if len(offset_lod) == 0 or len(offset_lod) == 1: + return offset_lod import copy - new_lod = copy.deepcopy(lod) - for idx, val in enumerate(lod[0]): - new_lod[0][idx] = lod[1][val] - return new_lod + new_offset_lod = copy.deepcopy(offset_lod) + for idx, val in enumerate(offset_lod[0]): + new_offset_lod[0][idx] = offset_lod[1][val] + return new_offset_lod def seq_concat(inputs, level): @@ -35,11 +40,11 @@ def seq_concat(inputs, level): x1 = inputs['X'][1][1][0] level_idx = len(lod0) - level - 1 outs = [] - for i in range(len(lod0[level_idx]) - 1): - sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][ - i + 1], :] - sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][ - i + 1], :] + for i in range(len(lod0[level_idx])): + sub_x0 = x0[to_abs_offset_lod(lod0)[level_idx][i]:to_abs_offset_lod( + lod0)[level_idx][i + 1], :] + sub_x1 = x1[to_abs_offset_lod(lod1)[level_idx][i]:to_abs_offset_lod( + lod1)[level_idx][i + 1], :] outs.append(np.concatenate((sub_x0, sub_x1), axis=0)) return np.concatenate(outs, axis=0) @@ -48,9 +53,9 @@ class TestSeqConcatOp(OpTest): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod0 = [[2, 2], [1, 1, 1, 1]] x1 = np.random.random((4, 8, 3)).astype('float32') - lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod1 = [[2, 2], [1, 1, 1, 1]] axis = 1 level = 1 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} @@ -72,14 +77,14 @@ class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod0 = [[2, 2], [1, 1, 1, 1]] x1 = np.random.random((7, 6, 3)).astype('float32') - lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]] + lod1 = [[2, 2], [1, 2, 2, 2]] axis = 0 level = 0 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]] + out_lod = [[2, 2], [2, 3, 3, 3]] self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} @@ -87,14 +92,14 @@ class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod0 = [[2, 2], [1, 1, 1, 1]] x1 = np.random.random((7, 6, 3)).astype('float32') - lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]] + lod1 = [[3, 1], [1, 2, 2, 2]] axis = 0 level = 1 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]] + out_lod = [[5, 3], [1, 1, 1, 2, 2, 1, 1, 2]] self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} @@ -102,14 +107,14 @@ class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 3, 4)).astype('float32') - lod0 = [[0, 1, 2, 3, 4]] + lod0 = [[1, 1, 1, 1]] x1 = np.random.random((7, 3, 4)).astype('float32') - lod1 = [[0, 1, 3, 5, 7]] + lod1 = [[1, 2, 2, 2]] axis = 0 level = 0 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - out_lod = [[0, 2, 5, 8, 11]] + out_lod = [[2, 3, 3, 3]] self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py index 51dbf1f61834ff0093d76ed546be27a585697d40..9701d9adef1fd272f2520f66607acded6a8c25c6 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_conv.py +++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py @@ -75,35 +75,38 @@ class TestSeqProject(OpTest): pading_data = self.pad_data out = np.zeros((self.input_size[0], self.context_length * self.input_size[1])).astype('float32') - lod = lod[0] + offset = [0] + for seq_len in lod[0]: + offset.append(offset[-1] + seq_len) begin_pad = np.max([0, -self.context_start]) - for i in range(len(lod) - 1): + for i in range(len(offset) - 1): for j in range(self.context_length): - in_begin = lod[i] + self.context_start + j - in_end = lod[i + 1] + self.context_start + j - out_begin = lod[i] - out_end = lod[i + 1] - if in_begin < lod[i]: - pad_size = np.min([lod[i] - in_begin, lod[i + 1] - lod[i]]) + in_begin = offset[i] + self.context_start + j + in_end = offset[i + 1] + self.context_start + j + out_begin = offset[i] + out_end = offset[i + 1] + if in_begin < offset[i]: + pad_size = np.min( + [offset[i] - in_begin, offset[i + 1] - offset[i]]) if self.padding_trainable: sub_w = pading_data[j:j + pad_size, :] - out[lod[i]:lod[i] + pad_size, j * self.input_size[1]:( - j + 1) * self.input_size[1]] = sub_w - out_begin = lod[i] + pad_size - in_begin = lod[i] + out[offset[i]:offset[i] + pad_size, j * self.input_size[ + 1]:(j + 1) * self.input_size[1]] = sub_w + out_begin = offset[i] + pad_size + in_begin = offset[i] - if in_end > lod[i + 1]: + if in_end > offset[i + 1]: pad_size = np.min( - [in_end - lod[i + 1], lod[i + 1] - lod[i]]) + [in_end - offset[i + 1], offset[i + 1] - offset[i]]) if self.padding_trainable: sub_w = pading_data[begin_pad + self.context_start + j - pad_size:begin_pad + self.context_start + j, :] - out[lod[i + 1] - pad_size:lod[i + 1], j * self. + out[offset[i + 1] - pad_size:offset[i + 1], j * self. input_size[1]:(j + 1) * self.input_size[1]] = sub_w - in_end = lod[i + 1] - out_end = lod[i + 1] - pad_size + in_end = offset[i + 1] + out_end = offset[i + 1] - pad_size if in_end <= in_begin: continue @@ -175,7 +178,11 @@ class TestSeqProject(OpTest): self.context_stride = 1 self.input_size = [self.input_row, 23] - self.lod = [[0, 4, 5, 8, self.input_row]] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) self.output_represention = 8 # output feature size @@ -188,7 +195,11 @@ class TestSeqProjectCase1(TestSeqProject): self.context_stride = 1 self.input_size = [self.input_row, 23] - self.lod = [[0, 4, 5, 8, self.input_row]] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) self.output_represention = 8 # output feature size @@ -203,8 +214,12 @@ class TestSeqProjectCase2(TestSeqProject): self.input_size = [self.input_row, 23] idx = range(self.input_size[0]) del idx[0] - self.lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + - [self.input_size[0]]] + offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + + [self.input_size[0]]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) self.output_represention = 8 # output feature size diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index 2e48ef0e880839f6d5b4e515a174f427a35e7e6f..0b3659d7a67956f7546d368346bd102eeedf1d97 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -18,26 +18,34 @@ from op_test import OpTest class TestSeqAvgPool(OpTest): + def convert_to_offset(self, lod): + offset = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset[i].append(offset[i][-1] + seq_len) + return offset + def set_data(self): self.op_type = 'sequence_pool' # one level, batch size is 4 x = np.random.uniform(0.1, 1, [11, 23]).astype('float32') - lod = [[0, 4, 5, 8, 11]] + lod = [[4, 1, 3, 3]] self.inputs = {'X': (x, lod)} + offset = self.convert_to_offset(lod) out = np.zeros((4, 23)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "AVERAGE"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x.mean(axis=0) def setUp(self): - x, lod, out = self.set_data() - self.compute(x, lod, out) + x, offset, out = self.set_data() + self.compute(x, offset, out) def test_check_output(self): self.check_output() @@ -50,10 +58,10 @@ class TestSeqAvgPool(OpTest): class TestSeqSumPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SUM"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x.sum(axis=0) @@ -61,46 +69,47 @@ class TestSeqMaxPool(TestSeqAvgPool): def set_data(self): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 23]).astype('float32') - lod = [[0, 4, 5, 8, 13]] - for i in range(4): - l = lod[0][i + 1] - lod[0][i] - x[lod[0][i] + np.random.randint(l), :] += 2.0 + lod = [[4, 1, 3, 5]] + offset = self.convert_to_offset(lod) + for i in range(len(offset[0]) - 1): + l = offset[0][i + 1] - offset[0][i] + x[offset[0][i] + np.random.randint(l), :] += 2.0 self.inputs = {'X': (x, lod)} out = np.zeros((4, 23)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "MAX"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = np.amax(sub_x, axis=0) class TestSeqSqrtPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SQRT"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - len = lod[0][i + 1] - lod[0][i] - out[i] = sub_x.sum(axis=0) / np.sqrt(len) + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len) class TestSeqLastPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "LAST"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x[-1, :] class TestSeqFirstPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "FIRST"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x[0, :] @@ -109,35 +118,39 @@ class TestSeqAvgPool2D(TestSeqAvgPool): self.op_type = 'sequence_pool' # one level, batch size is 4 x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32') - lod = [[0, 4, 5, 8, 13]] + lod = [[4, 1, 3, 5]] self.inputs = {'X': (x, lod)} + offset = self.convert_to_offset(lod) out = np.zeros((4, 3, 17)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "AVERAGE"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x.mean(axis=0), (3, 17)) class TestSeqSumPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SUM"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x.sum(axis=0), (3, 17)) class TestSeqSqrtPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SQRT"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) - len = lod[0][i + 1] - lod[0][i] - out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17)) def test_check_grad(self): # Remove MaxIndex after check_grad is refined. @@ -150,36 +163,40 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): def set_data(self): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32') - lod = [[0, 4, 5, 8, 13]] + lod = [[4, 1, 3, 5]] self.inputs = {'X': (x, lod)} - for i in range(4): - l = lod[0][i + 1] - lod[0][i] - x[lod[0][i] + np.random.randint(l), :] += 1.0 + offset = self.convert_to_offset(lod) + for i in range(len(offset[0]) - 1): + l = offset[0][i + 1] - offset[0][i] + x[offset[0][i] + np.random.randint(l), :] += 1.0 out = np.zeros((4, 3, 11)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "MAX"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 11)) out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) class TestSeqLastPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "LAST"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x[-1, :], (3, 17)) class TestSeqFirstPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "FIRST"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x[0, :], (3, 17)) diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py index ebab77e8041d5ff1bd845fb121e5901116fd0254..8f0765277ae85af2b17ad96d4fd0c1148c393ff0 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py @@ -18,15 +18,17 @@ from op_test import OpTest def sequence_erase(in_seq, lod0, tokens): - new_lod0 = [0] + new_lod0 = [] out_seq = [] - for i in range(0, len(lod0) - 1): + offset = 0 + for i in range(0, len(lod0)): num_out = 0 - for dat in in_seq[lod0[i]:lod0[i + 1]]: + for dat in in_seq[offset:(offset + lod0[i])]: if dat not in tokens: out_seq.append(dat) num_out += 1 - new_lod0.append(new_lod0[-1] + num_out) + offset += lod0[i] + new_lod0.append(num_out) return np.array(out_seq).astype("int32"), new_lod0 @@ -34,7 +36,7 @@ class TestSequenceEraseOpInt32(OpTest): def setUp(self): self.op_type = "sequence_erase" in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") - lod = [[0, 9, 13, 24, 30]] + lod = [[9, 4, 11, 6]] tokens = [2, 3, 5] out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) self.attrs = {'tokens': tokens} @@ -49,7 +51,7 @@ class TestSequenceEraseOpInt64(OpTest): def setUp(self): self.op_type = "sequence_erase" in_seq = np.random.randint(0, 10, (30, 1)).astype("int64") - lod = [[0, 9, 13, 24, 30]] + lod = [[9, 4, 11, 6]] tokens = [2, 3, 5] out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) self.attrs = {'tokens': tokens} @@ -64,7 +66,7 @@ class TestSequenceEraseOpEmpty(OpTest): def setUp(self): self.op_type = "sequence_erase" in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") - lod = [[0, 9, 13, 24, 30]] + lod = [[9, 4, 11, 6]] tokens = [] out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) self.attrs = {'tokens': tokens} diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index 4c8ec1426c6e103498af544ea5928ec630707d46..0bbd31814efdff6050733f6876ef64e3fcaaaf76 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -21,7 +21,7 @@ class TestSequenceExpand(OpTest): def set_data(self): x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') - y_lod = [[0, 1, 4, 8]] + y_lod = [[1, 3, 4]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} def compute(self): @@ -37,23 +37,27 @@ class TestSequenceExpand(OpTest): out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype) if x_lod is None: - x_idx = [i for i in xrange(x_data.shape[0] + 1)] + # x_idx = [i for i in xrange(x_data.shape[0] + 1)] + x_idx = [1] * x_data.shape[0] else: x_idx = x_lod[0] - out_lod = [[0]] + out_lod = [[]] + + offset = 0 + for i in xrange(len(y_lod[ref_level])): + repeat_num = y_lod[ref_level][i] + x_len = x_idx[i] - for i in xrange(1, len(y_lod[ref_level])): - repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1] - x_len = x_idx[i] - x_idx[i - 1] if repeat_num > 0: - x_sub = x_data[x_idx[i - 1]:x_idx[i], :] + x_sub = x_data[offset:(offset + x_len), :] stacked_x_sub = x_sub for r in range(repeat_num - 1): stacked_x_sub = np.vstack((stacked_x_sub, x_sub)) out = np.vstack((out, stacked_x_sub)) if x_lod is not None: for j in xrange(repeat_num): - out_lod[0].append(out_lod[0][-1] + x_len) + out_lod[0].append(x_len) + offset += x_len if x_lod is None: self.outputs = {'Out': out} @@ -75,9 +79,9 @@ class TestSequenceExpand(OpTest): class TestSequenceExpandCase1(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') - x_lod = [[0, 2, 5]] + x_lod = [[2, 3]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') - y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] + y_lod = [[2, 3], [2, 2, 3, 3, 3]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} self.attrs = {'ref_level': 0} @@ -85,9 +89,9 @@ class TestSequenceExpandCase1(TestSequenceExpand): class TestSequenceExpandCase2(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') - x_lod = [[0, 1]] + x_lod = [[1]] y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') - y_lod = [[0, 2], [0, 2]] + y_lod = [[2], [1, 1]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} self.attrs = {'ref_level': 0} @@ -95,9 +99,9 @@ class TestSequenceExpandCase2(TestSequenceExpand): class TestSequenceExpandCase3(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - x_lod = [[0, 1, 2, 3, 4]] - y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') - y_lod = [[0, 2, 4, 4, 6]] + x_lod = [[1, 1, 1, 1]] + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + y_lod = [[2, 2, 2, 2]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} @@ -105,9 +109,9 @@ class TestSequenceExpandCase4(TestSequenceExpand): def set_data(self): data = np.random.uniform(0.1, 1, [5 * 2, 1]) x_data = np.array(data).reshape([5, 2]).astype('float32') - x_lod = [[0, 2, 5]] - y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') - y_lod = [[0, 1, 3], [0, 1, 3]] + x_lod = [[2, 3]] + y_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + y_lod = [[2], [2, 3]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py index efeab560392d8c03b1bb5db83f59c12d4fef64b0..68f2e5eba35ed318281d14e397dc6d363bcb4079 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py @@ -22,7 +22,7 @@ class TestSequenceReshape(OpTest): def setUp(self): self.op_type = 'sequence_reshape' dimension = 12 - x_lod = [[0, 4, 5, 8, 11]] + x_lod = [[4, 1, 3, 3]] x = np.random.uniform(0.1, 1, [11, 24]).astype('float32') self.inputs = {'X': (x, x_lod)} @@ -34,13 +34,13 @@ class TestSequenceReshape(OpTest): def compute_output(self, x, x_lod, dimension): x_width = x.shape[1] - out_lod = [[0]] - for i in xrange(len(x_lod[0]) - 1): - seq_len = x_lod[0][i + 1] - x_lod[0][i] + out_lod = [[]] + for i in xrange(len(x_lod[0])): + seq_len = x_lod[0][i] offset = (seq_len * x_width) / dimension assert int(offset) * dimension == seq_len * x_width - out_lod[0].append(out_lod[0][-1] + int(offset)) - out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32') + out_lod[0].append(int(offset)) + out = np.zeros(shape=(sum(out_lod[0]), dimension)).astype('float32') out.ravel()[:] = x.ravel()[:] return out, out_lod @@ -55,7 +55,7 @@ class TestSequenceReshape_reduce(TestSequenceReshape): def setUp(self): self.op_type = 'sequence_reshape' dimension = 24 - x_lod = [[0, 4, 6, 8, 12]] + x_lod = [[4, 2, 2, 4]] x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') self.inputs = {'X': (x, x_lod)} @@ -70,7 +70,7 @@ class TestSequenceReshape_same(TestSequenceReshape): def setUp(self): self.op_type = 'sequence_reshape' dimension = 12 - x_lod = [[0, 4, 6, 8, 12]] + x_lod = [[4, 2, 2, 4]] x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') self.inputs = {'X': (x, x_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py index 660b4a171d09ddfc0e78b650a467db6b576c7ee3..313e485d1e3080f2c59c68256cbc5c81aa6558cd 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py @@ -29,20 +29,20 @@ class TestSequenceSliceOp(OpTest): self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length} outs = [] #np.zeros((100, 3, 2)).astype('float32') - out_lod = [[0]] - out_lod_offset = 0 + out_lod = [[]] + lod_offset = 0 for i in range(len(offset)): - sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] + + sub_x = x[lod_offset + offset[i, 0]:lod_offset + offset[i, 0] + length[i, 0], :] - out_lod_offset = out_lod_offset + len(sub_x) outs.append(sub_x) - out_lod[0].append(out_lod_offset) + out_lod[0].append(len(sub_x)) + lod_offset += lod[0][i] outs = np.concatenate(outs, axis=0) self.outputs = {'Out': (outs, out_lod)} def init_test_case(self): self.x_dim = (100, 3, 2) - self.x_lod = [[0, 20, 40, 60, 80, 100]] + self.x_lod = [[20, 20, 20, 20, 20]] self.offset = [[1], [2], [3], [4], [5]] self.length = [[10], [8], [6], [4], [2]] diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py index d6dc99bb3106feee33daa52bffb386f07cc16de5..e91a69a0f8039651225039beb2a42e8dffeb62d3 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py @@ -26,15 +26,16 @@ class TestSequenceSoftmaxOp(OpTest): self.init_op_type() x = np.random.uniform(0.1, 1, (11, 1)).astype("float32") - lod = [[0, 4, 5, 8, 11]] + lod = [[4, 1, 3, 3]] out = np.zeros((11, 1)).astype("float32") - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i]) + offset = 0 + for i in range(len(lod[0])): + sub_x = x[offset:offset + lod[0][i], :] + sub_x = sub_x.reshape(1, lod[0][i]) sub_out = stable_softmax(sub_x) - out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape( - lod[0][i + 1] - lod[0][i], 1) + out[offset:offset + lod[0][i], :] = sub_out.reshape(lod[0][i], 1) + offset += lod[0][i] self.inputs = {"X": (x, lod)} self.outputs = {"Out": out} diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index 1d93230e7b74c5b6c00bbe125e3ae2d3a649b4b9..b779f0fb014bbba62927754ea6f36828a32e6c0a 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -54,12 +54,12 @@ class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase): def test_refer_lod(self): cpu = core.CPUPlace() x_tensor = core.LoDTensor() - x_tensor.set_lod([[0, 2, 5, 6]]) + x_tensor.set_recursive_sequence_lengths([[2, 3, 1]]) tensor_np = np.random.random(size=(6, 100)).astype('float32') x_tensor.set(tensor_np, cpu) rank_table_tensor = core.LoDTensor() - rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]]) rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), cpu) @@ -83,7 +83,7 @@ class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase): x_tensor.set(tensor_np, cpu) rank_table_tensor = core.LoDTensor() - rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]]) rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), cpu) diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py index 5ae2844e295194f95701e1cdccd43bf919bf964f..f4aa7426bc315be501348a64e2f15caed6dc8810 100644 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -59,9 +59,9 @@ class TestSimpleDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "send_vars", "send_barrier", "recv", "recv", "fetch_barrier" + "send", "send_barrier", "recv", "recv", "fetch_barrier" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops def _transpiler_instance(self): diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1a48bce3bb7c74551a365fd471f6869b128babac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -0,0 +1,62 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSliceOp(OpTest): + def setUp(self): + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output() + + +class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + +class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 02cc7da84918041c33bf5c8def46025bc87a2b9e..0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9 100644 --- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -56,7 +56,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): def test_split_and_merge_lod_tensor_level_0(self): tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) mask_np = np.array([0, 1, 0]).astype('bool') mask_np = np.expand_dims(mask_np, axis=1) @@ -68,15 +68,15 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1) expect_true = core.LoDTensor() expect_true.set(expect_true_tensor, self.place()) - expect_true.set_lod([[0, 6]]) + expect_true.set_recursive_sequence_lengths([[6]]) expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32') expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1) - expect_false_lod = [[0, 3, 4]] + expect_false_lod = [[3, 1]] expect_false = core.LoDTensor() expect_false.set(expect_false_tensor, self.place()) - expect_false.set_lod(expect_false_lod) + expect_false.set_recursive_sequence_lengths(expect_false_lod) self.main( tensor=tensor, @@ -126,7 +126,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): def check_tensor_same(self, actual, expect): self.assertTrue(np.allclose(np.array(actual), np.array(expect))) - self.assertEqual(actual.lod(), expect.lod()) + self.assertEqual(actual.recursive_sequence_lengths(), + expect.recursive_sequence_lengths()) class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): @@ -151,7 +152,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) mask_np = np.array([0, 1, 0]).astype('bool') mask_np = np.expand_dims(mask_np, axis=1) diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7956897d68a3fb49d62ba696d0b6400b4f909989 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_sum_op import TestSumOp + + +class TestMKLDNN(TestSumOp): + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 2faf5b10647a1fa1d44e4847f017db177ee8808a..1d90414e137a70e6265042e24e106fe565802778 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -20,12 +20,15 @@ from op_test import OpTest class TestSumOp(OpTest): def setUp(self): self.op_type = "sum" + self.use_mkldnn = False + self.init_kernel_type() x0 = np.random.random((3, 4)).astype('float32') x1 = np.random.random((3, 4)).astype('float32') x2 = np.random.random((3, 4)).astype('float32') self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} + self.attrs = {'use_mkldnn': self.use_mkldnn} def test_check_output(self): self.check_output() @@ -33,6 +36,9 @@ class TestSumOp(OpTest): def test_check_grad(self): self.check_grad(['x0'], 'Out') + def init_kernel_type(self): + pass + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py index ccb41e56c5555b8c79674449c9139ada0bc47aac..bd208897520122b6a5dcf71da325b1b9dba632f6 100644 --- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py @@ -22,22 +22,23 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod): if len(gt_lod) != len(neg_lod): raise AssertionError("The input arguments are illegal.") - batch_size = len(gt_lod) - 1 + batch_size = len(gt_lod) match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32') - neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32') + neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32') + offset = 0 for n in range(batch_size): - gt_num = gt_lod[n + 1] - gt_lod[n] + gt_num = gt_lod[n] ids = random.sample([i for i in range(num_prior)], gt_num) match_indices[n, ids] = [i for i in range(gt_num)] ret_ids = set([i for i in range(num_prior)]) - set(ids) - s = neg_lod[n] - e = neg_lod[n + 1] - l = e - s + l = neg_lod[n] neg_ids = random.sample(ret_ids, l) - neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1) + neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype( + 'int32').reshape(l, 1) + offset += neg_lod[n] return match_indices, neg_indices @@ -56,24 +57,28 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod, # init weight for target label trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') + gt_offset = 0 + neg_offset = 0 for i in range(batch_size): cur_indices = match_indices[i] col_ids = np.where(cur_indices > -1) col_val = cur_indices[col_ids] - gt_start = gt_lod[i] # target bbox - for v, c in zip(col_val + gt_start, col_ids[0].tolist()): + for v, c in zip(col_val + gt_offset, col_ids[0].tolist()): trg_box[i][c][:] = encoded_box[v][c][:] # weight for target bbox trg_box_wt[i][col_ids] = 1.0 - trg_label[i][col_ids] = gt_label[col_val + gt_start] + trg_label[i][col_ids] = gt_label[col_val + gt_offset] trg_label_wt[i][col_ids] = 1.0 # set target label weight to 1.0 for the negative samples if neg_indices is not None: - neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]] + neg_ids = neg_indices[neg_offset:neg_offset + neg_lod[i]] trg_label_wt[i][neg_ids] = 1.0 + # update offset + gt_offset += gt_lod[i] + neg_offset += neg_lod[i] return trg_box, trg_box_wt, trg_label, trg_label_wt @@ -83,11 +88,11 @@ class TestTargetAssginFloatType(OpTest): self.op_type = "target_assign" num_prior = 120 num_class = 21 - gt_lod = [0, 5, 11, 23] - neg_lod = [0, 4, 7, 13] + gt_lod = [5, 6, 12] + neg_lod = [4, 3, 6] mismatch_value = 0 - batch_size = len(gt_lod) - 1 - num_gt = gt_lod[-1] + batch_size = len(gt_lod) + num_gt = sum(gt_lod) encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') gt_label = np.random.randint( @@ -121,11 +126,11 @@ class TestTargetAssginIntType(OpTest): self.op_type = "target_assign" num_prior = 120 num_class = 21 - gt_lod = [0, 5, 11, 23] - neg_lod = [0, 4, 7, 13] + gt_lod = [5, 6, 12] + neg_lod = [4, 3, 6] mismatch_value = 0 - batch_size = len(gt_lod) - 1 - num_gt = gt_lod[-1] + batch_size = len(gt_lod) + num_gt = sum(gt_lod) encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') gt_label = np.random.randint( diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index 379081c3287ce81dbf2bd7307cb5eac2620b13db..f17edd3025b17549892bbd47935a1d2452cefac3 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -69,15 +69,14 @@ class TestTensor(unittest.TestCase): array[0, 0, 0] = 3 array[3, 3, 5] = 10 lod_tensor.set(array, place) - lod_tensor.set_lod([[0, 2, 4]]) + lod_tensor.set_recursive_sequence_lengths([[2, 2]]) lod_v = numpy.array(lod_tensor) self.assertTrue(numpy.alltrue(array == lod_v)) - lod = lod_tensor.lod() - self.assertEqual(0, lod[0][0]) + lod = lod_tensor.recursive_sequence_lengths() + self.assertEqual(2, lod[0][0]) self.assertEqual(2, lod[0][1]) - self.assertEqual(4, lod[0][2]) def test_float_lod_tensor(self): place = core.CPUPlace() @@ -97,21 +96,21 @@ class TestTensor(unittest.TestCase): lod_v = numpy.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) - self.assertEqual(len(lod_tensor.lod()), 0) + self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0) - lod_py = [[0, 2, 5], [0, 2, 4, 5]] - lod_tensor.set_lod(lod_py) - lod = lod_tensor.lod() + lod_py = [[2, 1], [1, 2, 2]] + lod_tensor.set_recursive_sequence_lengths(lod_py) + lod = lod_tensor.recursive_sequence_lengths() self.assertListEqual(lod_py, lod) def test_lod_tensor_init(self): scope = core.Scope() place = core.CPUPlace() - lod_py = [[0, 2, 5], [0, 2, 4, 5]] + lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() lod_tensor.set_dims([5, 2, 3, 4]) - lod_tensor.set_lod(lod_py) + lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor.alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 @@ -121,17 +120,17 @@ class TestTensor(unittest.TestCase): lod_v = numpy.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) - self.assertListEqual(lod_py, lod_tensor.lod()) + self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths()) def test_lod_tensor_gpu_init(self): if not core.is_compiled_with_cuda(): return place = core.CUDAPlace(0) - lod_py = [[0, 2, 5], [0, 2, 4, 5]] + lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() lod_tensor.set_dims([5, 2, 3, 4]) - lod_tensor.set_lod(lod_py) + lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor.alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 @@ -141,7 +140,7 @@ class TestTensor(unittest.TestCase): lod_v = numpy.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) - self.assertListEqual(lod_py, lod_tensor.lod()) + self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths()) def test_empty_tensor(self): place = core.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index ac638f7836f8205f80e31cfd5eb8892b2c7aee08..9f1aaee472f918da7deb8816a0a4654dafe74a30 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -34,8 +34,8 @@ class CTCForward(object): self.level = 0 self.num_classes = softmax.shape[1] - self.batch_size = len(softmax_lod[self.level]) - 1 - assert self.batch_size == len(labels_lod[self.level]) - 1 + self.batch_size = len(softmax_lod[self.level]) + assert self.batch_size == len(labels_lod[self.level]) self.loss = np.zeros([self.batch_size, 1], dtype="float32") self.gradient = np.zeros(self.softmax.shape, dtype="float32") @@ -156,16 +156,20 @@ class CTCForward(object): return -log_prob def forward(self): + softmax_offset = 0 + labels_offset = 0 for i in range(self.batch_size): - softmax_start_i = self.softmax_lod[self.level][i] - softmax_end_i = self.softmax_lod[self.level][i + 1] - labels_start_i = self.labels_lod[self.level][i] - labels_end_i = self.labels_lod[self.level][i + 1] + softmax_start_i = softmax_offset + softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] + labels_start_i = labels_offset + labels_end_i = labels_offset + self.labels_lod[self.level][i] softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] self.loss[i] = self.forward_a_sequence(softmax_a_sequence, labels_a_sequence) + softmax_offset += self.softmax_lod[self.level][i] + labels_offset += self.labels_lod[self.level][i] return self.loss @@ -173,8 +177,8 @@ class TestWarpCTCOp(OpTest): def config(self): self.batch_size = 4 self.num_classes = 8 - self.logits_lod = [[0, 4, 5, 8, 11]] - self.labels_lod = [[0, 3, 4, 8, 12]] + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] self.blank = self.num_classes - 1 self.norm_by_times = False @@ -184,11 +188,13 @@ class TestWarpCTCOp(OpTest): logits = np.random.uniform( 0.1, 1.0, - [self.logits_lod[0][-1], self.num_classes]).astype("float32") + [sum(self.logits_lod[0]), self.num_classes]).astype("float32") softmax = np.apply_along_axis(stable_softmax, 1, logits) # labels should not be blank labels = np.random.randint( - 0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32") + 0, + self.num_classes - 1, [sum(self.labels_lod[0]), 1], + dtype="int32") ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, self.blank, self.norm_by_times) @@ -196,9 +202,8 @@ class TestWarpCTCOp(OpTest): max_sequence_length = 0 for i in range(self.batch_size): - max_sequence_length = max( - max_sequence_length, - self.logits_lod[0][i + 1] - self.logits_lod[0][i]) + max_sequence_length = max(max_sequence_length, + self.logits_lod[0][i]) self.gradient = np.zeros( [max_sequence_length, self.batch_size, self.num_classes], dtype="float32") @@ -222,8 +227,8 @@ class TestWarpCTCOpCase1(TestWarpCTCOp): def config(self): self.batch_size = 4 self.num_classes = CUDA_BLOCK_SIZE + 2 - self.logits_lod = [[0, 4, 5, 8, 11]] - self.labels_lod = [[0, 3, 4, 8, 12]] + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] self.blank = 0 self.norm_by_times = False diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py index 2adf917bc5d3bb35842a817c57a983627b759f22..436f9b9f86fb86270e47c8e30c5c0701787ca0f1 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py +++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -76,11 +76,11 @@ class TestWeightNormalization(unittest.TestCase): lod_level_i = numpy.random.randint( low=1, high=5, - size=self.batch_size if i == 0 else lod_level_i[-1]) - lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + size=self.batch_size + if i == 0 else sum(lod_level_i)).tolist() data_lod.append(lod_level_i) data_value = numpy.random.random( - size=[data_lod[-1][-1] if data_lod else self.batch_size + size=[sum(data_lod[-1]) if data_lod else self.batch_size ] + data_shape).astype('float32') self.data[data_name] = (data_value, data_lod) @@ -90,7 +90,7 @@ class TestWeightNormalization(unittest.TestCase): tensor = fluid.Tensor() tensor.set(self.data[desc[0]][0], place) if self.data[desc[0]][1]: - tensor.set_lod(self.data[desc[0]][1]) + tensor.set_recursive_sequence_lengths(self.data[desc[0]][1]) self.inputs[desc[0]] = tensor def weight_normalize(self): diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py new file mode 100644 index 0000000000000000000000000000000000000000..a995ee10f29a714b674fae4b31070e6ba2ca9953 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -0,0 +1,182 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def as_lodtensor(np_array, lod, place): + tensor = core.LoDTensor() + tensor.set(np_value, place) + if lod is not None: + tensor.set_recursive_sequence_lengths(lod) + return tensor + + +def create_op(scope, op_type, inputs, outputs, attrs): + kwargs = dict() + + op_maker = core.op_proto_and_checker_maker + op_role_attr_name = op_maker.kOpRoleAttrName() + + if op_role_attr_name not in attrs: + attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) + + def __create_var__(name, var_name): + scope.var(var_name).get_tensor() + kwargs[name].append(var_name) + + for in_name, in_dup in Operator.get_op_inputs(op_type): + if in_name in inputs: + kwargs[in_name] = [] + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, _ = item[0], item[1] + __create_var__(in_name, sub_in_name) + else: + __create_var__(in_name, in_name) + + for out_name, out_dup in Operator.get_op_outputs(op_type): + if out_name in outputs: + kwargs[out_name] = [] + if out_dup: + sub_out = outputs[out_name] + for item in sub_out: + sub_out_name, _ = item[0], item[1] + __create_var__(out_name, sub_out_name) + else: + __create_var__(out_name, out_name) + + for attr_name in Operator.get_op_attr_names(op_type): + if attr_name in attrs: + kwargs[attr_name] = attrs[attr_name] + + return Operator(op_type, **kwargs) + + +def set_input(scope, op, inputs, place): + def __set_input__(var_name, var): + if isinstance(var, tuple) or isinstance(var, np.ndarray): + tensor = scope.find_var(var_name).get_tensor() + if isinstance(var, tuple): + tensor.set_recursive_sequence_lengths(var[1]) + var = var[0] + tensor.set_dims(var.shape) + tensor.set(var, place) + elif isinstance(var, float): + scope.find_var(var_name).set_float(var) + elif isinstance(var, int): + scope.find_var(var_name).set_int(var) + + for in_name, in_dup in Operator.get_op_inputs(op.type()): + if in_name in inputs: + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] + __set_input__(sub_in_name, sub_in_val) + else: + __set_input__(in_name, inputs[in_name]) + + +def append_input_output(block, op_proto, np_list, is_input, dtype): + '''Insert VarDesc and generate Python variable instance''' + proto_list = op_proto.inputs if is_input else op_proto.outputs + + def create_var(block, name, np_list, var_proto): + dtype = None + shape = None + lod_level = None + if name not in np_list: + assert var_proto.intermediate, "{} not found".format(name) + else: + np_value = np_list[name] + if isinstance(np_value, tuple): + dtype = np_value[0].dtype + # output shape, lod should be infered from input. + if is_input: + shape = list(np_value[0].shape) + lod_level = len(np_value[1]) + else: + dtype = np_value.dtype + if is_input: + shape = list(np_value.shape) + lod_level = 0 + return block.create_var( + dtype=dtype, shape=shape, lod_level=lod_level, name=name) + + var_dict = {} + for var_proto in proto_list: + var_name = str(var_proto.name) + if is_input: + if (var_name not in np_list) and var_proto.dispensable: + continue + assert (var_name in np_list) or (var_proto.dispensable), \ + "Missing {} as input".format(var_name) + if var_proto.duplicable: + assert isinstance(np_list[var_name], list), \ + "Duplicable {} should be set as list".format(var_name) + var_list = [] + for (name, np_value) in np_list[var_name]: + var_list.append( + create_var(block, name, {name: np_value}, var_proto)) + var_dict[var_name] = var_list + else: + var_dict[var_name] = create_var(block, var_name, np_list, var_proto) + + return var_dict + + +def append_loss_ops(block, output_names): + mean_inputs = map(block.var, output_names) + # for item in mean_inputs: + # print(item) + # print("Item", item.dtype) + + if len(mean_inputs) == 1: + loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) + op = block.append_op( + inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + else: + avg_sum = [] + for cur_loss in mean_inputs: + cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) + op = block.append_op( + inputs={"X": [cur_loss]}, + outputs={"Out": [cur_avg_loss]}, + type="mean") + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + avg_sum.append(cur_avg_loss) + + loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) + op_sum = block.append_op( + inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') + op_sum.desc.infer_var_type(block.desc) + op_sum.desc.infer_shape(block.desc) + + loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) + op_loss = block.append_op( + inputs={"X": loss_sum}, + outputs={"Out": loss}, + type='scale', + attrs={'scale': 1.0 / float(len(avg_sum))}) + op_loss.desc.infer_var_type(block.desc) + op_loss.desc.infer_shape(block.desc) + return loss diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index cdacb419863518cc0606903ed8eb79f0d2bc9e40..f191ef7df5caa04537e69ad9a0e018d161cd59ad 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -27,38 +27,114 @@ import parallel_executor from transpiler import distribute_transpiler __all__ = [ - 'Trainer', - 'BeginEpochEvent', - 'EndEpochEvent', - 'BeginStepEvent', - 'EndStepEvent', + 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', + 'EndStepEvent', 'CheckpointConfig' ] class BeginEpochEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + def __init__(self, epoch_id): self.epoch = epoch_id class EndEpochEvent(object): + """ + The end of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + def __init__(self, epoch_id): self.epoch = epoch_id class BeginStepEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + """ + def __init__(self, epoch_id, step_id): self.epoch = epoch_id self.step = step_id self.fetch_metrics = True + """ + If fetch_metrics is true, the metrics will be fetched at the + EndStepEvent. Default is True. + """ class EndStepEvent(object): + """ + The end of a training step. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + metrics(list): A list of fetched tensor. The order of this list is same + as the :code:`train_func` returns. + """ + def __init__(self, epoch_id, step_id, metrics): self.epoch = epoch_id self.step = step_id self.metrics = metrics +class CheckpointConfig(object): + """ + Parameter object for :code:`fluid.io.save_checkpoint` and + :code:`fluid.Trainer`. Used to configuration how to save checkpoint. + + Args: + checkpoint_dir(str): Directory path to save check point. Default is the + current directory. + + max_num_checkpoints(int): The max number of local check points. + epoch_interval(int): Every number of epoch to save check point. + step_interval(int): Every number of step to save check point. + + Examples: + >>> config = fluid.CheckpointConfig("./checkpoints") + >>> trainer = fluid.Trainer(train_func=train_program, + >>> place=place, + >>> optimizer_func=optimizer_func, + >>> checkpoint_config=config) + >>> trainer.train(...) + """ + + def __init__(self, + checkpoint_dir=None, + max_num_checkpoints=3, + epoch_interval=1, + step_interval=10): + + assert epoch_interval >= 1 + assert step_interval >= 1 + + self.checkpoint_dir = checkpoint_dir \ + if checkpoint_dir is not None else os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.epoch_interval = epoch_interval + self.step_interval = step_interval + self.epoch_id = 0 + self.step_id = 0 + self.load_serial = None + self.pserver_id = None + self.lookup_table_name = None + + def check_and_get_place(place): """ Check the type of place or get the default place @@ -87,11 +163,62 @@ def check_and_get_place(place): class Trainer(object): """ + A trainer wraps MultiGPU/MultiNode training loops and can be used to train a + simple neural network easily. + + This API takes a :code:`train_func`. A :code:`train_func` is a function that + return loss as it first return value. The reset value can be fetched by + EndStepEvent.metrics + + This API also takes a :code:`optimizer_func` that will return an optimizer + instance. + + For example, to train a MLP for MNIST dataset, the sample program is + + >>> import paddle.fluid as fluid + >>> + >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): + >>> hidden = image + >>> for layer_size in layer_sizes: + >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) + >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") + >>> + >>> def train_mnist_mlp(): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> prediction = mlp(img) + >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) + >>> + >>> def optimizer(): + >>> return fluid.optimizer.Adam() + >>> + >>> trainer = Trainer(train_func=train_mnist_mlp, + >>> optimizer_func=optimizer, + >>> place=fluid.CUDAPlace(0), + >>> parallel=True) + >>> + >>> def train_callback(event): + >>> if isinstance(event, fluid.EndStepEvent): + >>> print "Epoch ID", event.epoch, "Step ID",\ + >>> event.step, "AvgLoss", event.metrics[0] + >>> elif isinstance(event, fluid.EndEpochEvent): + >>> trainer.save_params("./model_{0}".format(event.epoch)) + >>> + >>> trainer.train(num_epochs=100, event_handler=train_callback) + + For more example, please see :ref:`api_guide_high_level_api`. + Args: - train_func(callable): A function which will return loss. The loss must be a scalar. + train_func(callable): A function which will return loss. The loss must be + a scalar tensor. optimizer_func(callable): A function that returns an Optimizer object. - place: The device place of this trainer. + place(CUDAPlace|CPUPlace): The device place of this trainer. If + :code:`parallel=True,` all CUDA Places will be used if :code:`place` + is a :code:`CUDAPlace`. + parallel(bool): True if use multiple devices. + checkpoint_config(CheckpointConfig): Configuration about how to save + checkpoints. """ def __init__(self, @@ -99,15 +226,27 @@ class Trainer(object): optimizer_func, param_path=None, place=None, - parallel=False): + parallel=False, + checkpoint_config=None): self.__stop = False self.parallel = parallel + + # config for checkpoint + # only chief worker will save variables + self.trainer_id = 0 + self.checkpoint_cfg = checkpoint_config + if self.checkpoint_cfg: + assert isinstance(self.checkpoint_cfg, CheckpointConfig) + serial = io.get_latest_checkpoint_serial( + self.checkpoint_cfg.checkpoint_dir) + self.checkpoint_cfg.load_serial = serial if serial >= 0 else None + + self.scope = core.Scope() + # 1. we need to generate a framework.Program by calling # program_func. Reference: fluid.program_guard in # test_word2vec.py - self.scope = core.Scope() - self.startup_program = framework.Program() self.train_program = framework.Program() @@ -115,9 +254,9 @@ class Trainer(object): program_func_outs = train_func() self.train_func_outputs = program_func_outs if isinstance( program_func_outs, list) else [program_func_outs] - self.test_program = self.train_program.clone() + self.test_program = self.train_program.clone(for_test=True) - # The fisrt element of program_func_outs is loss. + # The first element of program_func_outs is loss. loss = self.train_func_outputs[0] optimizer = optimizer_func() @@ -137,9 +276,32 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if param_path: + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, + self.startup_program) + + if not self.checkpoint_cfg.pserver_id: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.trainer_id, + self._get_checkpoint_load_args()) + self.checkpoint_cfg.epoch_id = int(epoch_id) + self.checkpoint_cfg.step_id = int(step_id) + else: + if self.checkpoint_cfg.lookup_table_name: + io.load_lookup_table_vars( + exe, self.checkpoint_cfg.checkpoint_dir, + self.startup_program, + self.checkpoint_cfg.pserver_id, + self.checkpoint_cfg.lookup_table_name) + + if param_path and os.path.isdir(param_path): # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) + io.load_persist_vars_without_grad( + exe, dirname=param_path, program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -194,14 +356,21 @@ class Trainer(object): current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port # the unique trainer id, starting from 0, needed by trainer # only - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): t = distribute_transpiler.DistributeTranspiler() t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) + self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": + if self.checkpoint_cfg: + pserver_id = eplist.index(current_endpoint) + self.checkpoint_cfg.pserver_id = pserver_id + if t.has_distributed_lookup_table: + self.checkpoint_cfg.lookup_table_name = t.table_name + self.train_program = t.get_pserver_program(current_endpoint) self.startup_program = t.get_startup_program(current_endpoint, self.train_program) @@ -220,17 +389,18 @@ class Trainer(object): def train(self, num_epochs, event_handler, reader=None, feed_order=None): """ - Train the model. + Start the train loop to train the model. Args: - num_epochs: The number of epoch. An epoch will process all data in reader - event_handler: The event handler. A function with type (ev:Event)->void - reader: - feed_order: Feeding order of reader. None will following the defining + num_epochs(int): The number of epoch. An epoch will process all data in reader + event_handler(callable): The event handler. A function with type (ev:Event)->void + reader(callable): A reader creator object. See also + :ref:`api_guide_python_reader` . + feed_order(list): Feeding order of reader. None will following the defining order in program Returns: - + None """ training_role = os.getenv("PADDLE_TRAINING_ROLE", "") if training_role == "PSERVER": @@ -250,16 +420,24 @@ class Trainer(object): Test the model on given test data Args: - reader: The reader that yields test data. - feed_order: Feeding order of reader. None will following the defining - order in program + reader(callable): The reader that yields test data. + feed_order(list): Feeding order of reader. None will following the + defining order in program """ return self._test_by_executor(reader, feed_order, self.train_func_outputs) def save_params(self, param_path): - # reference: save_persistables in io.py + """ + Save all parameters into :code:`param_path`. + + Args: + param_path(str): The path to save parameters. + + Returns: + None + """ with self._prog_and_scope_guard(): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) @@ -294,11 +472,26 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - for epoch_id in range(num_epochs): + if self.checkpoint_cfg: + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint_cfg.epoch_id + ] + else: + epochs = [epoch_id for epoch_id in range(num_epochs)] + + for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): if self.__stop: + if self.checkpoint_cfg: + self._clean_checkpoint() return + + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ + and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: + continue + begin_event = BeginStepEvent(epoch_id, step_id) event_handler(begin_event) if begin_event.fetch_metrics: @@ -309,8 +502,13 @@ class Trainer(object): ]) else: metrics = exe.run(feed=data, fetch_list=[]) + + if self.checkpoint_cfg: + self._save_checkpoint(epoch_id, step_id) event_handler(EndStepEvent(epoch_id, step_id, metrics)) event_handler(EndEpochEvent(epoch_id)) + if self.checkpoint_cfg: + self._clean_checkpoint() def _test_by_executor(self, reader, feed_order, fetch_list): with executor.scope_guard(self.scope): @@ -349,6 +547,39 @@ class Trainer(object): loss_name=self.train_func_outputs[0].name) return self._get_parallel_executor() + def _clean_checkpoint(self): + assert self.checkpoint_cfg + io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) + + def _get_checkpoint_load_args(self): + """ + epoch_id and step_id are runtime arguments, they are not variables, will load them independently. + """ + return ["epoch_id", "step_id"] + + def _get_checkpoint_save_args(self, epoch_id, step_id): + """ + epoch_id and step_id are runtime arguments, they are not variables, will save them independently. + """ + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + return trainer_args + + def _save_checkpoint(self, epoch_id, step_id): + assert self.checkpoint_cfg + + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ + and step_id % self.checkpoint_cfg.step_interval == 0: + exe = executor.Executor(self.place) + io.save_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + trainer_id=self.trainer_id, + trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), + main_program=self.train_program, + max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) + def build_feed_var_list(program, feed_order): if not isinstance(program, framework.Program): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 27992df462ffd00ddf445538cc508b4232712481..4a3bd3bef2c3b763eee411034a908edd55c4df03 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -12,21 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Transpile the program to distributed data-parallelism programs. -The main_program will be transformed to use a remote parameter server -to do parameter optimization. And the optimization graph will be put -into a parameter server program. - -Use different methods to split trainable variables to different -parameter servers. - Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. -4. append send_op to send splited variables to server and fetch - params(splited blocks or origin param) from server. -5. append concat_op to merge splited blocks to update local weights. +4. append send_op to send splited variables to server and +5. add recv_op to fetch params(splited blocks or origin param) from server. +6. append concat_op to merge splited blocks to update local weights. Steps to transpile pserver: 1. create new program for parameter server. @@ -44,7 +36,7 @@ import numpy as np from ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ - default_startup_program, \ + default_startup_program, Block, \ Variable, Parameter, grad_var_name from details import * @@ -117,128 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192): return blocks -class DistributeTranspiler: - def _has_distributed_lookup_table(self): - # process lookup_table_op - # 1. check all lookup_table_op is distributed - # 2. check all lookup_table_op share the same table. - distributed_lookup_table_ops = [] - # support only one distributed_lookup_table now - self.table_name = None - for op in self.origin_program.global_block().ops: - if op.type == LOOKUP_TABLE_TYPE: - if op.attrs['is_distributed'] is True: - if self.table_name is None: - self.table_name = op.input("W")[0] - if self.table_name != op.input("W")[0]: - raise RuntimeError("all distributed lookup_table_ops" - " should have only one table") - distributed_lookup_table_ops.append(op) - else: - if self.table_name is not None: - assert op.input("W")[0] != self.table_name - - return len(distributed_lookup_table_ops) > 0 - - def _update_dist_lookup_table_vars(self, param_list, grad_list, - params_grads): - # TODO(wuyi): put find a way to put dist lookup table stuff all together. - # update self.table_param_grad and self.trainer_side_table_grad_list - program = self.origin_program - if self.has_distributed_lookup_table: - param_list = [ - param for param in param_list if param.name != self.table_name - ] - grad_list = [ - grad for grad in grad_list - if grad.name != grad_var_name(self.table_name) - ] - self.table_param_grad = [ - param_grad for param_grad in params_grads - if param_grad[0].name == self.table_name - ][0] - table_grad_var = self.table_param_grad[1] - if self.sync_mode: - self.trainer_side_table_grad_list = [ - program.global_block().create_var( - name="%s.trainer_%d.pserver_%d" % - (table_grad_var.name, self.trainer_id, index), - type=table_grad_var.type, - shape=table_grad_var.shape, - dtype=table_grad_var.dtype) - for index in range(len(self.pserver_endpoints)) - ] - else: - self.trainer_side_table_grad_list = [ - program.global_block().create_var( - name="%s.pserver_%d" % (table_grad_var.name, index), - type=table_grad_var.type, - shape=table_grad_var.shape, - dtype=table_grad_var.dtype) - for index in range(len(self.pserver_endpoints)) - ] - - def _init_splited_vars(self, slice_var_up): - # update these mappings for further transpile: - # 1. param_var_mapping: param var name -> [splited params vars] - # 2. grad_var_mapping: grad var name -> [splited grads vars] - # 3. grad_param_mapping: grad.blockx -> param.blockx - # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []} - - param_list = [] - grad_list = [] - param_grad_set = set() - for p, g in self.params_grads: - # skip parameter marked not trainable - if type(p) == Parameter and p.trainable == False: - continue - if p.name not in param_grad_set: - param_list.append(p) - param_grad_set.add(p.name) - if g.name not in param_grad_set: - grad_list.append(g) - param_grad_set.add(g.name) - - self._update_dist_lookup_table_vars(param_list, grad_list, - self.params_grads) - - if slice_var_up: - # when we slice var up into blocks, we will slice the var according to - # pserver services' count. A pserver may have two or more listening ports. - grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) - param_blocks = slice_variable(param_list, - len(self.pserver_endpoints)) - else: - # when we do NOT slice var up into blocks, we will always slice params - # grads into one block. - grad_blocks = slice_variable(grad_list, 1) - param_blocks = slice_variable(param_list, 1) - assert (len(grad_blocks) == len(param_blocks)) - - # origin_varname -> [splited_var] - self.param_var_mapping = self._create_vars_from_blocklist( - self.origin_program, param_blocks) - self.grad_var_mapping = self._create_vars_from_blocklist( - self.origin_program, - grad_blocks, - add_trainer_suffix=self.trainer_num > 1) - self.grad_param_mapping = dict() - for g, p in zip(grad_blocks, param_blocks): - g_name, g_bid, _ = g.split(":") - p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ - self.param_var_mapping[p_name][int(p_bid)] - - # create mapping of endpoint -> split var to create pserver side program - self.param_grad_ep_mapping = dict() - [ - self.param_grad_ep_mapping.update({ - ep: { - "params": [], - "grads": [] - } - }) for ep in self.pserver_endpoints - ] +class DistributeTranspiler(object): + """ + **DistributeTranspiler** + + Convert the fluid program to distributed data-parallelism programs. + + The main_program will be transformed to use a remote parameter server + to do parameter optimization. And the optimization graph will be put + into a parameter server program. + + Examples: + .. code-block:: python + + # Define your model before these codes. + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + trainers = int(os.getenv("PADDLE_TRAINERS")) + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + elif role == "TRAINER": + trainer_program = t.get_trainer_program() + """ def transpile(self, trainer_id, @@ -249,20 +154,20 @@ class DistributeTranspiler: split_method=RoundRobin, sync_mode=True): """ - :param trainer_id: one unique id for each trainer in a job. - :type trainer_id: int - :param program: program to transpile, default is default_main_program - :type program: Program - :param pservers: parameter server endpoints like "m1:6174,m2:6174" - :type pservers: string - :param trainers: total number of workers/trainers in the job - :type trainers: int - :param split_method: A function to determin how to split variables - to different servers equally. - :type split_method: function - :type sync_mode: boolean default True - :param sync_mode: if sync_mode is set True, it means that dist transpiler - will transpile the program into sync_mode pserver and trainer program. + Run the transpiler. + + Args: + trainer_id (int): id for current trainer worker, if you have + n workers, the id may range from 0 ~ n-1 + program (Program|None): program to transpile, + default is fluid.default_main_program(). + pservers (str): comma separated ip:port string for the pserver + list. + trainers (int): number of trainers in the distributed job. + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + sync_mode (bool): Do sync training or not, default is True. """ assert (split_method.__bases__[0] == PSDispatcher) if program is None: @@ -316,7 +221,7 @@ class DistributeTranspiler: program.global_block().insert_op( index=index + 1, - type="send_vars", + type="send", inputs={"X": splited_vars}, outputs={}, attrs={ @@ -389,20 +294,32 @@ class DistributeTranspiler: self._split_table_grad_and_add_send_vars(program, pserver_endpoints) def get_trainer_program(self): + """ + Get transpiled trainer side program. + + Returns: + Program: trainer side program. + """ # remove optimize ops and add a send op to main_program delete_ops(self.origin_program.global_block(), self.optimize_ops) - # FIXME(typhoonzero): serialize once will fix error occurs when clone. self.origin_program.__str__() return self.origin_program def get_pserver_program(self, endpoint): """ - Get pserver side program using the endpoint. - TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. - NOTE: assume blocks of the same variable is not distributed - on the same pserver, only change param/grad varnames for - trainers to fetch. + Get parameter server side program. + + Args: + endpoint (str): current parameter server endpoint. + + Returns: + Program: the program for current parameter server to run. """ + # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. + # NOTE: assume blocks of the same variable is not distributed + # on the same pserver, only change param/grad varnames for + # trainers to fetch. + # step1 pserver_program = Program() # step2: Create vars to receive vars at parameter servers. @@ -465,12 +382,13 @@ class DistributeTranspiler: if self._is_adam_connected_op(op): global_ops.append(op) - def __append_optimize_op__(op, block, grad_to_block_id, merged_var): + def __append_optimize_op__(op, block, grad_to_block_id, merged_var, + lr_ops): if self._is_optimizer_op(op): self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self.origin_program, merged_var) - else: - self._append_pserver_non_opt_ops(block, op, endpoint) + elif op not in lr_ops: + self._append_pserver_non_opt_ops(block, op) def __op_have_grad_input__(op): for varname in op.input_arg_names: @@ -478,19 +396,50 @@ class DistributeTranspiler: return varname return "" + def __clone_lr_op_sub_block__(op, program, lr_block): + if not op.has_attr('sub_block'): + return + + origin_block_desc = op.attr('sub_block') + origin_block = self.origin_program.block(origin_block_desc.id) + assert isinstance(origin_block, Block) + # we put the new sub block to new block to follow the block + # hierarchy of the original blocks + new_sub_block = program.create_block(lr_block.idx) + + # clone vars + for var in origin_block.vars: + new_sub_block.clone_variable(var) + + # clone ops + for origin_op in origin_block.ops: + cloned_op = self._clone_lr_op(program, new_sub_block, origin_op) + # clone sub_block of op + __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) + + # reset the block of op + op.set_attr('sub_block', new_sub_block) + # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() + # record optimize blocks and we can run them on pserver parallel + optimize_blocks = [] if len(lr_ops) > 0: lr_decay_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(lr_decay_block) for _, op in enumerate(lr_ops): - self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint) + cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op) + # append sub blocks to pserver_program in lr_decay_op + __clone_lr_op_sub_block__(cloned_op, pserver_program, + lr_decay_block) # append op to the current block grad_to_block_id = [] pre_block_idx = pserver_program.num_blocks - 1 for idx, opt_op in enumerate(opt_op_on_pserver): per_opt_block = pserver_program.create_block(pre_block_idx) + optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping @@ -503,46 +452,53 @@ class DistributeTranspiler: # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: __append_optimize_op__(op, per_opt_block, grad_to_block_id, - merged_var) + merged_var, lr_ops) # append global ops if global_ops: opt_state_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(opt_state_block) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, - grad_to_block_id, None) + grad_to_block_id, None, lr_ops) # process distributed lookup_table - prefetch_block = None + prefetch_var_name_to_block_id = [] if self.has_distributed_lookup_table: pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) - prefetch_block = self._create_prefetch_block( + prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) + checkpoint_block_id = self._create_checkpoint_save_block( + pserver_program, table_opt_block.idx) # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place if self.has_distributed_lookup_table: - assert prefetch_block is not None + assert len(prefetch_var_name_to_block_id) > 0 else: - assert prefetch_block is None - prefetch_block = pserver_program.global_block() + assert len(prefetch_var_name_to_block_id) == 0 + + attrs = { + "optimize_blocks": optimize_blocks, + "endpoint": endpoint, + "Fanin": self.trainer_num, + "sync_mode": self.sync_mode, + "grad_to_block_id": grad_to_block_id, + } + if len(prefetch_var_name_to_block_id) > 0: + attrs['prefetch_var_name_to_block_id'] \ + = prefetch_var_name_to_block_id + attrs['checkpint_block_id'] = checkpoint_block_id # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", inputs={'X': recv_inputs}, outputs={}, - attrs={ - "OptimizeBlock": pserver_program.block(1), - "endpoint": endpoint, - "Fanin": self.trainer_num, - "PrefetchBlock": prefetch_block, - "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id - }) + attrs=attrs) pserver_program.sync_with_cpp() return pserver_program @@ -552,6 +508,14 @@ class DistributeTranspiler: Get startup program for current parameter server. Modify operator input variables if there are variables that were split to several blocks. + + Args: + endpoint (str): current pserver endpoint. + pserver_program (Program): call get_pserver_program first and + pass the result here. + + Returns: + Program: parameter server side startup program. """ s_prog = Program() orig_s_prog = default_startup_program() @@ -603,12 +567,142 @@ class DistributeTranspiler: # ====================== private transpiler functions ===================== + def _has_distributed_lookup_table(self): + # process lookup_table_op + # 1. check all lookup_table_op is distributed + # 2. check all lookup_table_op share the same table. + distributed_lookup_table_ops = [] + # support only one distributed_lookup_table now + self.table_name = None + for op in self.origin_program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if op.attrs['is_distributed'] is True: + if self.table_name is None: + self.table_name = op.input("W")[0] + if self.table_name != op.input("W")[0]: + raise RuntimeError("all distributed lookup_table_ops" + " should have only one table") + distributed_lookup_table_ops.append(op) + else: + if self.table_name is not None: + assert op.input("W")[0] != self.table_name + + return len(distributed_lookup_table_ops) > 0 + + def _update_dist_lookup_table_vars(self, param_list, grad_list, + params_grads): + # TODO(wuyi): put find a way to put dist lookup table stuff all together. + # update self.table_param_grad and self.trainer_side_table_grad_list + program = self.origin_program + if self.has_distributed_lookup_table: + param_list = [ + param for param in param_list if param.name != self.table_name + ] + grad_list = [ + grad for grad in grad_list + if grad.name != grad_var_name(self.table_name) + ] + self.table_param_grad = [ + param_grad for param_grad in params_grads + if param_grad[0].name == self.table_name + ][0] + table_grad_var = self.table_param_grad[1] + if self.sync_mode: + self.trainer_side_table_grad_list = [ + program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, self.trainer_id, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + else: + self.trainer_side_table_grad_list = [ + program.global_block().create_var( + name="%s.pserver_%d" % (table_grad_var.name, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + return param_list, grad_list + + def _init_splited_vars(self, slice_var_up): + # update these mappings for further transpile: + # 1. param_var_mapping: param var name -> [splited params vars] + # 2. grad_var_mapping: grad var name -> [splited grads vars] + # 3. grad_param_mapping: grad.blockx -> param.blockx + # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []} + + param_list = [] + grad_list = [] + param_grad_set = set() + for p, g in self.params_grads: + # skip parameter marked not trainable + if type(p) == Parameter and p.trainable == False: + continue + if p.name not in param_grad_set: + param_list.append(p) + param_grad_set.add(p.name) + if g.name not in param_grad_set: + grad_list.append(g) + param_grad_set.add(g.name) + + param_list, grad_list = self._update_dist_lookup_table_vars( + param_list, grad_list, self.params_grads) + + if slice_var_up: + # when we slice var up into blocks, we will slice the var according to + # pserver services' count. A pserver may have two or more listening ports. + grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + param_blocks = slice_variable(param_list, + len(self.pserver_endpoints)) + else: + # when we do NOT slice var up into blocks, we will always slice params + # grads into one block. + grad_blocks = slice_variable(grad_list, 1) + param_blocks = slice_variable(param_list, 1) + assert (len(grad_blocks) == len(param_blocks)) + + # origin_varname -> [splited_var] + self.param_var_mapping = self._create_vars_from_blocklist( + self.origin_program, param_blocks) + self.grad_var_mapping = self._create_vars_from_blocklist( + self.origin_program, + grad_blocks, + add_trainer_suffix=self.trainer_num > 1) + self.grad_param_mapping = dict() + for g, p in zip(grad_blocks, param_blocks): + g_name, g_bid, _ = g.split(":") + p_name, p_bid, _ = p.split(":") + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.param_var_mapping[p_name][int(p_bid)] + + # create mapping of endpoint -> split var to create pserver side program + self.param_grad_ep_mapping = dict() + [ + self.param_grad_ep_mapping.update({ + ep: { + "params": [], + "grads": [] + } + }) for ep in self.pserver_endpoints + ] + # transpiler function for dis lookup_table def _replace_lookup_table_op_with_prefetch(self, program, pserver_endpoints): # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op - self.prefetch_input_vars = None - self.prefetch_output_vars = None + # self.all_prefetch_input_vars = + # [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1] + # [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]] + self.all_prefetch_input_vars = [] + + # self.all_prefetch_input_vars = + # [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1] + # [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]] + self.all_prefetch_output_vars = [] continue_search_lookup_table_op = True while continue_search_lookup_table_op: @@ -618,26 +712,27 @@ class DistributeTranspiler: if op.type == LOOKUP_TABLE_TYPE: continue_search_lookup_table_op = True - op_index = list(all_ops).index(op) + lookup_table_op_index = list(all_ops).index(op) ids_name = op.input("Ids") out_name = op.output("Out") - if self.prefetch_input_vars is None: - ids_var = program.global_block().vars[ids_name[0]] - self.prefetch_input_vars = self.create_splited_vars( - source_var=ids_var, - block=program.global_block(), - tag="_prefetch_in_") - if self.prefetch_output_vars is None: - out_var = program.global_block().vars[out_name[0]] - self.prefetch_output_vars = self.create_splited_vars( - source_var=out_var, - block=program.global_block(), - tag="_prefetch_out_") + ids_var = program.global_block().vars[ids_name[0]] + prefetch_input_vars = self.create_splited_vars( + source_var=ids_var, + block=program.global_block(), + tag="_prefetch_in_") + self.all_prefetch_input_vars.append(prefetch_input_vars) + + out_var = program.global_block().vars[out_name[0]] + prefetch_output_vars = self.create_splited_vars( + source_var=out_var, + block=program.global_block(), + tag="_prefetch_out_") + self.all_prefetch_output_vars.append(prefetch_output_vars) # insert split_ids_op program.global_block().insert_op( - index=op_index, + index=lookup_table_op_index, type="split_ids", inputs={ 'Ids': [ @@ -645,14 +740,14 @@ class DistributeTranspiler: for varname in ids_name ] }, - outputs={"Out": self.prefetch_input_vars}) + outputs={"Out": prefetch_input_vars}) # insert prefetch_op program.global_block().insert_op( - index=op_index + 1, + index=lookup_table_op_index + 1, type="prefetch", - inputs={'X': self.prefetch_input_vars}, - outputs={"Out": self.prefetch_output_vars}, + inputs={'X': prefetch_input_vars}, + outputs={"Out": prefetch_output_vars}, attrs={ "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE @@ -660,16 +755,21 @@ class DistributeTranspiler: # insert concat_op program.global_block().insert_op( - index=op_index + 2, - type="concat", - inputs={'X': self.prefetch_output_vars}, + index=lookup_table_op_index + 2, + type="merge_ids", + inputs={ + 'Ids': [ + program.global_block().vars[varname] + for varname in ids_name + ], + 'X': prefetch_output_vars + }, outputs={ "Out": [ program.global_block().vars[varname] for varname in out_name ] - }, - attrs={"axis": 0}) + }) # delete lookup_table_op delete_ops(program.global_block(), [op]) @@ -677,7 +777,7 @@ class DistributeTranspiler: break def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): - # 2. add split_ids_op and send_vars_op to send gradient to pservers + # 2. add split_ids_op and send_op to send gradient to pservers # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -694,11 +794,11 @@ class DistributeTranspiler: outputs={"Out": self.trainer_side_table_grad_list}) program.global_block().insert_op( index=op_index + 2, - type="send_vars", + type="send", inputs={'X': self.trainer_side_table_grad_list}, outputs={}, attrs={ - "sync_send": True, + "sync_mode": True, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) @@ -708,30 +808,34 @@ class DistributeTranspiler: optimize_block): # STEP: create prefetch block table_var = pserver_program.global_block().vars[self.table_name] - prefetch_block = pserver_program.create_block(optimize_block.idx) - trainer_ids = self.prefetch_input_vars[pserver_index] - pserver_ids = pserver_program.global_block().create_var( - name=trainer_ids.name, - type=trainer_ids.type, - shape=trainer_ids.shape, - dtype=trainer_ids.dtype) - trainer_out = self.prefetch_output_vars[pserver_index] - pserver_out = pserver_program.global_block().create_var( - name=trainer_out.name, - type=trainer_out.type, - shape=trainer_out.shape, - dtype=trainer_out.dtype) - prefetch_block.append_op( - type="lookup_sparse_table", - inputs={'Ids': pserver_ids, - "W": table_var}, - outputs={"Out": pserver_out}, - attrs={ - "is_sparse": True, # has no effect on lookup_table op - "is_distributed": True, - "padding_idx": -1 - }) - return prefetch_block + prefetch_var_name_to_block_id = [] + for index in range(len(self.all_prefetch_input_vars)): + prefetch_block = pserver_program.create_block(optimize_block.idx) + trainer_ids = self.all_prefetch_input_vars[index][pserver_index] + pserver_ids = pserver_program.global_block().create_var( + name=trainer_ids.name, + type=trainer_ids.type, + shape=trainer_ids.shape, + dtype=trainer_ids.dtype) + trainer_out = self.all_prefetch_output_vars[index][pserver_index] + pserver_out = pserver_program.global_block().create_var( + name=trainer_out.name, + type=trainer_out.type, + shape=trainer_out.shape, + dtype=trainer_out.dtype) + prefetch_block.append_op( + type="lookup_sparse_table", + inputs={'Ids': pserver_ids, + "W": table_var}, + outputs={"Out": pserver_out}, + attrs={ + "is_sparse": True, # has no effect on lookup_table op + "is_distributed": True, + "padding_idx": -1 + }) + prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str( + prefetch_block.idx)) + return prefetch_var_name_to_block_id def _create_table_optimize_block(self, pserver_index, pserver_program, pre_block_idx, grad_to_block_id): @@ -777,7 +881,8 @@ class DistributeTranspiler: table_opt_block.append_op( type="sum", inputs={"X": pserver_side_table_grad_list}, - outputs={"Out": [grad_var]}) + outputs={"Out": [grad_var]}, + attrs={"use_mkldnn": False}) else: # in async_mode, for table gradient, it also need to be splited to each parameter server origin_grad_name = grad_var.name @@ -808,6 +913,27 @@ class DistributeTranspiler: return table_opt_block + def _create_checkpoint_save_block(self, pserver_program, pre_block_idx): + """ + create a new block to handle save checkpoint. + """ + import os + + pserver_program.global_block().create_var( + name="kLookupTablePath", + persistable=True, + type=core.VarDesc.VarType.RAW) + + checkpoint_save_block = pserver_program.create_block(pre_block_idx) + # this 'file_path' do not be used in save lookup table variable + checkpoint_save_block.append_op( + type='save', + inputs={'X': [self.table_name]}, + outputs={}, + attrs={'file_path': "none"}) + + return checkpoint_save_block.idx + def _create_vars_from_blocklist(self, program, block_list, @@ -1009,7 +1135,8 @@ class DistributeTranspiler: optimize_block.append_op( type="sum", inputs={"X": vars2merge}, - outputs={"Out": merged_var}) + outputs={"Out": merged_var}, + attrs={"use_mkldnn": False}) # TODO(panyx0718): What if it's SELECTED_ROWS. if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: optimize_block.append_op( @@ -1095,7 +1222,29 @@ class DistributeTranspiler: break return grad_block - def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint): + def _clone_lr_op(self, program, block, op): + inputs = self._get_input_map_from_op( + self.origin_program.global_block().vars, op) + for key, varlist in inputs.iteritems(): + if not isinstance(varlist, list): + varlist = [varlist] + for var in varlist: + if var not in program.global_block().vars: + block.clone_variable(var) + + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, op) + for key, varlist in outputs.iteritems(): + if not isinstance(varlist, list): + varlist = [varlist] + for var in varlist: + if var not in program.global_block().vars: + block.clone_variable(var) + + return block.append_op( + type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) + + def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated inputs = self._get_input_map_from_op( @@ -1130,7 +1279,7 @@ class DistributeTranspiler: elif not program.global_block().vars.has_key(var.name): program.global_block().clone_variable(var) - optimize_block.append_op( + return optimize_block.append_op( type=opt_op.type, inputs=inputs, outputs=outputs, @@ -1174,16 +1323,6 @@ class DistributeTranspiler: ufind.union(op1, op2) return ufind - def _is_opt_role_op(self, op): - # NOTE: depend on oprole to find out whether this op is for - # optimize - op_maker = core.op_proto_and_checker_maker - optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attrs and \ - int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): - return True - return False - def _is_optimizer_op(self, op): if "Param" in op.input_names and \ "LearningRate" in op.input_names: @@ -1274,7 +1413,10 @@ class DistributeTranspiler: params_grads = [] origin_var_dict = self.origin_program.global_block().vars for op in block.ops: - if self._is_opt_role_op(op): + # NOTE(Yancey1989): we can not use op role to distinguish an optimizer op + # or not, because all ops in optimizer sub-graph would + # sign the optimizer op role + if self._is_optimizer_op(op): opt_ops.append(op) # HACK(wuyi): if we find grad vars from input of optimize # ops, we may get the output of clip op. Use syntax "@GRAD" diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 202aa76084432b4b2378470919b2e924301f2130..0629f2916b339a6cd19ccadf435a67a17d6da4cc 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -19,16 +19,30 @@ from ..executor import global_scope class InferenceTranspiler: + ''' + Convert the fluid program to optimized inference program. + + There are several optimizations, only fuse batch normalization is supported now. + + Examples: + + .. code-block:: python + + # As InferenceTranspiler will modify the original program, + # please clone before use it. + inference_transpiler_program = program.clone() + t = fluid.InferenceTranspiler() + t.transpile(inference_transpiler_program, place) + ''' + def transpile(self, program, place, scope=None): ''' - Transpile the program. Support only fuse batch normalization now. - - :param program: program to transpile - :type program: Program - :param place: inference place - :type place: Place - :param scope: inference scope - :type scope: Scope or None + Run the transpiler. + + Args: + program (Program): program to transpile + place (Place): inference place + scope (Scope|None): inference Scope ''' if not isinstance(program, Program): raise TypeError("program should be as Program type") @@ -49,36 +63,43 @@ class InferenceTranspiler: can be integrated with them. Doing so will give us a forward acceleration, especially in environments like mobile or embedded. - For input X: - - Conv process: X = input * W + bias - - Batch norm process: X' = (X - mean) / std - - Scale Process: Y = a * X' + b + For input :math:`X`: + + - Conv process: :math:`X = input * W + bias` + - Batch norm process: :math:`X' = (X - mean) / std` + - Scale Process: :math:`Y = a * X' + b` After fuse into one operation: - Y = (input * W + bias - mean) / std * a + b - = input * a * W / std + ((bias - mean) / std * a + b) + .. math:: + + Y &= (input * W + bias - mean) / std * a + b \\\\ + &= input * a * W / std + ((bias - mean) / std * a + b) The operator transformation is: + - before: + - conv->batch_norm->any_other_op (bias == 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0) + - after: + - conv->elementwise_add->any_other_op The transpile stages are: + 1. insert elementwise_add op when bias == 0. 2. fuse the batch_norm's parameters to conv and elementwise_add operators. 3. remove batch_norm ops which are not used in any other ops. 4. adjust the input of any_other_op to be the output of elementwise_add operator. 5. remove unused variables. - :param program: program to transpile - :type program: Program - :param place: inference place - :type place: Place - :param scope: inference scope - :type scope: Scope + Args: + program (Program): program to transpile + place (Place): inference place + scope (Scope): inference Scope + ''' self.scope = scope self.place = place diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 9ff0ae6fca27d4681891b2033e2f8f95bd825942..999ef43ca0feacbddff5f9db59589ce7097fe77e 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -157,9 +157,11 @@ class ControlFlowGraph(object): if op.type() == "fill_constant" and op.attr("force_cpu") == True: self._skip_opt.update(op.output_arg_names()) - def release_memory(self): + def release_memory(self, skip_opt_set=None): self._dataflow_analyze() self._update_skip_opt_set() + if skip_opt_set: + self._skip_opt.update(skip_opt_set) fwd_id = 0 bwd_id = 0 for i in range(self.op_size): @@ -183,7 +185,7 @@ class ControlFlowGraph(object): else: bwd_id += 1 - def memory_optimize(self, level=0): + def memory_optimize(self, skip_opt_set=None, level=0): def compare_shape(x_shape, cache_shape, opt_level): if opt_level == 0: return x_shape == cache_shape @@ -200,6 +202,9 @@ class ControlFlowGraph(object): self._dataflow_analyze() self._update_skip_opt_set() + # update skip set to meet users' demand + if skip_opt_set: + self._skip_opt.update(skip_opt_set) self.pool = [] for i in range(self.op_size): op = self._ops[i] @@ -358,7 +363,7 @@ def _get_cfgs(input_program): return cfgs -def memory_optimize(input_program, print_log=False, level=0): +def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): """Optimize memory by reusing var memory. Note: it doesn't not support subblock nested in subblock. @@ -374,10 +379,20 @@ def memory_optimize(input_program, print_log=False, level=0): PRINT_LOG = print_log cfgs = _get_cfgs(input_program) for cfg in cfgs: - cfg.memory_optimize(level) + cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) -def release_memory(input_program): +def release_memory(input_program, skip_opt_set=None): + """ + Modify the input program and insert :code:`delete_op` to early drop not used + variables. The modification will be performed inplace. + + Notes: This is an experimental API and could be removed in next few + releases. Users should not use this API. + + Args: + input_program(Program): The program will be inserted :code:`delete_op`. + """ cfgs = _get_cfgs(input_program) for cfg in cfgs: - cfg.release_memory() + cfg.release_memory(skip_opt_set=skip_opt_set) diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index d6a68677527deb09ace0e3a23cbc093d6d7b4349..dcffadd531719431f27feb464ed58a65c04770ee 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -33,15 +33,21 @@ class PSDispatcher(object): def dispatch(self, varlist): """ - :param varlist: a list of Variables - :return: a map of pserver endpoint -> varname + Args: + varlist(list): a list of Variables + Returns: + a map of pserver endpoint -> varname """ AssertionError("Interface has not been implemented.") class HashName(PSDispatcher): """ - Hash variable names to several endpoints + Hash variable names to several endpoints using python + "hash()" function. + + Args: + pserver_endpoints (list): list of endpoint(ip:port). """ def __init__(self, pserver_endpoints): @@ -61,7 +67,11 @@ class HashName(PSDispatcher): class RoundRobin(PSDispatcher): """ - Distribute variables to serveral endpoints. + Distribute variables to serveral endpoints using + RondRobin method. + + Args: + pserver_endpoints (list): list of endpoint(ip:port). """ def __init__(self, pserver_endpoints): diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 33c53113ae7e8ed9aeada31f2aed6990b6fea110..776619cd36722e338a9fdd5e13bceeaf3724de2c 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -16,7 +16,7 @@ import collections import contextlib import sys -__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator'] +__all__ = ['generate', 'switch', 'guard'] class UniqueNameGenerator(object): diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 44a6e344630bb35d28ee29078bf8727053a24bef..1f83cabb8481451736944823be45185deea4f43b 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -336,7 +336,7 @@ def _buf2lines(buf, line_break="\n"): class PipeReader: """ - PipeReader read data by stream from a command, take it's + PipeReader read data by stream from a command, take it's stdout into a pipe buffer and redirect it to the parser to parse, then yield data as your desired format. @@ -352,7 +352,7 @@ class PipeReader: An example: .. code-block:: python - + def example_reader(): for f in myfiles: pr = PipeReader("cat %s"%f) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index e6f87ce61b1d16d4f98f111626776aa52c2ec35b..4e3beaf639bad9fed2862a5477095b66ef4b9aee 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -240,14 +240,15 @@ class ExtraLayerAttribute(object): :type error_clipping_threshold: float :param drop_rate: Dropout rate. Dropout will create a mask on layer output. The dropout rate is the zero rate of this mask. The - details of what dropout is please refer to `here - `_. + details of what dropout is please refer to `JMLRdropout + `_. :type drop_rate: float :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU. - The details allocation in parallel_nn please refer to `here - `_. + The details allocation in parallel_nn please refer to `use_case + `_. :type device: int """ diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index ebc31b23e0f5504b4bebccabe996b054c7fbce3b..e6a03759ef431086390e217eabcdff47e610346c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2556,7 +2556,7 @@ def img_conv_layer(input, the output will be obtained by concatenating the two results. The details of grouped convolution, please refer to: - `ImageNet Classification with Deep Convolutional Neural Networks + `ImageNet Classification With Deep Convolutional Neural Networks `_ The example usage is: @@ -5678,8 +5678,8 @@ def warp_ctc_layer(input, `_ library, which is used in `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin `_, to compute Connectionist Temporal - Classification (CTC) loss. Besides, another `warp-ctc - `_ repository, which is forked from + Classification (CTC) loss. Besides, another `warp-ctc repository + `_ , which is forked from the official one, is maintained to enable more compiling options. During the building process, PaddlePaddle will clone the source codes, build and install it to :code:`third_party/install/warpctc` directory. diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 0a2a1ced11ee5cb2fb407b229ce810d553c2fa46..662655c836dbc54bd6187dcd3dac7354d6c8ecd1 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' -def reader_creator(filename, sub_name): +def reader_creator(filename, sub_name, cycle=False): def read_batch(batch): data = batch['data'] labels = batch.get('labels', batch.get('fine_labels', None)) @@ -56,10 +56,13 @@ def reader_creator(filename, sub_name): names = (each_item.name for each_item in f if sub_name in each_item.name) - for name in names: - batch = cPickle.load(f.extractfile(name)) - for item in read_batch(batch): - yield item + while True: + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + if not cycle: + break return reader @@ -94,34 +97,40 @@ def test100(): 'test') -def train10(): +def train10(cycle=False): """ CIFAR-10 training set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Training reader creator :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + 'data_batch', + cycle=cycle) -def test10(): +def test10(cycle=False): """ CIFAR-10 test set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Test reader creator. :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + 'test_batch', + cycle=cycle) def fetch(): diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 7bdddeaabec733ef26b3f766c6437f5c53d65044..db12076d54064781bd1060947497622b14783768 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -76,7 +76,8 @@ def reader_creator(data_file, dataset_name, mapper, buffered_size=1024, - use_xmap=True): + use_xmap=True, + cycle=False): ''' 1. read images from tar file and merge images into batch files in 102flowers.tgz_batch/ @@ -96,6 +97,8 @@ def reader_creator(data_file, :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: data reader :rtype: callable ''' @@ -108,23 +111,27 @@ def reader_creator(data_file, file_list = batch_images_from_tar(data_file, dataset_name, img2label) def reader(): - for file in open(file_list): - file = file.strip() - batch = None - with open(file, 'r') as f: - batch = cPickle.load(f) - data = batch['data'] - labels = batch['label'] - for sample, label in itertools.izip(data, batch['label']): - yield sample, int(label) - 1 + while True: + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + if not cycle: + break if use_xmap: - return xmap_readers(mapper, reader, cpu_count(), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader) -def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers training set reader. It returns a reader, each sample in the reader is @@ -137,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: train data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TRAIN_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) -def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers test set reader. It returns a reader, each sample in the reader is @@ -160,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: test data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TEST_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644 --- a/python/paddle/v2/minibatch.py +++ b/python/paddle/v2/minibatch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader. diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py new file mode 100644 index 0000000000000000000000000000000000000000..7de76c381b29a1ff8dcf2167f0e861dc261aa47b --- /dev/null +++ b/tools/check_ctest_hung.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import re + + +def escape(input): + o = input.replace("\n", "") + o = o.replace("\r", "") + return o + + +def main(): + usage = """Usage: +1. Download the Paddle_PR_CI_*.log from TeamCity +2. run: python check_ctest_hung.py Paddle_PR_CI_*.log +3. If there is hung ctest, the result likes: +Diff: set(['test_parallel_executor_crf']) + """ + if len(sys.argv) < 2: + print(usage) + exit(0) + + logfile = sys.argv[1] + started = set() + passed = set() + with open(logfile, "r") as fn: + for l in fn.readlines(): + if l.find("Test ") != -1 and \ + l.find("Passed") != -1: + m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l)) + passed.add(m.group(1)) + if l.find("Start ") != -1: + start_parts = escape(l).split(" ") + m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) + started.add(m.group(1)) + print "Diff: ", started - passed + + +if __name__ == "__main__": + main() diff --git a/.clang_format.hook b/tools/codestyle/clang_format.hook similarity index 100% rename from .clang_format.hook rename to tools/codestyle/clang_format.hook diff --git a/.copyright.hook b/tools/codestyle/copyright.hook similarity index 100% rename from .copyright.hook rename to tools/codestyle/copyright.hook diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index b194af76dc529fd52b0aedfab9c41d625fe64c0d..a9775e10ef51fae493523149ee3dbbf227a1aaa9 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -7,7 +7,7 @@ for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then continue; else - cpplint $file; + cpplint --filter=-readability/fn_size $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); fi done diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py index 48100e5bf989520043b5ca372b02883faea8a9fd..8d4b24a0cf6b743b72dca58fd885f927560964bf 100644 --- a/tools/codestyle/docstring_checker.py +++ b/tools/codestyle/docstring_checker.py @@ -126,9 +126,10 @@ class DocstringChecker(BaseChecker): 'W9002': ('Doc string does not end with "." period', symbol + "-end-with", 'Used when a doc string does not end with a period'), - 'W9003': ('All args with their types must be mentioned in doc string', - symbol + "-with-all-args", - 'Used when not all arguments are in the doc string '), + 'W9003': + ('All args with their types must be mentioned in doc string %s', + symbol + "-with-all-args", + 'Used when not all arguments are in the doc string '), 'W9005': ('Missing docstring or docstring is too short', symbol + "-missing", 'Add docstring longer >=10'), 'W9006': ('Docstring indent error, use 4 space for indent', @@ -178,6 +179,8 @@ class DocstringChecker(BaseChecker): self.indent_style(node) def missing_doc_string(self, node): + if node.name.startswith("__") or node.name.startswith("_"): + return True if node.tolineno - node.fromlineno <= 10: return True @@ -199,12 +202,16 @@ class DocstringChecker(BaseChecker): doc = node.doc lines = doc.splitlines() + line_num = 0 for l in lines: + if line_num == 0: + continue cur_indent = len(l) - len(l.lstrip()) if cur_indent % indent != 0: self.add_message('W9006', node=node, line=node.fromlineno) return False + line_num += 1 return True @@ -284,6 +291,8 @@ class DocstringChecker(BaseChecker): True if successful otherwise False. """ + if node.name.startswith("__") or node.name.startswith("_"): + return True find = False for t in node.body: if not isinstance(t, astroid.Return): @@ -309,6 +318,8 @@ class DocstringChecker(BaseChecker): Returns: True if successful otherwise False. """ + if node.name.startswith("__") or node.name.startswith("_"): + return True args = [] for arg in node.args.get_children(): if (not isinstance(arg, astroid.AssignName)) \ @@ -320,15 +331,19 @@ class DocstringChecker(BaseChecker): return True parsed_args = doc.args + args_not_documented = set(args) - set(parsed_args) if len(args) > 0 and len(parsed_args) <= 0: - print "debug:parsed args: ", parsed_args - self.add_message('W9003', node=node, line=node.fromlineno) + self.add_message( + 'W9003', + node=node, + line=node.fromlineno, + args=list(args_not_documented)) return False for t in args: if t not in parsed_args: - print t, " with (type) not in ", parsed_args - self.add_message('W9003', node=node, line=node.fromlineno) + self.add_message( + 'W9003', node=node, line=node.fromlineno, args=[t, ]) return False return True diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook index e7c92ba671e0eb778b2ab5447bea7c4b14fe761b..150a3f5666bd39d30b7e6518e58a14fb5fe2f14b 100755 --- a/tools/codestyle/pylint_pre_commit.hook +++ b/tools/codestyle/pylint_pre_commit.hook @@ -7,13 +7,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export PYTHONPATH=$DIR:$PYTHONPATH # The trick to remove deleted files: https://stackoverflow.com/a/2413151 -for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do +for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do pylint --disable=all --load-plugins=docstring_checker \ --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done -#exit $TOTAL_ERRORS +exit $TOTAL_ERRORS #For now, just warning: -exit 0 +#exit 0 diff --git a/tools/print_signatures.py b/tools/print_signatures.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7ffd44c7b0ba2270069bc4467dc377a58b2417 --- /dev/null +++ b/tools/print_signatures.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Print all signature of a python module in alphabet order. + +Usage: + ./print_signature "paddle.fluid" > signature.txt +""" +import importlib +import inspect +import collections +import sys +import pydoc + +member_dict = collections.OrderedDict() + + +def visit_member(parent_name, member): + cur_name = ".".join([parent_name, member.__name__]) + if inspect.isclass(member): + for name, value in inspect.getmembers(member): + if hasattr(value, '__name__') and (not name.startswith("_") or + name == "__init__"): + visit_member(cur_name, value) + elif callable(member): + try: + member_dict[cur_name] = inspect.getargspec(member) + except TypeError: # special for PyBind method + member_dict[cur_name] = " ".join([ + line.strip() for line in pydoc.render_doc(member).split('\n') + if "->" in line + ]) + + else: + raise RuntimeError("Unsupported generate signature of member, type {0}". + format(str(type(member)))) + + +def visit_all_module(mod): + for member_name in ( + name + for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) + if not name.startswith("_")): + instance = getattr(mod, member_name, None) + if instance is None: + continue + if inspect.ismodule(instance): + visit_all_module(instance) + else: + visit_member(mod.__name__, instance) + + +visit_all_module(importlib.import_module(sys.argv[1])) + +for name in member_dict: + print name, member_dict[name]