diff --git a/CMakeLists.txt b/CMakeLists.txt index cfaab206e1f321a55119d4a8d65c4a99d3819fff..4117f077219d3b8fc097631073eafa748ff918bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,12 +55,14 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) -option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_ANAKIN "Compile with Anakin library" OFF) +option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -147,7 +149,16 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) -include(external/grpc) + +if(WITH_DISTRIBUTE) + if(WITH_GRPC) + include(external/grpc) + else() + include(external/leveldb) + include(external/brpc) + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) @@ -183,7 +194,10 @@ set(EXTERNAL_LIBS if(WITH_GPU) include(cuda) include(tensorrt) -endif(WITH_GPU) + include(external/anakin) +else() + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) +endif() if(WITH_AMD_GPU) find_package(HIP) diff --git a/Dockerfile b/Dockerfile index 4d6165b79a1d94b8f27d7f3ee1b6e2cee5992d31..752fea5951bdc8c2cf79a17c960217c88ae62571 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y --allow-downgrades \ - git python-pip python-dev openssh-server bison \ + git python-pip python-dev python-opencv openssh-server bison \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ @@ -76,8 +76,7 @@ RUN easy_install -U pip && \ pip install sphinx-rtd-theme==0.1.9 recommonmark RUN pip install pre-commit 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' #For docstring checker RUN pip install pylint pytest astroid isort diff --git a/benchmark/.gitignore b/benchmark/.gitignore index 7b66e8a5b5020fd847982db401665d24ba3a069c..fb4114356d4f37efc8ad672316fd4f99443d9fcd 100644 --- a/benchmark/.gitignore +++ b/benchmark/.gitignore @@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl caffe/image/logs tensorflow/image/logs tensorflow/rnn/logs +fluid/models/*.pyc +fluid/logs +fluid/nohup.out diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index 46140a9d1be01a50cd74dab2799e3731e8d3debd..b9eaca5ee6b487bb37bb954b3c606c3096d37aeb 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -1,8 +1,8 @@ FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 -RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop +RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so RUN pip install -U pip -RUN pip install -U kubernetes opencv-python paddlepaddle +RUN pip install -U kubernetes paddlepaddle # IMPORTANT: # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. @@ -19,4 +19,4 @@ ADD *.whl / RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py dataset.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 1b0c7dce8bd6faab0c4c59caa1cbe337483cbd16..28cade4634bb62723bf5120169e202657f548234 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,14 +24,18 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: + * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: ```bash PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver + sleep 15 ``` * start trainers: ```bash @@ -42,6 +46,16 @@ Currently supported `--model` argument include: PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 ``` +## Prepare the RecordIO file to Achieve Better Performance + +Run the following command will generate RecordIO files like "mnist.recordio" under the path +and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size +at any time using `fluid.batch`. + +```bash +python -c 'from recordio_converter import *; prepare_mnist("data", 1)' +``` + ## Run Distributed Benchmark on Kubernetes Cluster You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 0000000000000000000000000000000000000000..68a3d42d7a8a8082730f4cae3b5d4ea33819ca2f --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,126 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index c1d458970a58bfac2a3369e8964eb100568b28f2..aa70783ecd68be543b2d5aabee96a5b09bd72e6a 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,90 +24,7 @@ import paddle.fluid.core as core import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - parser.add_argument( - '--learning_rate', - type=float, - default=0.001, - help='The minibatch size.') - # TODO(wuyi): add "--use_fake_data" option back. - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_false', - help='If set, test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): @@ -142,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -164,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -208,36 +130,57 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(startup_prog) - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) + + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - for batch_id, data in enumerate(train_reader()): + if not args.use_reader_op: + reader_generator = train_reader() + batch_id = 0 + data = None + while True: + if not args.use_reader_op: + data = next(reader_generator, None) + if data == None: + break + if iters == args.iterations: + break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if iters == args.iterations: - break - loss = exe.run(train_prog, - feed=feeder.feed(data), - fetch_list=[avg_loss]) + + if args.use_reader_op: + try: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + break + else: + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) iters += 1 - num_samples += len(data) + batch_id += 1 + # FIXME(wuyi): For use_reader_op, if the current + # pass is not the last, the last batch of this pass + # is also equal to args.batch_size. + if args.use_reader_op: + num_samples += args.batch_size * args.gpus + else: + num_samples += len(data) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % (pass_id, iters, np.mean(train_losses))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' % - (num_samples, train_elapsed, examples_per_sec)) - print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))) + print_train_time(start_time, time.time(), num_samples) + print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), # evaluation - if not args.no_test and batch_acc != None: + if not args.no_test and batch_acc and not args.use_reader_op: pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc) print(", Test Accuracy: %f" % pass_test_acc) @@ -251,10 +194,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + # generate fake: if args.use_fake_data: for var in feed_var_list: @@ -271,7 +218,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, "value": 1.0, "dtype": var.dtype}) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) @@ -288,12 +234,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_trainers=num_trainers, trainer_id=trainer_id) - feeder = fluid.DataFeeder(feed_var_list, place) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 start_time = time.time() - for batch_id, data in enumerate(train_reader()): + if not args.use_reader_op: + reader_generator = train_reader() + batch_id = 0 + data = None + while True: + if not args.use_reader_op: + data = next(reader_generator, None) + if data == None: + break + if iters == args.iterations: + break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") elif args.profile and pass_id == 0 and batch_id == 10: @@ -302,39 +257,50 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if iters == args.iterations: - break - if args.use_fake_data: - loss, = exe.run([avg_loss.name]) + if args.use_fake_data or args.use_reader_op: + try: + loss, = exe.run([avg_loss.name]) + except fluid.core.EnforceNotMet as ex: + break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": exe.bcast_params() - num_samples += len(data) + if args.use_reader_op: + num_samples += args.batch_size * args.gpus + else: + num_samples += len(data) iters += 1 if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - if not args.no_test and batch_acc != None: + batch_id += 1 + + print_train_time(start_time, time.time(), num_samples) + if not args.no_test and batch_acc and not args.use_reader_op: + # we have not implement record io for test + # skip test when use args.use_reader_op test_acc = test(startup_exe, infer_prog, test_reader, feeder, batch_acc) print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) - exit(0) def print_arguments(args): vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and vars(args)['device'] == 'GPU') - print('----------- resnet Configuration Arguments -----------') + print('----------- Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) print('------------------------------------------------') +def print_train_time(start_time, end_time, num_samples): + train_elapsed = end_time - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + + def main(): args = parse_args() print_arguments(args) @@ -342,7 +308,7 @@ def main(): # the unique trainer id, starting from 0, needed by trainer # only nccl_id_var, num_trainers, trainer_id = ( - None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1"))) + None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0"))) if args.use_cprof: pr = cProfile.Profile() @@ -356,7 +322,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd27b21f83afae10b1d24833b81d57eb..69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): + if args.use_reader_op: + raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 decoder_size = 512 @@ -221,7 +223,7 @@ def get_model(args): train_batch_generator = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d264bfc12bdb159c06dae81db4949b9ee17268e2..8e740dc6896b7eeeb82170aa13d32987c4df5c48 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -20,6 +20,7 @@ import numpy as np import argparse import time import cProfile +import os import paddle import paddle.fluid as fluid @@ -65,19 +66,49 @@ def cnn_model(data): def get_model(args): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1, 1, 28, 28], (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if args.device == 'CPU' and args.cpus > 1: + places = fluid.layers.get_places(args.cpus) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + predict = cnn_model(pd.read_input(images)) + label = pd.read_input(label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + batch_acc = fluid.layers.accuracy(input=predict, label=label) + + pd.write_output(avg_cost) + pd.write_output(batch_acc) + + avg_cost, batch_acc = pd() + avg_cost = fluid.layers.mean(avg_cost) + batch_acc = fluid.layers.mean(batch_acc) + else: + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_acc = fluid.layers.accuracy(input=predict, label=label) # inference program inference_program = fluid.default_main_program().clone() @@ -88,7 +119,7 @@ def get_model(args): # Reader train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) + paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=args.batch_size) return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9dec8911ed64e09285fb461c4a12adb601535316..9ed1093c54a501cc93dbbf9c3651fe70914ce26b 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -19,6 +19,7 @@ from __future__ import print_function import functools import numpy as np import time +import os import cProfile, pstats, StringIO @@ -26,6 +27,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler +from recordio_converter import imagenet_train, imagenet_test def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): @@ -122,40 +124,85 @@ def get_model(args): else: dshape = [32, 32, 3] model = resnet_cifar10 - else: + train_reader = paddle.dataset.cifar.train10() + test_reader = paddle.dataset.cifar.test10() + elif args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] model = resnet_imagenet - - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - predict = model(input, class_dim) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) + train_reader = paddle.dataset.flowers.train() + test_reader = paddle.dataset.flowers.test() + elif args.data_set == "imagenet": + class_dim = 1000 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + if not args.data_path: + raise Exception( + "Must specify --data_path when training with imagenet") + train_reader = imagenet_train(args.data_path) + test_reader = imagenet_test(args.data_path) + + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + dshape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + input, label = fluid.layers.read_file(data_file) + else: + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if args.device == 'CPU' and args.cpus > 1: + places = fluid.layers.get_places(args.cpus) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + predict = model(pd.read_input(input), class_dim) + label = pd.read_input(label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + batch_acc = fluid.layers.accuracy(input=predict, label=label) + + pd.write_output(avg_cost) + pd.write_output(batch_acc) + + avg_cost, batch_acc = pd() + avg_cost = fluid.layers.mean(avg_cost) + batch_acc = fluid.layers.mean(batch_acc) + else: + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + batch_acc = fluid.layers.accuracy(input=predict, label=label) inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) + target_vars=[batch_acc]) optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - train_reader = paddle.batch( + batched_train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc + train_reader, buf_size=5120), + batch_size=args.batch_size * args.gpus, + drop_last=True) + batched_test_reader = paddle.batch( + train_reader, batch_size=args.batch_size, drop_last=True) + + return avg_cost, inference_program, optimizer, batched_train_reader,\ + batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 81a28b5f3aed0c325398b909d700c23df545824a..211869af4e8d7180cb485811d3363c50d32f0f74 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size): def get_model(args): + if args.use_reader_op: + raise Exception( + "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 emb_dim = 512 crop_size = 1500 @@ -115,7 +118,7 @@ def get_model(args): train_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 53856c5f7acd3a4e1476ec57154a880bb6f984c9..932601302d2f5d56b53e3462af886429034d8989 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import argparse import functools +import os def vgg16_bn_drop(input): @@ -65,9 +66,25 @@ def get_model(args): else: data_shape = [224, 224, 3] - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + data_shape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data( + name='data', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program net = vgg16_bn_drop(images) @@ -95,7 +112,7 @@ def get_model(args): paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6 --- /dev/null +++ b/benchmark/fluid/recordio_converter.py @@ -0,0 +1,164 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.dataset import mnist, cifar, flowers, image + + +def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data, + shape_label): + num_batches = 0 + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(py_reader(), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=shape_data), + fluid.layers.data( + name='label', shape=shape_label, dtype='int64'), + ], + place=fluid.CPUPlace()) + num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + outfilepath, reader, feeder) + return num_batches + + +def prepare_mnist(outpath, batch_size): + outfilepath = os.path.join(outpath, "mnist.recordio") + convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1]) + + +def prepare_cifar10(outpath, batch_size): + outfilepath = os.path.join(outpath, "cifar.recordio") + convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1]) + + +def prepare_flowers(outpath, batch_size): + outfilepath = os.path.join(outpath, "flowers.recordio") + convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224], + [1]) + + +def default_mapper(sample): + img, label = sample + img = image.simple_transform( + img, 256, 224, True, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +def imagenet_train(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "train.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + # shuffle all, this is slow + random.shuffle(imgfilelist) + + def train_reader(): + for idx, imgfile in enumerate(imgfilelist): + data = image.load_image( + os.path.join(data_dir, "train", imgfile.lower())) + label = [img2label[imgfile], ] + yield [data, label] + + return paddle.reader.map_readers(default_mapper, train_reader) + + +def imagenet_test(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "val.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + + def test_reader(): + for idx, imgfile in enumerate(imgfilelist): + base_path = os.path.join(data_dir, "val", imgfile.split(".")[0]) + image_path = ".".join([base_path, "jpeg"]) + data = image.load_image(image_path) + label = [img2label[imgfile], ] + yield [data, label] + + return paddle.reader.map_readers(default_mapper, test_reader) + + +# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with fluid.recordio_writer.create_recordio_writer( + filename, compressor, max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + print("written file: ", filename) + return counter + + +def prepare_imagenet(inpath, outpath, batch_size): + r = paddle.batch(imagenet_train(inpath), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name="image", shape=[3, 224, 224]), fluid.layers.data( + name="label", shape=[1], dtype='int64') + ], + place=fluid.CPUPlace()) + outpath = os.path.join(outpath, "imagenet.recordio") + convert_reader_to_recordio_files(outpath, 10000, r, feeder) diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh index afaab5f4de43fa7e94feeed4a1de991351c04b76..5d9b2db87135e53470b106dcd11a6bcfdc5dbda9 100644 --- a/benchmark/fluid/run.sh +++ b/benchmark/fluid/run.sh @@ -2,6 +2,7 @@ # This script benchmarking the PaddlePaddle Fluid on # single thread single GPU. +mkdir -p logs #export FLAGS_fraction_of_gpu_memory_to_use=0.0 export CUDNN_PATH=/paddle/cudnn_v5 @@ -35,6 +36,7 @@ nohup stdbuf -oL nvidia-smi \ --format=csv \ --filename=mem.log \ -l 1 & + # mnist # mnist gpu mnist 128 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ @@ -43,7 +45,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=128 \ --skip_batch_num=5 \ --iterations=500 \ - 2>&1 | tee -a mnist_gpu_128.log + 2>&1 | tee -a logs/mnist_gpu_128.log # vgg16 # gpu cifar10 128 @@ -53,7 +55,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=128 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a vgg16_gpu_128.log + 2>&1 | tee -a logs/vgg16_gpu_128.log # flowers gpu 128 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ @@ -63,28 +65,28 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --data_set=flowers \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a vgg16_gpu_flowers_32.log + 2>&1 | tee -a logs/vgg16_gpu_flowers_32.log # resnet50 # resnet50 gpu cifar10 128 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=resnet50 \ + --model=resnet \ --device=GPU \ --batch_size=128 \ --data_set=cifar10 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a resnet50_gpu_128.log + 2>&1 | tee -a logs/resnet50_gpu_128.log # resnet50 gpu flowers 64 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=resnet50 \ + --model=resnet \ --device=GPU \ --batch_size=64 \ --data_set=flowers \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a resnet50_gpu_flowers_64.log + 2>&1 | tee -a logs/resnet50_gpu_flowers_64.log # lstm # lstm gpu imdb 32 # tensorflow only support batch=32 @@ -94,7 +96,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=32 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a lstm_gpu_32.log + 2>&1 | tee -a logs/lstm_gpu_32.log # seq2seq # seq2seq gpu wmb 128 @@ -104,4 +106,4 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=128 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a lstm_gpu_128.log + 2>&1 | tee -a logs/lstm_gpu_128.log diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..4309a3126c1d72fe1eb2d5ec423075aea4d3ec88 --- /dev/null +++ b/benchmark/fluid/run_fluid_benchmark.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 & + +sleep 15 + +CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 & + +CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 & diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 682614742cf1bd3130c638020a2545e16226d4d6..6a8b15a6b60a2e5635dc78fc877f0c8da9a2a998 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -92,6 +92,9 @@ if(WITH_GPU) if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") endif() + if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) + message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") + endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() elseif(WITH_AMD_GPU) @@ -115,6 +118,10 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") +if(WITH_DISTRIBUTE) + add_definitions(-DPADDLE_WITH_DISTRIBUTE) +endif() + if(WITH_GOLANG) # we need to symlink Paddle directory into GOPATH. If we # don't do it and we have code that depends on Paddle, go @@ -163,3 +170,7 @@ if(WITH_GOLANG) endif() endif(WITH_GOLANG) + +if(WITH_GRPC) + add_definitions(-DPADDLE_WITH_GRPC) +endif(WITH_GRPC) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f1cd9c99ebfe5dc5ee0d46d61f1e08256c27d9cd --- /dev/null +++ b/cmake/external/anakin.cmake @@ -0,0 +1,42 @@ +if (NOT WITH_ANAKIN) + return() +endif() + +set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH + "Anakin install path." FORCE) +set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") +set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") + +set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp) + +set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") + +# A helper function used in Anakin, currently, to use it, one need to recursively include +# nearly all the header files. +function(fetch_include_recursively root_dir) + if (IS_DIRECTORY ${root_dir}) + include_directories(${root_dir}) + endif() + + file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) + foreach(sub ${ALL_SUB}) + if (IS_DIRECTORY ${root_dir}/${sub}) + fetch_include_recursively(${root_dir}/${sub}) + endif() + endforeach() +endfunction() + +# download library +message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") +execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") +execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") +execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") +execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") +execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") + +if (WITH_ANAKIN) + message(STATUS "Anakin for inference is enabled") + message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") + fetch_include_recursively(${ANAKIN_INCLUDE}) + link_directories(${ANAKIN_LIBRARY}) +endif() diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8e2c913b2caae0c4eeb844d2b51a8975e81c1592 --- /dev/null +++ b/cmake/external/brpc.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) +SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) +SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) + +INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) + +# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") + +# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF +ExternalProject_Add( + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/brpc/brpc" + GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + PREFIX ${BRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DBRPC_WITH_GLOG=ON + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +ADD_DEPENDENCIES(brpc extern_brpc) + + +LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 9459f1ddfe85f5607880d3fdd968b494d6af592a..ffdf91a354bd92bdaf3f88344f0a9256638b568c 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -33,10 +33,19 @@ ELSE() SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) ENDIF() +# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them ExternalProject_Add( extern_grpc DEPENDS protobuf zlib - URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz" + # NOTE(wuyi): + # this package is generated by following steps: + # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git + # 2. submodule update --init + # 3. keep only zlib, cares, protobuf, boringssl under "third_party", + # checkout and clean other dirs under third_party + # 4. remove .git, and package the directory. + URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz" + URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -49,7 +58,6 @@ ExternalProject_Add( INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install ) -# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fb5091731da02b497a14f119e944905eee4979d5 --- /dev/null +++ b/cmake/external/leveldb.cmake @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb) +SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) +SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE) +SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE) +INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_leveldb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LEVELDB_SOURCES_DIR} + URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" + URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + CONFIGURE_COMMAND "" + BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a + INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ + && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} + && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_leveldb snappy) + +ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) +ADD_DEPENDENCIES(leveldb extern_leveldb) + +LIST(APPEND external_project_dependencies leveldb) + diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 8af2765f58717408e3a1ef6b500bb01511bfd8d3..4a49a92f2b131bbb38fcf93070ea811e0b1a14e8 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND}) "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) + ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_COMMIT "v0.2.20") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9ddd05b3d9404df29ca1bf634105314b7e6a5b70..0e2df86c19086357ab520edfcd8421e35768c928 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -610,3 +610,21 @@ function(grpc_library TARGET_NAME) COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") endfunction() + + +function(brpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating brpc ${brpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}") + cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}") + cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") +endfunction() diff --git a/doc/fluid/api/detection.rst b/doc/fluid/api/detection.rst new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index 0f0539355559446fd91f659d61b636db214b5a40..27f2419c06b3ba2d29c471c4928d098ccee9ea02 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,5 +1,5 @@ #!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst +python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer do diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index 5329adaa18ba3309a1aeda7e24c9d0d3b26ea377..e5ced9c04c3f702733635ad0397c8c52ec4b3970 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -594,7 +594,6 @@ roi_pool .. autofunction:: paddle.fluid.layers.roi_pool :noindex: - ops === @@ -991,27 +990,93 @@ zeros .. autofunction:: paddle.fluid.layers.zeros :noindex: -topk ----- +detection +========= -.. autofunction:: paddle.fluid.layers.topk +multi_box_head +-------------- + +.. autofunction:: paddle.fluid.layers.multi_box_head :noindex: -dice_loss ----- +bipartite_match +--------------- + +.. autofunction:: paddle.fluid.layers.bipartite_match + :noindex: + +target_assign +------------- + +.. autofunction:: paddle.fluid.layers.target_assign + :noindex: + +detection_output +---------------- -.. autofunction:: paddle.fluid.layers.dice_loss +.. autofunction:: paddle.fluid.layers.detection_output :noindex: -resize_bilinear -____ +ssd_loss +-------- -.. autofunction:: paddle.fluid.layers.resize_bilinear +.. autofunction:: paddle.fluid.layers.ssd_loss :noindex: -gather -____ +detection_map +------------- + +.. autofunction:: paddle.fluid.layers.detection_map + :noindex: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +box_coder +--------- + +.. autofunction:: paddle.fluid.layers.box_coder + :noindex: + +learning_rate_scheduler +======================= + +exponential_decay +----------------- + +.. autofunction:: paddle.fluid.layers.exponential_decay + :noindex: + +natural_exp_decay +----------------- + +.. autofunction:: paddle.fluid.layers.natural_exp_decay + :noindex: + +inverse_time_decay +------------------ + +.. autofunction:: paddle.fluid.layers.inverse_time_decay + :noindex: + +polynomial_decay +---------------- + +.. autofunction:: paddle.fluid.layers.polynomial_decay + :noindex: + +piecewise_decay +--------------- + +.. autofunction:: paddle.fluid.layers.piecewise_decay + :noindex: + +noam_decay +---------- -.. autofunction:: paddle.fluid.layers.gather +.. autofunction:: paddle.fluid.layers.noam_decay :noindex: diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index df2bd2eace52e78805433bea320f5de95d45bfc7..79a0995fce303518d989693976c4e92e05795ca2 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -47,28 +47,6 @@ DecayedAdagrad :members: :noindex: -Adadelta ------------------ - -.. autoclass:: paddle.fluid.optimizer.Adadelta - :members: - :noindex: - -RMSProp ------------------ - -.. autoclass:: paddle.fluid.optimizer.RMSProp - :members: - :noindex: - -ModelAverage ------------------ - -.. autoclass:: paddle.fluid.optimizer.ModelAverage - :members: - :noindex: - - SGDOptimizer ------------ @@ -111,25 +89,24 @@ DecayedAdagradOptimizer :members: :noindex: +Adadelta +-------- -AdadeltaOptimizer ------------------ - -.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer +.. autoclass:: paddle.fluid.optimizer.Adadelta :members: :noindex: +ModelAverage +------------ -RMSPropOptimizer ------------------ - -.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer +.. autoclass:: paddle.fluid.optimizer.ModelAverage :members: :noindex: - + Optimizer --------- .. autoclass:: paddle.fluid.optimizer.Optimizer :members: :noindex: + diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md index b50f18f21df0787b9761bf0935ed7f4384ff0f98..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549 100644 --- a/doc/fluid/dev/api_doc_std_cn.md +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -1,8 +1,9 @@ # API注释撰写标准 -- [API注释模块](#API注释模块) -- [格式及示例](#格式及示例) -- [完整示例](#完整示例) +- [API注释撰写标准](#api) + - [API注释模块](#api) + - [格式及示例](#) + - [完整示例](#) ## API注释模块 @@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ## 完整示例 -fc 的完整注释见[示例](src/fc.py)。 +fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md index e57072d52fd162e92a3482aef33f99ab9394c532..f175b219750d1c765a6a111c2ec3aa732fa46175 100644 --- a/doc/fluid/dev/api_doc_std_en.md +++ b/doc/fluid/dev/api_doc_std_en.md @@ -1,8 +1,9 @@ # API Doc Standard -- [API Doc Structure](#API Doc Structure) -- [Format and Examples](#Format and Examples) -- [Complete Example](#Complete Example) +- [API Doc Standard](#api-doc-standard) + - [API Doc Structure](#api-doc-structure) + - [Format and Examples](#format-and-examples) + - [Complete Example](#complete-example) ## API Doc Structure @@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f ## Complete Example -Complete Example of fc please see [here](src/fc.py)。 +Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md new file mode 100644 index 0000000000000000000000000000000000000000..55ce63ec193948424cd0b87f13d56b9cf6154dfc --- /dev/null +++ b/doc/fluid/howto/cluster/fluid_recordio.md @@ -0,0 +1,127 @@ +# How to use RecordIO in Fluid + +If you want to use RecordIO as your training data format, you need to convert to your training data +to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some +interface to deal with the RecordIO files. + +## Generate RecordIO File + +Before start training with RecordIO files, you need to convert your training data +to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes +as follows: + +```python + reader = paddle.batch(mnist.train(), batch_size=1) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder) +``` + +The above code snippet would generate a RecordIO `./mnist.recordio` on your host. + +**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can +adjust it flexibly while reading it. + +## Use the RecordIO file in a Local Training Job + +PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file +and then you can use them as a Layer in your network configuration, the sample codes as follows: + +```python + data_file = fluid.layers.io.open_recordio_file( + filename="./mnist.recordio", + shapes=[(-1, 784),(-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int32"]) + data_file = fluid.layers.io.batch(data_file, batch_size=4) + + img, label = fluid.layers.io.read_file(data_file) + hidden = fluid.layers.fc(input=img, size=100, act='tanh') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + + fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss) + + place = fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + avg_loss_np = [] + + # train a pass + batch_id = 0 + while True: + tmp, = exe.run(fetch_list=[avg_loss]) + + avg_loss_np.append(tmp) + print(batch_id) + batch_id += 1 +``` + +## Use the RecordIO files in Distributed Training + +1. generate multiple RecordIO files + +For a distributed training job, you may have multiple trainer nodes, +and one or more RecordIO files for one trainer node, you can use the interface +`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data +into multiple RecordIO files, the sample codes as follows: + +```python + reader = paddle.batch(mnist.train(), batch_size=1) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_files( + filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder) +``` + +The above codes would generate multiple RecordIO files on your host like: + +```bash +. + \_mnist-00000.recordio + |-mnist-00001.recordio + |-mnist-00002.recordio + |-mnist-00003.recordio + |-mnist-00004.recordio +``` + +2. open multiple RecordIO files by `fluid.layers.io.open_files` + +For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes, +each trainer process reads parts of the whole training data, we usually take the following approach to make the training +data allocated by each trainer process as uniform as possiable: + +```python +def gen_train_list(file_pattern, trainers, trainer_id): + file_list = glob.glob(file_pattern) + ret_list = [] + for idx, f in enumerate(file_list): + if (idx + trainers) % trainers == trainer_id: + ret_list.append(f) + return ret_list + +trainers = int(os.getenv("TRAINERS")) +trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) +data_file = fluid.layers.io.open_files( + filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), + thread_num=1, + shapes=[(-1, 784),(-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int32"]) +img, label = fluid.layers.io.read_file(data_files) +... +``` diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md deleted file mode 120000 index db30af7f53231c687f9ad61ad961a685733cbad0..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/benchmark/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../../benchmark/cluster/README.md \ No newline at end of file diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md deleted file mode 120000 index ca963ef5f06aa0c2fe507ba7548dca8017358120..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../../../benchmark/cluster/vgg16/README.md \ No newline at end of file diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..9b55a66ded8b48f7105c05f1462839a72ab5f904 --- /dev/null +++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md @@ -0,0 +1,89 @@ +## 堆内存分析和优化 + +计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 + + +目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。 + +因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。 + +本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。 + +gperftool主要支持以下四个功能: + +- thread-caching malloc +- heap-checking using tcmalloc +- heap-profiling using tcmalloc +- CPU profiler + +Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。 + +对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 + +## 使用流程 +#### 环境 +本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 + +#### 使用流程 + +- 安装google-perftools + +``` +apt-get install libunwind-dev +apt-get install google-perftools +``` + +- 安装pprof + +``` +go get -u github.com/google/pprof +``` + +- 设置运行环境 + +``` +export PPROF_PATH=/root/gopath/bin/pprof +export PPROF_BINARY_PATH=/root/gopath/bin/pprof +export LD_PRELOAD=/usr/lib/libtcmalloc.so.4 +``` + +- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。 + +``` +# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀 +# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump,默认1GB +env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py +``` + +随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下: + +``` +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap +``` + +- 使用pprof对heap文件进行分析。分析有两种模式: + - 完整模式。会对当前heap做一个分析,显示目前分配内存一些调用路径。 + + ``` + pprof --pdf python test.log.0012.heap + ``` + 上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少,所以被忽略了,这对于分配内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。 + + ![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png) + + - Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。 + ``` + pprof --pdf --base test.log.0010.heap python test.log.1045.heap + ``` + 生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf) + + 从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。 + + ![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png) + ![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png) + diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index cdd6917239371a660d0df05bb623f0b94f8f11a3..0607748b751e9f2d606236d9e98868335379b05c 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数: - 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 ```bash -$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android ``` - 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 ```bash -$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android ``` -执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 +执行上述`docker run`命令时,容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 ## 基于Linux交叉编译环境的编译方式 本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md index 6af16fc114a2310e364023ec43cc3c64149af8f7..572063e8012efee2d2e142eb57e459e0e8c6382c 100644 --- a/doc/mobile/cross_compiling_for_android_en.md +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: ```bash -$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android ``` The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: @@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API. -The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. +The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`. diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md new file mode 100644 index 0000000000000000000000000000000000000000..6b80b014b1b1dc50f425e1296f70984c9e9b1cbd --- /dev/null +++ b/doc/survey/dynamic_graph.md @@ -0,0 +1,378 @@ +# Automatic Differentiation with the Tape + +## Automatic Differentiation + +A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf). + +## The Tape + +Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass: + +1. from the forward pass program itself, or +1. from the execution trace of the forward pass program, which is often known as the *tape*. + +This article surveys systems that follow the latter strategy. + +## Dynamic Network + +When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*. + +Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/). + +## An Overview + +Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf) + +Consider the following code feedforward model. + +```python +x = Variable(randn(20, 1))) +label = Variable(randint(1)) +W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20)) +h = matmul(W_1, x) +pred = matmul(W_2, x) +loss = softmax(pred, label) +loss.backward() +``` + +### 1) Dynet uses List to encode the Tape + +During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`. + +
+ +digraph g { + graph [ + rankdir = "LR" + ]; + node [ + fontsize = "16" + shape = "ellipse" + ]; + edge []; + "node0" [ + label = " type: matmul | input: W_1, x | output: h" + shape = "record" + ]; + "node1" [ + label = " type: matmul | input: W_2, h | output: pred" + shape = "record" + ]; + "node2" [ + label = " type: softmax | input: pred, label | output: loss" + shape = "record" + ]; + "node0":f0 -> "node1":f0 []; + "node1":f0 -> "node2":f0 []; +} +
+ +![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) + +### 2) Pytorch uses Node Graph to encode the Tape + +The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. + +
+ +digraph g { + graph [ + rankdir = "LR" + ]; + + subgraph function { + node [ + fontsize = "16" + style = filled + shape = "record" + ]; + "matmul0" [ label = " type: matmul | prev_func: None" ]; + "matmul1" [ label = " type: matmul | prev_func: matmul" ]; + "softmax" [ label = " type: softmax | prev_func: matmul" ]; + } + + subgraph variable { + node [ + fontsize = "16" + shape = "Mrecord" + style = filled + fillcolor = white + ]; + "x" [ label = " x | creator: None" ]; + "label" [ label = " label | creator: None" ]; + "W_1" [ label = " W_1 | creator: None" ]; + "W_2" [ label = " W_2 | creator: None" ]; + "h" [ label = " h | creator: None" ]; + "pred" [ label = " pred | creator: matmul" ]; + "loss" [ label = " loss | creator: softmax" ]; + } + + subgraph data_flow { + "x":f0 -> "matmul0":f0; + "W_1":f0 -> "matmul0":f0; + "matmul0":f0 -> "h":f0; + + "h":f0 -> "matmul1":f0; + "W_2":f0 -> "matmul1":f0; + "matmul1":f0 -> "pred":f0; + + "pred":f0 -> "softmax":f0; + "label":f0 -> "softmax":f0; + "softmax":f0 -> "loss":f0; + } + + subgraph prev_func { + edge [color="red", arrowsize="0.6", penwidth="1", constraint=false]; + "matmul1":f1 -> "matmul0":f0; + "softmax":f1 -> "matmul1":f0; + label = "prev_func"; + } +} +
+ +![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) + +Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix. + +## Design choices + +### 1) Dynet's List vs Pytorch's Node Graph + +What's good about List: +1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator. +1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping. + +What's good about Node Graph: +1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet. +```python +result = BigNet(data) +loss = SmallNet(data) +loss.backward() +``` + +### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation + +Dynet builds the list in a symbolic matter. Consider the following example +```python +for epoch in range(num_epochs): + for in_words, out_label in training_data: + dy.renew_cg() + W = dy.parameter(W_p) + b = dy.parameter(b_p) + score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b) + loss_sym = dy.pickneglogsoftmax(score_sym, out_label) + loss_val = loss_sym.value() + loss_sym.backward() +``` +The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion. + +Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`. + + +## What can fluid learn from them? + +Please refer to `paddle/contrib/dynamic/`. + +# Appendix + +### Overview + +| Framework | Has Tape | Core in C++ | First Release Date | +|-----------|----------|-------------|--------------------| +| Autograd | No | No | Mar 5, 2015 | +| Chainer | No | No | Jun 5, 2015 | +| Pytorch | No | Yes | Aug 31, 2016 | +| Dynet | Yes | Yes | Oct 12, 2016 | + +### Source Code +#### Autograd +[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed. +```python +# User API +def make_grad(fun, x): + start_node = VJPNode.new_root() + end_value, end_node = trace(start_node, fun, x) + return backward_pass(g, end_node), end_value + +# trace the forward pass by creating VJPNodes +def trace(start_node, fun, x): + with trace_stack.new_trace() as t: + start_box = new_box(x, t, start_node) + end_box = fun(start_box) + return end_box._value, end_box._node + +def backward_pass(g, end_node): + outgrads = {end_node : (g, False)} + for node in toposort(end_node): + outgrad = outgrads.pop(node) + ingrads = node.vjp(outgrad[0]) + for parent, ingrad in zip(node.parents, ingrads): + outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad) + return outgrad[0] + +# Every VJPNode corresponds to a op_grad +class VJPNode(Node): + __slots__ = ['parents', 'vjp'] + def __init__(self, value, fun, args, kwargs, parent_argnums, parents): + self.parents = parents + vjpmaker = primitive_vjps[fun] + self.vjp = vjpmaker(parent_argnums, value, args, kwargs) +``` +#### Chainer +Example Code +```python +# (1) Function Set definition, creates FunctionNode +model = FunctionSet( + l1=F.Linear(784, 100), + l2=F.Linear(100, 100), + l3=F.Linear(100, 10)).to_gpu() + +# (2) Optimizer Setup +opt = optimizers.SGD() +opt.setup(model) + +# (3) Forward computation +def forward(x, t): + h1 = F.relu(model.l1(x)) + h2 = F.relu(model.l2(h1)) + y = model.l3(h2) + return F.softmax_cross_entropy(y, t) + +# (4) Training loop +for epoch in xrange(n_epoch): + for i in xrange(0, N, b_size): + x = Variable(to_gpu(...)) + t = Variable(to_gpu(...)) + opt.zero_grads() + loss = forward(x, t) + loss.backward() + opt.update() +``` +In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`. +```python +class FunctionNode(object): + ... + def apply(self, inputs): + outputs = self.forward(inputs) + ret = tuple([variable.Variable(y, requires_grad=requires_grad) + for y in outputs]) + # Topological ordering + self.rank = max([x.rank for x in inputs]) if input_vars else 0 + # Add backward edges + for y in ret: + y.creator_node = self + self.inputs = tuple([x.node for x in input_vars]) + self.outputs = tuple([y.node for y in ret]) + + return ret +``` +`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order. +```python +class VariableNode(object): + ... + def backward(self, retain_grad, loss_scale): + if self.creator_node is None: + return + + cand_funcs = [] + seen_set = set() + grads = {} + + # Initialize error by 1, if this is a loss variable + if self.data.size == 1 and self._grad_var is None: + self.grad = numpy.ones_like(self.data) + grads[self._node] = self._grad_var + + def add_cand(cand): + if cand not in seen_set: + # Negate since heapq is min-heap. This is a global variable + heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) + seen_set.add(cand) + + add_cand(self.creator_node) + + while cand_funcs: + _, _, func = heapq.heappop(cand_funcs) + gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad) + + for x, gx in enumerate(gxs): + if x in grads: + grads[x] += gx + else: + grads[x] = gx + + if x.creator_node is not None: + add_cand(x.creator_node) +``` + +#### PyTorch +Example Code +```python +x = Variable(torch.ones(5, 5)) +y = Variable(torch.ones(5, 5) * 4) +z = x ** 2 + x * 2 + x * y + y +z.backward(torch.ones(5, 5)) +``` +The trace is done by `Variable.creator` and `Function.previous_functions`. +```python +class Variable(object): + def __init__(self, tensor, creator=None, requires_grad=True): + if creator is None: + creator = Leaf(self, requires_grad) + self.data = tensor + self.creator = creator + self._grad = None + + def backward(self, gradient=None): + if gradient is None: + if self.data.numel() != 1: + raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable') + gradient = self.data.new(1).fill_(1) + self._execution_engine.run_backward(self, gradient) + +class Function(obejct): + # ... + def _do_forward(self, *input): + unpacked_input = tuple(arg.data for arg in input) + raw_output = self.forward(*unpacked_input) + + # mark output.creator = self for backward trace + output = tuple(Variable(tensor, self) for tensor in raw_output) + + self.previous_functions = [(arg.creator, id(arg)) for arg in input] + self.output_ids = {id(var): i for i, var in enumerate(output)} + return output + + def _do_backward(self, grad_output): + return self.backwaerd(grad_output) +``` +The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd. + +#### DyNet +Example code +```python +model = dy.model() +W_p = model.add_parameters((20, 100)) +b_p = model.add_parameters(20) +E = model.add_lookup_parameters((20000, 50)) +for epoch in range(num_epochs): + for in_words, out_label in training_data: + dy.renew_cg() # init tape + W = dy.parameter(W_p) + b = dy.parameter(b_p) + score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b) + loss_sym = dy.pickneglogsoftmax(score_sym, out_label) + loss_val = loss_sym.value() + loss_sym.backward() +``` +[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traverse the tape in the reverse order. +```c++ +void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) { + ... + for (int i = num_nodes - 1; i >= 0; --i) { + // each node corresponds to an op + node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai); + } + ... +} +``` diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst index 9ac972fb193a2fb525edc507f7ba1303d2c8eabe..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f 100644 --- a/doc/v2/api/config/evaluators.rst +++ b/doc/v2/api/config/evaluators.rst @@ -101,7 +101,7 @@ value_printer :noindex: Detection -===== +========== detection_map ------------- diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst index 1a6496968cae1fef88142ba9ca3f9e63a81b196d..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 100644 --- a/doc/v2/api/config/layer.rst +++ b/doc/v2/api/config/layer.rst @@ -11,7 +11,7 @@ Data layer data ---- -.. autoclass:: paddle.v2.layer.data +.. autofunction:: paddle.v2.layer.data :noindex: Fully Connected Layers @@ -21,12 +21,12 @@ Fully Connected Layers fc -- -.. autoclass:: paddle.v2.layer.fc +.. autofunction:: paddle.v2.layer.fc :noindex: selective_fc ------------ -.. autoclass:: paddle.v2.layer.selective_fc +.. autofunction:: paddle.v2.layer.selective_fc :noindex: Conv Layers @@ -34,34 +34,34 @@ Conv Layers conv_operator ------------- -.. autoclass:: paddle.v2.layer.conv_operator +.. autofunction:: paddle.v2.layer.conv_operator :noindex: conv_projection --------------- -.. autoclass:: paddle.v2.layer.conv_projection +.. autofunction:: paddle.v2.layer.conv_projection :noindex: conv_shift ---------- -.. autoclass:: paddle.v2.layer.conv_shift +.. autofunction:: paddle.v2.layer.conv_shift :noindex: img_conv -------- -.. autoclass:: paddle.v2.layer.img_conv +.. autofunction:: paddle.v2.layer.img_conv :noindex: .. _api_v2.layer_context_projection: context_projection ------------------ -.. autoclass:: paddle.v2.layer.context_projection +.. autofunction:: paddle.v2.layer.context_projection :noindex: row_conv -------- -.. autoclass:: paddle.v2.layer.row_conv +.. autofunction:: paddle.v2.layer.row_conv :noindex: Image Pooling Layer @@ -69,27 +69,27 @@ Image Pooling Layer img_pool -------- -.. autoclass:: paddle.v2.layer.img_pool +.. autofunction:: paddle.v2.layer.img_pool :noindex: spp --- -.. autoclass:: paddle.v2.layer.spp +.. autofunction:: paddle.v2.layer.spp :noindex: maxout ------ -.. autoclass:: paddle.v2.layer.maxout +.. autofunction:: paddle.v2.layer.maxout :noindex: roi_pool -------- -.. autoclass:: paddle.v2.layer.roi_pool +.. autofunction:: paddle.v2.layer.roi_pool :noindex: pad ---- -.. autoclass:: paddle.v2.layer.pad +.. autofunction:: paddle.v2.layer.pad :noindex: Norm Layer @@ -97,27 +97,27 @@ Norm Layer img_cmrnorm ----------- -.. autoclass:: paddle.v2.layer.img_cmrnorm +.. autofunction:: paddle.v2.layer.img_cmrnorm :noindex: batch_norm ---------- -.. autoclass:: paddle.v2.layer.batch_norm +.. autofunction:: paddle.v2.layer.batch_norm :noindex: sum_to_one_norm --------------- -.. autoclass:: paddle.v2.layer.sum_to_one_norm +.. autofunction:: paddle.v2.layer.sum_to_one_norm :noindex: cross_channel_norm ------------------ -.. autoclass:: paddle.v2.layer.cross_channel_norm +.. autofunction:: paddle.v2.layer.cross_channel_norm :noindex: row_l2_norm ----------- -.. autoclass:: paddle.v2.layer.row_l2_norm +.. autofunction:: paddle.v2.layer.row_l2_norm :noindex: Recurrent Layers @@ -125,22 +125,22 @@ Recurrent Layers recurrent --------- -.. autoclass:: paddle.v2.layer.recurrent +.. autofunction:: paddle.v2.layer.recurrent :noindex: lstmemory --------- -.. autoclass:: paddle.v2.layer.lstmemory +.. autofunction:: paddle.v2.layer.lstmemory :noindex: grumemory --------- -.. autoclass:: paddle.v2.layer.grumemory +.. autofunction:: paddle.v2.layer.grumemory :noindex: gated_unit ----------- -.. autoclass:: paddle.v2.layer.gated_unit +.. autofunction:: paddle.v2.layer.gated_unit :noindex: Recurrent Layer Group @@ -148,32 +148,32 @@ Recurrent Layer Group memory ------ -.. autoclass:: paddle.v2.layer.memory +.. autofunction:: paddle.v2.layer.memory :noindex: recurrent_group --------------- -.. autoclass:: paddle.v2.layer.recurrent_group +.. autofunction:: paddle.v2.layer.recurrent_group :noindex: lstm_step --------- -.. autoclass:: paddle.v2.layer.lstm_step +.. autofunction:: paddle.v2.layer.lstm_step :noindex: gru_step -------- -.. autoclass:: paddle.v2.layer.gru_step +.. autofunction:: paddle.v2.layer.gru_step :noindex: beam_search ------------ -.. autoclass:: paddle.v2.layer.beam_search +.. autofunction:: paddle.v2.layer.beam_search :noindex: get_output ---------- -.. autoclass:: paddle.v2.layer.get_output +.. autofunction:: paddle.v2.layer.get_output :noindex: Mixed Layer @@ -183,54 +183,54 @@ Mixed Layer mixed ----- -.. autoclass:: paddle.v2.layer.mixed +.. autofunction:: paddle.v2.layer.mixed :noindex: .. _api_v2.layer_embedding: embedding --------- -.. autoclass:: paddle.v2.layer.embedding +.. autofunction:: paddle.v2.layer.embedding :noindex: scaling_projection ------------------ -.. autoclass:: paddle.v2.layer.scaling_projection +.. autofunction:: paddle.v2.layer.scaling_projection :noindex: dotmul_projection ----------------- -.. autoclass:: paddle.v2.layer.dotmul_projection +.. autofunction:: paddle.v2.layer.dotmul_projection :noindex: dotmul_operator --------------- -.. autoclass:: paddle.v2.layer.dotmul_operator +.. autofunction:: paddle.v2.layer.dotmul_operator :noindex: full_matrix_projection ---------------------- -.. autoclass:: paddle.v2.layer.full_matrix_projection +.. autofunction:: paddle.v2.layer.full_matrix_projection :noindex: identity_projection ------------------- -.. autoclass:: paddle.v2.layer.identity_projection +.. autofunction:: paddle.v2.layer.identity_projection :noindex: slice_projection ------------------- -.. autoclass:: paddle.v2.layer.slice_projection +.. autofunction:: paddle.v2.layer.slice_projection :noindex: table_projection ---------------- -.. autoclass:: paddle.v2.layer.table_projection +.. autofunction:: paddle.v2.layer.table_projection :noindex: trans_full_matrix_projection ---------------------------- -.. autoclass:: paddle.v2.layer.trans_full_matrix_projection +.. autofunction:: paddle.v2.layer.trans_full_matrix_projection :noindex: Aggregate Layers @@ -245,51 +245,46 @@ AggregateLevel pooling ------- -.. autoclass:: paddle.v2.layer.pooling +.. autofunction:: paddle.v2.layer.pooling :noindex: .. _api_v2.layer_last_seq: last_seq -------- -.. autoclass:: paddle.v2.layer.last_seq +.. autofunction:: paddle.v2.layer.last_seq :noindex: .. _api_v2.layer_first_seq: first_seq --------- -.. autoclass:: paddle.v2.layer.first_seq +.. autofunction:: paddle.v2.layer.first_seq :noindex: sub_seq --------- -.. autoclass:: paddle.v2.layer.sub_seq +.. autofunction:: paddle.v2.layer.sub_seq :noindex: concat ------ -.. autoclass:: paddle.v2.layer.concat +.. autofunction:: paddle.v2.layer.concat :noindex: seq_concat ---------- -.. autoclass:: paddle.v2.layer.seq_concat +.. autofunction:: paddle.v2.layer.seq_concat :noindex: seq_slice --------- -.. autoclass:: paddle.v2.layer.seq_slice - :noindex: - -kmax_sequence_score -------------------- -.. autoclass:: paddle.v2.layer.kmax_sequence_score +.. autofunction:: paddle.v2.layer.seq_slice :noindex: sub_nested_seq -------------- -.. autoclass:: paddle.v2.layer.sub_nested_seq +.. autofunction:: paddle.v2.layer.sub_nested_seq :noindex: Reshaping Layers @@ -297,7 +292,7 @@ Reshaping Layers block_expand ------------ -.. autoclass:: paddle.v2.layer.block_expand +.. autofunction:: paddle.v2.layer.block_expand :noindex: .. _api_v2.layer_expand: @@ -309,22 +304,22 @@ ExpandLevel expand ------ -.. autoclass:: paddle.v2.layer.expand +.. autofunction:: paddle.v2.layer.expand :noindex: repeat ------ -.. autoclass:: paddle.v2.layer.repeat +.. autofunction:: paddle.v2.layer.repeat :noindex: rotate ------ -.. autoclass:: paddle.v2.layer.rotate +.. autofunction:: paddle.v2.layer.rotate :noindex: seq_reshape ----------- -.. autoclass:: paddle.v2.layer.seq_reshape +.. autofunction:: paddle.v2.layer.seq_reshape :noindex: Math Layers @@ -332,94 +327,94 @@ Math Layers addto ----- -.. autoclass:: paddle.v2.layer.addto +.. autofunction:: paddle.v2.layer.addto :noindex: linear_comb ----------- -.. autoclass:: paddle.v2.layer.linear_comb +.. autofunction:: paddle.v2.layer.linear_comb :noindex: interpolation ------------- -.. autoclass:: paddle.v2.layer.interpolation +.. autofunction:: paddle.v2.layer.interpolation :noindex: bilinear_interp --------------- -.. autoclass:: paddle.v2.layer.bilinear_interp +.. autofunction:: paddle.v2.layer.bilinear_interp :noindex: dropout -------- -.. autoclass:: paddle.v2.layer.dropout +.. autofunction:: paddle.v2.layer.dropout :noindex: dot_prod --------- -.. autoclass:: paddle.v2.layer.dot_prod +.. autofunction:: paddle.v2.layer.dot_prod :noindex: out_prod -------- -.. autoclass:: paddle.v2.layer.out_prod +.. autofunction:: paddle.v2.layer.out_prod :noindex: power ----- -.. autoclass:: paddle.v2.layer.power +.. autofunction:: paddle.v2.layer.power :noindex: scaling ------- -.. autoclass:: paddle.v2.layer.scaling +.. autofunction:: paddle.v2.layer.scaling :noindex: clip ---- -.. autoclass:: paddle.v2.layer.clip +.. autofunction:: paddle.v2.layer.clip :noindex: resize ------ -.. autoclass:: paddle.v2.layer.resize +.. autofunction:: paddle.v2.layer.resize :noindex: slope_intercept --------------- -.. autoclass:: paddle.v2.layer.slope_intercept +.. autofunction:: paddle.v2.layer.slope_intercept :noindex: tensor ------ -.. autoclass:: paddle.v2.layer.tensor +.. autofunction:: paddle.v2.layer.tensor :noindex: .. _api_v2.layer_cos_sim: cos_sim ------- -.. autoclass:: paddle.v2.layer.cos_sim +.. autofunction:: paddle.v2.layer.cos_sim :noindex: l2_distance ----------- -.. autoclass:: paddle.v2.layer.l2_distance +.. autofunction:: paddle.v2.layer.l2_distance :noindex: trans ----- -.. autoclass:: paddle.v2.layer.trans +.. autofunction:: paddle.v2.layer.trans :noindex: scale_shift ----------- -.. autoclass:: paddle.v2.layer.scale_shift +.. autofunction:: paddle.v2.layer.scale_shift :noindex: factorization_machine --------------------- -.. autoclass:: paddle.v2.layer.factorization_machine +.. autofunction:: paddle.v2.layer.factorization_machine :noindex: Sampling Layers @@ -427,17 +422,17 @@ Sampling Layers maxid ----- -.. autoclass:: paddle.v2.layer.max_id +.. autofunction:: paddle.v2.layer.max_id :noindex: sampling_id ----------- -.. autoclass:: paddle.v2.layer.sampling_id +.. autofunction:: paddle.v2.layer.sampling_id :noindex: multiplex --------- -.. autoclass:: paddle.v2.layer.multiplex +.. autofunction:: paddle.v2.layer.multiplex :noindex: .. _api_v2.layer_costs: @@ -447,97 +442,97 @@ Cost Layers cross_entropy_cost ------------------ -.. autoclass:: paddle.v2.layer.cross_entropy_cost +.. autofunction:: paddle.v2.layer.cross_entropy_cost :noindex: cross_entropy_with_selfnorm_cost -------------------------------- -.. autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost +.. autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost :noindex: multi_binary_label_cross_entropy_cost ------------------------------------- -.. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost +.. autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: classification_cost ------------------- -.. autoclass:: paddle.v2.layer.classification_cost +.. autofunction:: paddle.v2.layer.classification_cost :noindex: huber_regression_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_regression_cost +.. autofunction:: paddle.v2.layer.huber_regression_cost :noindex: huber_classification_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_classification_cost +.. autofunction:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost ----------- -.. autoclass:: paddle.v2.layer.lambda_cost +.. autofunction:: paddle.v2.layer.lambda_cost :noindex: square_error_cost ----------------- -.. autoclass:: paddle.v2.layer.square_error_cost +.. autofunction:: paddle.v2.layer.square_error_cost :noindex: rank_cost --------- -.. autoclass:: paddle.v2.layer.rank_cost +.. autofunction:: paddle.v2.layer.rank_cost :noindex: sum_cost --------- -.. autoclass:: paddle.v2.layer.sum_cost +.. autofunction:: paddle.v2.layer.sum_cost :noindex: crf --- -.. autoclass:: paddle.v2.layer.crf +.. autofunction:: paddle.v2.layer.crf :noindex: crf_decoding ------------ -.. autoclass:: paddle.v2.layer.crf_decoding +.. autofunction:: paddle.v2.layer.crf_decoding :noindex: ctc --- -.. autoclass:: paddle.v2.layer.ctc +.. autofunction:: paddle.v2.layer.ctc :noindex: warp_ctc -------- -.. autoclass:: paddle.v2.layer.warp_ctc +.. autofunction:: paddle.v2.layer.warp_ctc :noindex: nce --- -.. autoclass:: paddle.v2.layer.nce +.. autofunction:: paddle.v2.layer.nce :noindex: hsigmoid --------- -.. autoclass:: paddle.v2.layer.hsigmoid +.. autofunction:: paddle.v2.layer.hsigmoid :noindex: smooth_l1_cost -------------- -.. autoclass:: paddle.v2.layer.smooth_l1_cost +.. autofunction:: paddle.v2.layer.smooth_l1_cost :noindex: multibox_loss -------------- -.. autoclass:: paddle.v2.layer.multibox_loss +.. autofunction:: paddle.v2.layer.multibox_loss :noindex: detection_output ---------------- -.. autoclass:: paddle.v2.layer.detection_output +.. autofunction:: paddle.v2.layer.detection_output :noindex: Check Layer @@ -545,7 +540,7 @@ Check Layer eos --- -.. autoclass:: paddle.v2.layer.eos +.. autofunction:: paddle.v2.layer.eos :noindex: Activation @@ -553,5 +548,5 @@ Activation prelu -------- -.. autoclass:: paddle.v2.layer.prelu +.. autofunction:: paddle.v2.layer.prelu :noindex: diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst index b11cd449affd1dcd9d3f42492961469331350942..70c5c524aaf0a9ae003bf4340c3f268c225d4419 100644 --- a/doc/v2/api/index_en.rst +++ b/doc/v2/api/index_en.rst @@ -8,4 +8,3 @@ API model_configs.rst data.rst run_logic.rst - fluid/index.rst diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index 741c01ce5428c0046daa5a784da70d4bb492438c..6421c5308271c2508597d849c79709255caf349a 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -23,7 +23,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 在 `这里 `__ 找到 paddle_manylinux_devel 镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 -如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。 编译PaddlePaddle,需要执行: @@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 学习 Docker 有多难? - 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - 我可以用 IDE 吗? @@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 可以并行编译吗? - 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - Docker 需要 sudo @@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 在 Windows/MacOS 上编译很慢 - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。 + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 - 磁盘不够 - 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。 + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 .. _compile_deps: @@ -195,7 +195,7 @@ BLAS PaddlePaddle支持 `MKL `_ 和 `OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, -还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 +还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 如果关闭MKL,则会使用OpenBLAS作为BLAS库。 @@ -211,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行, 编译选项的设置 ++++++++++++++ -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 .. code-block:: bash diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst index b06c43e19dcfc52ad0f074a85517a16744895a3a..b08b45d43ec7f1deb2889832079a731ee724a44c 100644 --- a/doc/v2/build_and_install/build_from_source_en.rst +++ b/doc/v2/build_and_install/build_from_source_en.rst @@ -11,7 +11,7 @@ To build PaddlePaddle, you need 1. A computer -- Linux, Windows, MacOS. 2. Docker. -Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image. .. _build_step: @@ -26,6 +26,8 @@ you can also find how to build and use paddle_manylinux_devel Docker image from `here `__ Or you can build your own image from source as the optional step below: +If you don't wish to use docker,you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation. + .. code-block:: bash # 1. clone the source code @@ -108,7 +110,7 @@ Frequently Asked Questions - How difficult is it to learn Docker? - It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. + It takes you ten minutes to read `an introductory article `_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. - Can I use my favorite IDE? @@ -125,7 +127,7 @@ Frequently Asked Questions - Does Docker do parallel building? - Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. + Our building Docker image runs a `Bash script `_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. - Docker requires sudo @@ -133,11 +135,11 @@ Frequently Asked Questions - Docker on Windows/MacOS builds slowly - On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details. + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to `this issue `_ for details. - Not enough disk space - Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/). + Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to `this article `_ . .. _compile_deps: diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 853bdb21bbcf07ae1742d2196dbcfe4668828b7b..095da19cd41d29bfa72ab23abd24bec45f925a86 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版 "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index fecf6d3712feac3265100a6121901ba784f7d5cc..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt index 4b19256ef4533a09162edf907f6cd51146517e46..70e3a0583d8ecf9db19a85c0978aae0ce0625570 100644 --- a/paddle/contrib/CMakeLists.txt +++ b/paddle/contrib/CMakeLists.txt @@ -14,3 +14,4 @@ # add_subdirectory(inference) +add_subdirectory(tape) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 8ca34465395761cab9cbde4bfbcf32edc1c4a1d1..0f56d648b1939e1d6af3368bb2423477a3b638fc 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -17,6 +17,9 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) + +set(inference_deps paddle_inference_api paddle_fluid_api) + function(inference_api_test TARGET_NAME) if (WITH_TESTING) set(options "") @@ -27,7 +30,7 @@ function(inference_api_test TARGET_NAME) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) cc_test(${TARGET_NAME} SRCS ${TARGET_NAME}.cc - DEPS paddle_fluid paddle_inference_api + DEPS "${inference_deps}" ARGS --dirname=${PYTHON_TESTS_DIR}/book/) if(inference_test_ARGS) set_tests_properties(${TARGET_NAME} @@ -47,6 +50,19 @@ cc_test(test_paddle_inference_api inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) +if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI + # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, + # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to + # compile the libinference_anakin_api.a and compile with anakin.so. + nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_link_libraries(inference_anakin_api anakin anakin_saber_common) + cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api) + target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) +endif() + if(WITH_TESTING) add_subdirectory(demo) endif() diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 165d2e196b3d544f540cf72d61c6f9d0dfa62977..192a6414260ce06048b8c765402d89882cabc51b 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include #include +#include #include "paddle/contrib/inference/paddle_inference_api.h" - namespace paddle { namespace demo { @@ -54,18 +54,75 @@ void Main(bool use_gpu) { CHECK(predictor->Run(slots, &outputs)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs.size(), 1UL); LOG(INFO) << "output buffer size: " << outputs.front().data.length; const size_t num_elements = outputs.front().data.length / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << static_cast(outputs.front().data.data)[i]; } + // TODO(Superjomn): this is should be free automatically + free(outputs[0].data.data); + } +} + +void MainThreads(int num_threads, bool use_gpu) { + // Multi-threads only support on CPU + // 0. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto main_predictor = + CreatePaddlePredictor(config); + + std::vector threads; + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // 1. clone a predictor which shares the same parameters + auto predictor = main_predictor->Clone(); + constexpr int num_batches = 3; + for (int batch_id = 0; batch_id < num_batches; ++batch_id) { + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = buf, + .dtype = PaddleDType::INT64}; + std::vector inputs(4, tensor); + std::vector outputs; + // 3. Run + CHECK(predictor->Run(inputs, &outputs)); + + // 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "TID: " << tid << ", " + << "output buffer size: " << outputs.front().data.length; + const size_t num_elements = outputs.front().data.length / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data)[i]; + } + free(outputs[0].data.data); + } + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); } } TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } + +#ifdef PADDLE_WITH_CUDA TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } +#endif } // namespace demo } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 5fe8399762bba69bc99ed9ae694db32f532ed953..77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ /* * This file contains the definition of a simple Inference API for Paddle. @@ -47,8 +47,8 @@ struct PaddleTensor { enum class PaddleEngineKind { kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. // TODO(Superjomn) support following engines latter. - // kAnakin, // Use Anakin for inference. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. @@ -63,6 +63,7 @@ class PaddlePredictor { struct Config; PaddlePredictor() = default; PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; // Predict an record. // The caller should be responsible for allocating and releasing the memory of @@ -76,7 +77,7 @@ class PaddlePredictor { virtual std::unique_ptr Clone() = 0; // Destroy the Predictor. - virtual ~PaddlePredictor() {} + virtual ~PaddlePredictor() = default; // The common configs for all the predictors. struct Config { @@ -95,6 +96,13 @@ struct NativeConfig : public PaddlePredictor::Config { std::string param_file; }; +// Configurations for Anakin engine. +struct AnakinConfig : public PaddlePredictor::Config { + int device; + std::string model_file; + int max_batch_size{-1}; +}; + // A factory to help create different predictors. // // FOR EXTENSION DEVELOPER: @@ -105,5 +113,4 @@ struct NativeConfig : public PaddlePredictor::Config { // Similarly, each engine kind should map to a unique predictor implementation. template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..5bafc58fa53f7d99de571f66b6224f0f2de66e32 --- /dev/null +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h" +#include + +namespace paddle { + +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( + const AnakinConfig &config) { + CHECK(Init(config)); +} + +bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { + if (!(graph_.load(config.model_file))) { + return false; + } + graph_.ResetBatchSize("input_0", config.max_batch_size); + // optimization for graph + if (!(graph_.Optimize())) { + return false; + } + // construct executer + executor_.init(graph_); + return true; +} + +bool PaddleInferenceAnakinPredictor::Run( + const std::vector &inputs, + std::vector *output_data) { + for (const auto &input : inputs) { + if (input.dtype != PaddleDType::FLOAT32) { + LOG(ERROR) << "Only support float type inputs. " << input.name + << "'s type is not float"; + return false; + } + auto d_tensor_in_p = executor_.get_in(input.name); + float *d_data_p = d_tensor_in_p->mutable_data(); + if (cudaMemcpy(d_data_p, + static_cast(input.data.data), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + LOG(ERROR) << "copy data from CPU to GPU error"; + return false; + } + } + + executor_.prediction(); + + if (output_data->empty()) { + LOG(ERROR) << "At least one output should be set with tensors' names."; + return false; + } + for (auto &output : *output_data) { + auto *tensor = executor_.get_out(output.name); + output.shape = tensor->shape(); + // Copy data from GPU -> CPU + if (cudaMemcpy(output.data.data, + tensor->mutable_data(), + tensor->valid_size() * sizeof(float), + cudaMemcpyDeviceToHost) != 0) { + LOG(ERROR) << "copy data from GPU to CPU error"; + return false; + } + } + return true; +} + +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return executor_; +} + +// the cloned new Predictor of anakin share the same net weights from original +// Predictor +std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { + VLOG(3) << "Anakin Predictor::clone"; + std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + // construct executer from other graph + auto anakin_predictor_p = + dynamic_cast(cls.get()); + if (!anakin_predictor_p) { + LOG(ERROR) << "fail to call Init"; + return nullptr; + } + anakin_predictor_p->get_executer().init(graph_); + + return std::move(cls); +} + +// A factory to help create difference predictor. +template <> +std::unique_ptr +CreatePaddlePredictor( + const AnakinConfig &config) { + VLOG(3) << "Anakin Predictor create."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; +}; + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..212ba41cdf8ff2feccb6b6498f9679d76a2efe7c --- /dev/null +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the implementation of inference API with Anakin engine + * embeded, this API can only support Anakin models. + */ + +#pragma once + +#include "paddle/contrib/inference/paddle_inference_api.h" + +// from anakin +#include "framework/core/net/net.h" +#include "saber/saber_types.h" + +namespace paddle { + +class PaddleInferenceAnakinPredictor : public PaddlePredictor { + public: + PaddleInferenceAnakinPredictor() {} + + PaddleInferenceAnakinPredictor(const AnakinConfig& config); + + // NOTE Unlike the native engine, the buffers of anakin engine's output_data + // should be allocated first. + bool Run(const std::vector& inputs, + std::vector* output_data) override; + + std::unique_ptr Clone() override; + + anakin::Net& + get_executer(); + + ~PaddleInferenceAnakinPredictor() override{}; + + private: + bool Init(const AnakinConfig& config); + + anakin::graph::Graph + graph_; + anakin::Net + executor_; + AnakinConfig config_; +}; + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d41a5c73e75723f8614d810eae09ed8cdc8cf2b --- /dev/null +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/contrib/inference/paddle_inference_api.h" + +DEFINE_string(model, "", "Directory of the inference model."); + +namespace paddle { + +AnakinConfig GetConfig() { + AnakinConfig config; + config.model_file = FLAGS_model; + config.device = 0; + config.max_batch_size = 1; + return config; +} + +TEST(inference, anakin) { + AnakinConfig config = GetConfig(); + auto predictor = + CreatePaddlePredictor(config); + + float data[1 * 3 * 224 * 224] = {1.0f}; + + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "input_0", + .shape = std::vector({1, 3, 224, 224}), + .data = buf, + .dtype = PaddleDType::FLOAT32}; + + // For simplicity, we set all the slots with the same data. + std::vector paddle_tensor_feeds(1, tensor); + + float data_out[1000]; + + PaddleBuf buf_out{.data = data_out, .length = sizeof(data)}; + PaddleTensor tensor_out{.name = "prob_out", + .shape = std::vector({1000, 1}), + .data = buf_out, + .dtype = PaddleDType::FLOAT32}; + + std::vector outputs(1, tensor_out); + + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); + + float* data_o = static_cast(outputs[0].data.data); + for (size_t j = 0; j < 1000; ++j) { + LOG(INFO) << "output[" << j << "]: " << data_o[j]; + } +} + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index e7a8fa68b7fa84e246c0860dcb6b5528eb155a66..bda2981a14482e2c4a29773d37b074506cc344b1 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include @@ -54,7 +54,8 @@ std::string num2str(T a) { } } // namespace -bool NativePaddlePredictor::Init() { +bool NativePaddlePredictor::Init( + std::shared_ptr parent_scope) { VLOG(3) << "Predictor::init()"; if (config_.use_gpu) { @@ -62,9 +63,15 @@ bool NativePaddlePredictor::Init() { } else { place_ = paddle::platform::CPUPlace(); } - paddle::framework::InitDevices(false); + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + executor_.reset(new paddle::framework::Executor(place_)); - scope_.reset(new paddle::framework::Scope()); // Initialize the inference program if (!config_.model_dir.empty()) { @@ -83,13 +90,8 @@ bool NativePaddlePredictor::Init() { return false; } ctx_ = executor_->Prepare(*inference_program_, 0); - - // Create temporary variables first, so that the first batch do not need to - // create variables in the runtime. This is the logics of the old inference - // API. - // TODO(Superjomn) this should be modified when `Clone` is valid for - // multi-thread application. - executor_->CreateVariables(*inference_program_, scope_.get(), 0); + executor_->CreateVariables( + *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); // Get the feed_target_names and fetch_target_names feed_target_names_ = inference_program_->GetFeedTargetNames(); @@ -97,6 +99,13 @@ bool NativePaddlePredictor::Init() { return true; } +NativePaddlePredictor::~NativePaddlePredictor() { + if (sub_scope_) { + PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!"); + scope_->DeleteScope(sub_scope_); + } +}; + bool NativePaddlePredictor::Run(const std::vector &inputs, std::vector *output_data) { VLOG(3) << "Predictor::predict"; @@ -121,11 +130,12 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, } // Run the inference program // if share variables, we need not create variables - executor_->RunPreparedContext(ctx_.get(), - scope_.get(), - &feed_targets, - &fetch_targets, - false /* don't create variable eatch time */); + executor_->RunPreparedContext( + ctx_.get(), + sub_scope_ != nullptr ? sub_scope_ : scope_.get(), + &feed_targets, + &fetch_targets, + false /* don't create variable eatch time */); if (!GetFetch(fetchs, output_data)) { LOG(ERROR) << "fail to get fetchs"; return false; @@ -138,7 +148,7 @@ std::unique_ptr NativePaddlePredictor::Clone() { VLOG(3) << "Predictor::clone"; std::unique_ptr cls(new NativePaddlePredictor(config_)); - if (!dynamic_cast(cls.get())->Init()) { + if (!dynamic_cast(cls.get())->Init(scope_)) { LOG(ERROR) << "fail to call Init"; return nullptr; } @@ -266,7 +276,7 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new NativePaddlePredictor(config)); - if (!dynamic_cast(predictor.get())->Init()) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return std::move(predictor); diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h index 84707e223d7aa3d1ebca933923e932b3973613ae..86d1db7bcc7567e104cd20c9f767ed4513f611f5 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/contrib/inference/paddle_inference_api_impl.h @@ -34,14 +34,15 @@ class NativePaddlePredictor : public PaddlePredictor { explicit NativePaddlePredictor(const NativeConfig &config) : config_(config) {} - bool Init(); + // will only create sub scope if have global scope + bool Init(std::shared_ptr parent_scope); bool Run(const std::vector &inputs, std::vector *output_data) override; std::unique_ptr Clone() override; - ~NativePaddlePredictor() override{}; + ~NativePaddlePredictor() override; private: bool SetFeed(const std::vector &input_datas, @@ -52,11 +53,13 @@ class NativePaddlePredictor : public PaddlePredictor { NativeConfig config_; platform::Place place_; std::unique_ptr executor_; - std::unique_ptr scope_; + std::shared_ptr scope_; std::unique_ptr ctx_; std::unique_ptr inference_program_; std::vector feed_target_names_; std::vector fetch_target_names_; + // Do not use unique_ptr, use parent scope to delete + framework::Scope *sub_scope_{nullptr}; }; } // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 1f960677163988be6f4c502738861bf86588f406..5d843010e02b09087e6b328428e80fb40eb5bb97 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include + #include "gflags/gflags.h" #include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -45,14 +47,19 @@ NativeConfig GetConfig() { config.model_dir = FLAGS_dirname + "word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; +#ifdef PADDLE_WITH_CUDA config.use_gpu = true; +#else + config.use_gpu = false; +#endif config.device = 0; return config; } -TEST(paddle_inference_api_impl, word2vec) { +void MainWord2Vec(bool use_gpu) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor(config); + config.use_gpu = use_gpu; framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -100,11 +107,11 @@ TEST(paddle_inference_api_impl, word2vec) { free(outputs[0].data.data); } -TEST(paddle_inference_api_impl, image_classification) { +void MainImageClassification(bool use_gpu) { int batch_size = 2; - bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -126,12 +133,8 @@ TEST(paddle_inference_api_impl, image_classification) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - TestInference(config.model_dir, - cpu_feeds, - cpu_fetchs1, - repeat, - is_combined, - use_mkldnn); + TestInference( + config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined); auto predictor = CreatePaddlePredictor(config); std::vector paddle_tensor_feeds; @@ -149,4 +152,143 @@ TEST(paddle_inference_api_impl, image_classification) { free(data); } +void MainThreadsWord2Vec(bool use_gpu) { + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + auto main_predictor = CreatePaddlePredictor(config); + + // prepare inputs data and reference results + constexpr int num_jobs = 3; + std::vector> jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // each job has 4 words + jobs[i].resize(4); + for (size_t j = 0; j < 4; ++j) { + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + SetupLoDTensor(&jobs[i][j], lod, static_cast(0), dict_size - 1); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j])); + } + + // get reference result of each job + std::vector ref_feeds; + std::vector ref_fetches(1, &refs[i]); + for (auto& word : jobs[i]) { + ref_feeds.push_back(&word); + } + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs range + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast(local_outputs[0].data.data); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + // check outputs correctness + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +void MainThreadsImageClassification(bool use_gpu) { + constexpr int num_jobs = 4; // each job run 1 batch + constexpr int batch_size = 1; + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + config.model_dir = + FLAGS_dirname + "image_classification_resnet.inference.model"; + + auto main_predictor = CreatePaddlePredictor(config); + std::vector jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // prepare inputs + std::vector> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor(&jobs[i], input_dims, 0.f, 1.f); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); + + // get reference result of each job + std::vector ref_feeds(1, &jobs[i]); + std::vector ref_fetches(1, &refs[i]); + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs correctness + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast(local_outputs[0].data.data); + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu_threads) { + MainThreadsWord2Vec(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu) { + MainThreadsImageClassification(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu_threads) { + MainThreadsImageClassification(false /*use_gpu*/); +} + +#ifdef PADDLE_WITH_CUDA +TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +TEST(inference_api_native, word2vec_gpu_threads) { + MainThreadsWord2Vec(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu) { + MainThreadsImageClassification(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu_threads) { + MainThreadsImageClassification(true /*use_gpu*/); +} + +#endif + } // namespace paddle diff --git a/paddle/contrib/tape/CMakeLists.txt b/paddle/contrib/tape/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5450359d859de93ca19c56422f1243c7f445aff7 --- /dev/null +++ b/paddle/contrib/tape/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if(APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") +endif(APPLE) + +cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES} device_context framework_proto proto_desc operator) +cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable) + +cc_test(test_tape + SRCS test_tape.cc + DEPS tape tape_variable) diff --git a/paddle/contrib/tape/README.md b/paddle/contrib/tape/README.md new file mode 100644 index 0000000000000000000000000000000000000000..16c22a45d59664e44c83923371c0f0d957a8ca7f --- /dev/null +++ b/paddle/contrib/tape/README.md @@ -0,0 +1,252 @@ +# Dynamic Graph on Fluid + +PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very +challenging and we are still way from there. DyNet and PyTorch provide a good design +idea, the *tape*, that significantly eases the challenge. Also, DyNet provides +a C++ API that is as convenient as Python but with higher efficiency and could +conveniently integrate with industrial/production systems. This package, `tape`, +combines the good of + +1. tape from PyTorch and DyNet +2. C++ API and core from DyNet +3. rich set of operators from PaddlePaddle + +## Overview + +We can implement Dynet-like Tape(See this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md)) +by wrapping Paddle Fluid's `Operator` and `Variable`. + +The user API is straight forward since + +1. it is imperative. And it uses host language's control flow logic. +1. it avoids extra concepts such as `Scope` and `Executor`. + +All of these benefits come at the cost of just adding one line `reset_global_tape` +at every iteration. + +## Code Structure + +In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its +`type`, the pointers to the `Variable`s, and necessary attributes. + +```c++ +class Variable { +public: + VriableHandle Grad(); // returns its gradient variable +private: + framework::VarDesc desc_; // compile time infershape, necessary for lazy execution + framework::Variable var_; // run time variable, holds data memory +}; + +using VariableHandle = shared_ptr; + +struct OpHandle { + string type_; + map> inputs_; + map> outputs_; + AttributeMap attrs_; +}; + +class Tape { +public: + void AddOp(OpHandle); // add op + void Forward(); // execute the tape_ + void Backward(); // execute the backward of the tape_ +private: + vector tape_; +}; +``` + +We uses `Function` to indicate layers. It takes care of parameter +initialization and `AddOp` to the Tape when it is called. + +```c++ +class Linear { + public: + Linear(int in_dim, int out_dim, const std::string &act) + : w_(new Variable("LinearWeight")), + b_(new Variable("LinearBias")), + act_(act) { + Tape init_tape; + + std::string initializer = "fill_constant"; + framework::AttributeMap attrs; + attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{in_dim, out_dim}; + attrs["value"] = 1.0f; + init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs); + + attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{out_dim}; + attrs["value"] = 1.0f; + init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs); + + init_tape.Forward(); + } + + VariableHandle operator()(VariableHandle input) { + VariableHandle pre_bias(new Variable("linear")); + get_global_tape().AddOp("mul", + {{"X", {input}}, {"Y", {w_}}}, + {{"Out", {pre_bias}}}, + {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}}); + VariableHandle pre_act(new Variable("linear")); + get_global_tape().AddOp("elementwise_add", + {{"X", {pre_bias}}, {"Y", {b_}}}, + {{"Out", {pre_act}}}, + {{"axis", 1}}); + VariableHandle post_act(new Variable("linear")); + get_global_tape().AddOp(act_, + {{"X", {pre_act}}}, + {{"Out", {post_act}}}, + {}); + return post_act; + } + + std::vector Params() { return {w_, b_}; } + + private: + VariableHandle w_; + VariableHandle b_; + std::string act_; +}; +``` + +## User API + +```c++ +// Model function +paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias +paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias +paddle::tape::Mean mean; + +// Optimizer +paddle::tape::SGD sgd(0.001); + +// Data Feeder +paddle::tape::Fill data_feeder(...); +VariableHandle input(new paddle::tape::Variable("input")); +VariableHandle label(new paddle::tape::Variable("label")); + +for (int i = 0; i < 2; ++i) { + reset_global_tape(); + + data_feeder(input, label); + + auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType + LOG(INFO) << loss.value(); // Run forward up to loss + + // Run backward, store gradient of w at w->Grad() + get_global_tape.Backward(loss); + + // Update w + sgd(linear1.Params()); + sgd(linear2.Params()); +} +``` + +
+ +digraph G { + + subgraph cluster_0 { + node [shape=record,style=filled]; + style=filled; + color=lightgrey; + linear1 [label="{type: mul | {input | {X: before_mul1 | Y: weight1}} | {output | Out: before_bias1}}"]; + elementwise_add1 [label="{type: elementwise_add | {input | {X: before_bias1 | Y: bias1}} | {output | Out: before_act1}}"]; + relu1 [label="{type: relu | {input | {X: before_act1 }} | {output | Out: after_act1}}"]; + + linear1 -> elementwise_add1->relu1; + label = "forward tape"; + } + + linear1:before_mul1->before_mul1 + linear1:weight1->weight1 + linear1:before_bias1->before_bias1 + + elementwise_add1:bias1->bias1 + elementwise_add1:before_bias1->before_bias1 + elementwise_add1:before_act1->before_act1 + + relu1:before_act1->before_act1 + relu1:after_act1->after_act1 + + subgraph cluster_1 { + node [shape=record,style=filled]; + style=filled; + color=lightgrey; + linear1_grad [label="{type: mul_grad | {input | {X: before_mul1 | Y: weight1| Out_grad: before_bias1_grad}} | {output |{X_grad: before_mul1_grad | Y_grad: weight1_grad}}}"]; + + elementwise_add1_grad [label="{type: elementwise_add_grad | {input | Out_grad: before_act1_grad} | {output |{X_grad: before_bias1_grad | Y_grad: bias1_grad}}}"]; + + relu1_grad [label="{type: relu_grad | {input | Out_grad: after_act1_grad} | {ouput | {X_grad: before_act1_grad }}}"]; + + linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back]; + label = "backward tape"; + } + + relu1_grad:after_act1_grad->after_act1_grad + relu1_grad:before_act1_grad->before_act1_grad + + elementwise_add1_grad:before_act1_grad->before_act1_grad + elementwise_add1_grad:before_bias1_grad->before_bias1_grad + elementwise_add1_grad:bias1_grad->bias1_grad + + linear1_grad:before_mul1->before_mul1 + linear1_grad:weight1->weight1 + linear1_grad:before_bias1_grad->before_bias1_grad + linear1_grad:before_mul1_grad->before_mul1_grad + linear1_grad:weight1_grad->weight1_grad + + + subgraph cluster_2 { + node [shape=record]; + label = "Linear1"; + weight1 + bias1 + } + + weight1 -> weight1_grad [ label="Grad()", style="dashed" ]; + bias1 -> bias1_grad [ label="Grad()", style="dashed"]; + + + +} +
+ +![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png) + +## Code Reuse + +We want to stay close to Paddle Fluid as much as possible. + +### Reuse All Operators + +As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function` +is about 10 lines of code, similar to expose an operator to Python. + +### Reuse Compile Time InferShape and InferVarType + +Note that all the symbolic information is stored at `tape::Varaible::desc_`, instead +of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and +`InferVarType` every time we `AddOp` to the tape. + +### Reuse Operator::Run + +We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary +`Scope` for every `Operator::Run()`. + +## Possible Feature + +### Release Memory on Backward + +We can release memory aggressively. During backward, we can delete the OpHandle once +we have finished its backward. Since all the variable is managed by smart pointer, the +memory is automatically released when its `ref_count` goes to 0. + +### Kernel Fusion + +As a symbolic representation of the Tape is constructed first before the actual +execution, it would be possible to perform graph optimization. One use case is kernel +fusion. diff --git a/paddle/contrib/tape/computation_graph.png b/paddle/contrib/tape/computation_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..6cf5ead735d5d18b204b079771e53d44483cf016 Binary files /dev/null and b/paddle/contrib/tape/computation_graph.png differ diff --git a/paddle/contrib/tape/function.h b/paddle/contrib/tape/function.h new file mode 100644 index 0000000000000000000000000000000000000000..8c9694d9a21b5948361164eab60a663ec4fd3803 --- /dev/null +++ b/paddle/contrib/tape/function.h @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/contrib/tape/tape.h" +#include "paddle/contrib/tape/variable.h" +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace tape { + +class Function {}; + +class Fill { + public: + Fill(const std::string &initializer, const framework::AttributeMap &attrs) + : initializer_(initializer), attrs_(attrs) {} + + void operator()(VariableHandle var) { + get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_); + } + + private: + const std::string initializer_; + const framework::AttributeMap attrs_; +}; + +class Mean { + public: + VariableHandle operator()(VariableHandle var) { + VariableHandle out(new Variable("mean")); + get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {}); + return out; + } +}; + +class Linear { + public: + Linear(int in_dim, int out_dim, const std::string &act) + : w_(new Variable("LinearWeight")), + b_(new Variable("LinearBias")), + act_(act) { + Tape init_tape; + + std::string initializer = "fill_constant"; + framework::AttributeMap attrs; + attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{in_dim, out_dim}; + attrs["value"] = 1.0f; + init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs); + + attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{out_dim}; + attrs["value"] = 1.0f; + init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs); + + init_tape.Forward(); + } + + VariableHandle operator()(VariableHandle input) { + VariableHandle pre_bias(new Variable("linear")); + get_global_tape().AddOp("mul", + {{"X", {input}}, {"Y", {w_}}}, + {{"Out", {pre_bias}}}, + {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}}); + VariableHandle pre_act(new Variable("linear")); + get_global_tape().AddOp("elementwise_add", + {{"X", {pre_bias}}, {"Y", {b_}}}, + {{"Out", {pre_act}}}, + {{"axis", 1}}); + VariableHandle post_act(new Variable("linear")); + get_global_tape().AddOp( + act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {}); + return post_act; + } + + std::vector Params() { return {w_, b_}; } + + private: + VariableHandle w_; + VariableHandle b_; + std::string act_; +}; + +class SGD { + public: + SGD(float learning_rate) : learning_rate_(new Variable("sgd")) { + Tape init_tape; + + std::string initializer = "fill_constant"; + framework::AttributeMap attrs; + attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{1}; + attrs["value"] = learning_rate; + init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs); + + init_tape.Forward(); + } + + void operator()(VariableHandle input) { + PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(), + "optimization must happen after the backward"); + Tape temp_tape; + temp_tape.AddOp("sgd", + {{"Param", {input}}, + {"LearningRate", {learning_rate_}}, + {"Grad", {input->Grad()}}}, + {{"ParamOut", {input}}}, + {}); + temp_tape.Forward(); + } + + private: + VariableHandle learning_rate_; +}; +} +} diff --git a/paddle/contrib/tape/tape.cc b/paddle/contrib/tape/tape.cc new file mode 100644 index 0000000000000000000000000000000000000000..531499b6fe02abf200b7d4401494fd6350646622 --- /dev/null +++ b/paddle/contrib/tape/tape.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/contrib/tape/tape.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace tape { + +// borrowed from +// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c +inline bool ends_with(std::string const &value, std::string const &ending) { + if (ending.size() > value.size()) return false; + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); +} + +std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) { + os << var_desc.Name(); + os << "[" << var_desc.GetType() << "]"; + os << "[" << var_desc.GetDataType() << "]"; + os << "{"; + for (auto &i : var_desc.GetShape()) { + os << i << ","; + } + os << "}"; + return os; +} + +std::string to_string(const std::string &type, + const VariableHandleMap &in_vars, + const VariableHandleMap &out_vars, + const framework::AttributeMap &attrs) { + std::stringstream ss; + ss << type << " "; + for (auto ¶m_name : in_vars) { + for (auto &var : param_name.second) { + ss << param_name.first << ":(" << var->Desc() << ") "; + } + } + for (auto ¶m_name : out_vars) { + for (auto &var : param_name.second) { + ss << param_name.first << ":(" << var->Desc() << ") "; + } + } + return ss.str(); +} + +framework::OpDesc CreateOpDesc(const std::string &type, + const VariableHandleMap &in_vars, + const VariableHandleMap &out_vars, + const framework::AttributeMap &attrs) { + framework::VariableNameMap inputs; + for (auto ¶m_name : in_vars) { + for (auto &var : param_name.second) { + inputs[param_name.first].emplace_back(var->Name()); + } + } + framework::VariableNameMap outputs; + for (auto ¶m_name : out_vars) { + for (auto &var : param_name.second) { + outputs[param_name.first].emplace_back(var->Name()); + } + } + return framework::OpDesc(type, inputs, outputs, attrs); +} + +void InferShapeAndVarType(const std::string &type, + const VariableHandleMap &in_vars, + VariableHandleMap *out_vars, + const framework::AttributeMap &attrs) { + framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs); + + // Create a temporary block for compile-time + framework::ProgramDesc program_desc; + framework::BlockDesc *block_desc = program_desc.MutableBlock(0); + PADDLE_ENFORCE(block_desc); + + for (auto ¶m_name : in_vars) { + for (auto &var : param_name.second) { + *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto(); + } + } + for (auto ¶m_name : *out_vars) { + for (auto &var : param_name.second) { + *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto(); + } + } + + LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs); + op_desc.InferShape(*block_desc); + op_desc.InferVarType(block_desc); + for (auto ¶m_name : *out_vars) { + for (auto &var : param_name.second) { + *var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto(); + } + } + LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs); +} + +void Tape::AddOp(const std::string &type, + const VariableHandleMap &in_vars, + VariableHandleMap out_vars, + const framework::AttributeMap &attrs) { + InferShapeAndVarType(type, in_vars, &out_vars, attrs); + tape_.emplace_back(type, in_vars, out_vars, attrs); +} + +// Temporary Scope for Operator::Run() +class ScopeWrapper : public framework::Scope { + public: + ScopeWrapper(const VariableHandleMap &in_vars, + const VariableHandleMap &out_vars) { + for (auto &v : in_vars) { + for (auto &vv : v.second) { + if (!vars_.count(vv->Name())) { + vars_[vv->Name()].reset(vv->Var()); + } + } + } + for (auto &v : out_vars) { + for (auto &vv : v.second) { + if (!vars_.count(vv->Name())) { + vars_[vv->Name()].reset(vv->Var()); + } + } + } + } + + ~ScopeWrapper() { + for (auto &pair : vars_) { + pair.second.release(); + } + } +}; + +void Tape::Forward() { + LOG(INFO) << "Starting forward -------------------------"; + PADDLE_ENFORCE(!has_been_backwarded_); + while (current_position_ < tape_.size()) { + OpHandle &op = tape_[current_position_]; + + // Create Output Tensor, this is only necessary for OpWithKernel + for (auto ¶m2var : op.outputs_) { + for (auto &var : param2var.second) { + var->InitializeVariable(); + } + } + + framework::OpDesc op_desc = + CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_); + ScopeWrapper scope(op.inputs_, op.outputs_); + framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace()); + current_position_++; + } + + LOG(INFO) << "Finishing forward -------------------------"; +} + +void Tape::Backward(VariableHandle target) { + PADDLE_ENFORCE(!has_been_backwarded_); + + Forward(); + + // TODO(tonyyang-svail): check output of last op is target + backward_tape_.reset(new Tape()); + + framework::AttributeMap attrs; + + // FIXME(tonyyang-svail): Need to infer_data_type + attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{1}; + attrs["value"] = 1.0f; + backward_tape_->AddOp( + "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs); + + for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) { + framework::OpDesc op_desc = + CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_); + std::unordered_map grad_to_var; + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, {}, &grad_to_var, {}); + + for (auto &op_desc : grad_op_descs) { + std::unordered_map name2var; + for (auto ¶m2vars : it->inputs_) { + for (auto &a : param2vars.second) { + name2var[a->Name()] = a; + } + } + for (auto ¶m2vars : it->outputs_) { + for (auto &a : param2vars.second) { + name2var[a->Name()] = a; + } + } + + VariableHandleMap in_vars; + VariableHandleMap out_vars; + std::map + loop_over{{&op_desc->Inputs(), &in_vars}, + {&op_desc->Outputs(), &out_vars}}; + for (auto &each : loop_over) { + auto &vmp = *each.first; + auto &vhm = *each.second; + for (auto &p2a : vmp) { + for (auto &argu : p2a.second) { + if (name2var.count(argu)) { + vhm[p2a.first].push_back(name2var[argu]); + } else { + PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix), + argu.c_str()); + std::string name = argu.substr( + 0, argu.size() - std::strlen(framework::kGradVarSuffix)); + PADDLE_ENFORCE(name2var.count(name), name.c_str()); + vhm[p2a.first].push_back(name2var[name]->Grad()); + } + } + } + } + + backward_tape_->AddOp( + op_desc->Type(), in_vars, out_vars, op_desc->GetAttrMap()); + } + + // TODO(tonyyang-svail): how to fill empty grad? + // TODO(tonyyang-svail): Sum var grad is necessary + } + + backward_tape_->Forward(); + has_been_backwarded_ = true; +} + +Tape &get_global_tape() { + static Tape T; + return T; +} + +void reset_global_tape() { get_global_tape() = Tape(); } +} +} diff --git a/paddle/contrib/tape/tape.h b/paddle/contrib/tape/tape.h new file mode 100644 index 0000000000000000000000000000000000000000..ed79de17a7fca58a2c542831560f0dd5ad34f960 --- /dev/null +++ b/paddle/contrib/tape/tape.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "paddle/contrib/tape/variable.h" + +namespace paddle { +namespace tape { + +using VariableHandleMap = std::map>; + +struct OpHandle { + OpHandle(const std::string &type, + const VariableHandleMap &in_vars, + const VariableHandleMap &out_vars, + const framework::AttributeMap &attrs) + : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {} + + std::string type_; + VariableHandleMap inputs_; + VariableHandleMap outputs_; + framework::AttributeMap attrs_; +}; + +class Tape { + public: + void AddOp(const std::string &type, + const VariableHandleMap &in_vars, + VariableHandleMap out_vars, + const framework::AttributeMap &attrs); + void Forward(); + void Backward(VariableHandle target); + + bool HasBeenBackwarded() { return has_been_backwarded_; } + + private: + bool has_been_backwarded_ = false; + size_t current_position_ = 0; + + std::vector tape_; + std::shared_ptr backward_tape_; +}; + +Tape &get_global_tape(); + +void reset_global_tape(); +} +} diff --git a/paddle/contrib/tape/test_tape.cc b/paddle/contrib/tape/test_tape.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9bfd21a7189c5867a52d2b25db09a462d5c7ba7 --- /dev/null +++ b/paddle/contrib/tape/test_tape.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/contrib/tape/function.h" + +using namespace paddle::tape; + +TEST(Tape, TestMLP) { + LOG(INFO) << "TestMLP"; + Linear linear1(3, 3, "relu"); + Linear linear2(3, 3, "relu"); + Mean mean; + + SGD sgd(0.001); + + std::string initializer = "fill_constant"; + paddle::framework::AttributeMap attrs; + attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; + attrs["shape"] = std::vector{3, 3}; + attrs["value"] = 1.0f; + Fill filler(initializer, attrs); + + for (int i = 0; i < 2; ++i) { + reset_global_tape(); + + VariableHandle input(new Variable("input")); + filler(input); + + auto loss = mean(linear2(linear1(input))); + + get_global_tape().Backward(loss); + + for (auto w : linear1.Params()) { + sgd(w); + } + for (auto w : linear2.Params()) { + sgd(w); + } + } +} + +int main(int argc, char** argv) { + std::vector places; + places.emplace_back(paddle::platform::CPUPlace()); + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/contrib/tape/variable.cc b/paddle/contrib/tape/variable.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ec1612909503f666bca0fce3246002879854156 --- /dev/null +++ b/paddle/contrib/tape/variable.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/contrib/tape/variable.h" + +namespace paddle { +namespace tape { + +void Variable::InitializeVariable() { + LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType(); + framework::proto::VarType::Type var_type = desc_.GetType(); + if (var_type == framework::proto::VarType::LOD_TENSOR) { + var_.GetMutable(); + } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { + var_.GetMutable(); + } else { + PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]", + var_type); + } +} +} +} diff --git a/paddle/contrib/tape/variable.h b/paddle/contrib/tape/variable.h new file mode 100644 index 0000000000000000000000000000000000000000..35c328e69c9ebe25e907a59e4d67b999aff1d876 --- /dev/null +++ b/paddle/contrib/tape/variable.h @@ -0,0 +1,85 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include "paddle/fluid/framework/operator.h" // framework::kGradVarSuffix +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace tape { + +class Variable; +using VariableHandle = std::shared_ptr; + +/* + * Combination of + * framework::VarDesc desc_; + * framework::Variable var_; + */ +class Variable { + public: + Variable(const std::string pre_fix) + : desc_(pre_fix + std::to_string(count())) {} + + Variable(const std::string pre_fix, bool is_grad) + : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix + : std::to_string(count()))) {} + + ~Variable() { LOG(INFO) << "Deleting " << Name(); } + + // Instantiate LoDTensor/SelectedRow + void InitializeVariable(); + + VariableHandle Grad() { + if (grad_.expired()) { + VariableHandle new_grad(new Variable(desc_.Name(), true)); + grad_ = new_grad; + return new_grad; + } else { + return VariableHandle(grad_); + } + } + + // Stochastic Gradient Descent with Momentum + // VariableHandle Momentum (); + + // void init(const std::string& initializer, + // const framework::AttributeMap& attrs); + + // void value() {}; + + const framework::VarDesc& Desc() const { return desc_; } + framework::VarDesc* MutableDesc() { return &desc_; } + + // TODO(tonyyang-svail): No need to expose name + std::string Name() const { return desc_.Name(); } + + framework::Variable* Var() { return &var_; } + + private: + int count() { + static int counter = 0; + return counter++; + } + + framework::VarDesc desc_; + framework::Variable var_; + + std::weak_ptr grad_; +}; +} +} diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ed1e70c6460b513c1d2e1add18ac037f71d36944..6286dda4a54991b7a1042aed9886fdcb694198ba 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -83,11 +83,16 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope -framework_proto glog lod_rank_table feed_fetch_method) +if(WITH_DISTRIBUTE) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +else() + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) +endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index e7842e9b8130d35e511e02dfb1dc27f307d17f38..f537e4b9e569dd4c513ac0efde7240833bcf04b6 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -169,17 +169,13 @@ void BlockDesc::Flush() { } if (need_update_) { - auto &op_field = *this->desc_->mutable_ops(); - this->ClearPBOps(); - op_field.Reserve(static_cast(ops_.size())); + this->desc_->mutable_ops()->Clear(); for (auto &op_desc : ops_) { - op_field.AddAllocated(op_desc->Proto()); + this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto()); } - auto &var_field = *this->desc_->mutable_vars(); - this->ClearPBVars(); - var_field.Reserve(static_cast(vars_.size())); + this->desc_->mutable_vars()->Clear(); for (auto &var_desc : vars_) { - var_field.AddAllocated(var_desc.second->Proto()); + this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto()); } need_update_ = false; } @@ -217,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, } } -void BlockDesc::ClearPBOps() { - auto ops = this->desc_->mutable_ops(); - while (!ops->empty()) { - // we do not own the OpDesc, so release the ownership. - ops->ReleaseLast(); - } -} - -void BlockDesc::ClearPBVars() { - auto vars = this->desc_->mutable_vars(); - while (!vars->empty()) { - // we do not own the VarDesc, so release the ownership. - vars->ReleaseLast(); - } -} - void BlockDesc::SetForwardBlockID(int32_t forward_block_id) { PADDLE_ENFORCE(!desc_->has_forward_block_idx(), "Parent block ID has been set to %d. Cannot set to %d", diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 189dd6c52f85b5bf623b98c64c07c0c7269505d4..ce48548418478cc5c9f9ca1244df9e66dca884e6 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -41,11 +41,6 @@ class BlockDesc { BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); - ~BlockDesc() { - this->ClearPBVars(); - this->ClearPBOps(); - } - int32_t ID() const { return desc_->idx(); } int32_t Parent() const { return desc_->parent_idx(); } @@ -113,10 +108,6 @@ class BlockDesc { ProgramDesc *Program() const { return this->prog_; } - private: - void ClearPBOps(); - void ClearPBVars(); - private: ProgramDesc *prog_; // not_own proto::BlockDesc *desc_; // not_own diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -27,6 +27,7 @@ enum class DataLayout { kNHWC = 0, kNCHW = 1, kAnyLayout = 2, + kMKLDNN = 3, // all layouts supported by MKLDNN internally }; inline DataLayout StringToDataLayout(const std::string& str) { @@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { return DataLayout::kNCHW; } else if (s == "ANYLAYOUT") { return DataLayout::kAnyLayout; + } else if (s == "MKLDNNLAYOUT") { + return DataLayout::kMKLDNN; } else { PADDLE_THROW("Unknown storage order string: %s", s); } @@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) { return "NCHW"; case DataLayout::kAnyLayout: return "ANY_LAYOUT"; + case DataLayout::kMKLDNN: + return "MKLDNNLAYOUT"; default: - PADDLE_THROW("unknown DataLayou %d", data_layout); + PADDLE_THROW("unknown DataLayout %d", data_layout); } } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 60ec60a427ba9046ce690eb75c27cd322fdd726d..5b8dfc57ba020cea259041f55a66472ea26b4eec 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -16,6 +16,9 @@ #include #include "paddle/fluid/operators/math/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { @@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->set_layout(expected_kernel_type.data_layout_); } +#ifdef PADDLE_WITH_MKLDNN +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; + +void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { + switch (type) { + case mkldnn::memory::data_type::f32: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::u8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s16: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s32: + return platform::to_void_cast(tensor.data()); + default: + PADDLE_THROW("wrong mkldnn type provided"); + } +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out) { + auto in_layout = kernel_type_for_var.data_layout_; + auto out_layout = expected_kernel_type.data_layout_; + + PADDLE_ENFORCE( + in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, + "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " + "non-MKLDNN"); + +#ifdef PADDLE_WITH_MKLDNN + PADDLE_ENFORCE(in.format() != memory::format::format_undef && + in.format() != memory::format::any, + "Input tensor should have specified memory format"); + + // Set default as NCHW in case not specified + out_layout = + out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); + std::vector out_tz = in_tz; + + memory::data_type in_type = ToMKLDNNDataType(in.type()); + PADDLE_ENFORCE(in_type != memory::data_type::data_undef, + "Input tensor type is not supported: ", in.type().name()); + memory::data_type out_type = in_type; + + memory::format in_format = + in_tz.size() == 2 ? memory::format::nc : in.format(); + memory::format out_format = + out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout); + + void* in_data = GetDataFromTensor(in, in_type); + + // output tensor has the same dims as input. Reorder don't change dims + out->Resize(in.dims()); + + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); + + auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + + platform::Reorder(in_memory, out_memory); + + out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 06b638663dd334837a3bcb7737e507fcbc871c7a..2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" @@ -22,6 +23,50 @@ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_MKLDNN +using MKLDNNFormat = mkldnn::memory::format; +using MKLDNNDataType = mkldnn::memory::data_type; + +inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { + switch (layout) { + case DataLayout::kNHWC: + return MKLDNNFormat::nhwc; + case DataLayout::kNCHW: + return MKLDNNFormat::nchw; + default: + PADDLE_THROW("Fail to convert layout %s to MKLDNN format", + DataLayoutToString(layout)); + } +} + +inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { + switch (format) { + case MKLDNNFormat::nhwc: + return DataLayout::kNHWC; + case MKLDNNFormat::nchw: + return DataLayout::kNCHW; + default: + PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); + } +} + +inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { + static const std::map dict{ + {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT + {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT + {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, + {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, + {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; + auto iter = dict.find(type); + if (iter != dict.end()) return iter->second; + return MKLDNNDataType::data_undef; +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out); + std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 9c277a27da5af34fc9fb18ca073e369c05ecdf22..b8fcc92697ca1bf1d971f8fef020f31d405605a9 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type, Tensor in; in.ShareDataWith(input_tensor); Tensor out; + DataLayout lin = kernel_type_for_var.data_layout_; + DataLayout lout = expected_kernel_type.data_layout_; // do layout transform - if (NeedTransformLayout(expected_kernel_type.data_layout_, - kernel_type_for_var.data_layout_)) { - TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + if (NeedTransformLayout(lout, lin)) { + if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { + PADDLE_ENFORCE( + !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), + "No layout transform needed between two MKLDNN OPKernels"); + + if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { +#ifdef PADDLE_WITH_MKLDNN + // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel + // Just set layout/format. No real transform occur + out.ShareDataWith(input_tensor); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(ToMKLDNNFormat(lin)); +#endif + } else { + // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel + // Do transform via MKLDNN lib + TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in, + &out); + } + } else { + // Case3 - transfrom between Non-MKLDNN OPKernels + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + } transformed = true; PassTensorData(&out, &in); } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index b6b93cf422a60c1d8e9cb8b477efd562f9fe4758..60382faffb8e53870658b2d1ff83abc4008cb4cf 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -28,6 +28,9 @@ struct DataTypeMap { }; static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex static DataTypeMap& gDataTypeMap() { static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); return *g_data_type_map_; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 1bcd8412eb2d618b923bcd0557d118af62271f4a..3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -7,26 +7,32 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) +cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) +cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) if(WITH_GPU) - nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda) - set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) + nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda variable_visitor) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) else() - set(multi_devices_graph_builder_deps) + cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) +cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) + + +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope @@ -36,5 +42,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) +cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc similarity index 61% rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc rename to paddle/fluid/framework/details/all_reduce_op_handle.cc index 95aa599cd3e403e9cc66b2b5ad35d0d214d1ab5b..b335d3a0d364c916e19574de8d3ed89aaec7de41 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -11,46 +11,65 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" namespace paddle { namespace framework { namespace details { -NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( - const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs) + +#ifdef PADDLE_WITH_CUDA +AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs) : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { - for (auto &p : places_) { - this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p); + if (nccl_ctxs_) { + for (auto &p : places_) { + this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); + } } } +#else +AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif -void NCCLAllReduceOpHandle::RunImpl() { - if (inputs_.size() == 1) { +void AllReduceOpHandle::RunImpl() { + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { // Wait input done WaitInputVarGenerated(); - - auto &var_name = static_cast(this->inputs_[0])->name_; - int dtype = -1; - size_t numel = 0; + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *s = local_scopes_[i]; auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - - auto &lod_tensor = local_scope.FindVar(var_name)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); } if (platform::is_gpu_place(lod_tensors[0]->place())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = places_[i]; @@ -66,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; - auto &nccl_ctx = nccl_ctxs_.at(dev_id); + auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; all_reduce_calls.emplace_back([=] { @@ -81,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() { call(); } }); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif } else { // Special handle CPU only Operator's gradient. Like CRF auto &trg = *this->local_scopes_[0] ->FindVar(kLocalExecScopeName) ->Get() - ->Var() + ->FindVar(out_var_handles[0]->name_) ->GetMutable(); // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); VisitDataType(ToDataType(lod_tensors[0]->type()), func); - for (size_t i = 0; i < local_scopes_.size(); ++i) { + for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; - auto *var = scope.FindVar(var_name); + auto *var = scope.FindVar(out_var_handles[i]->name_); auto *dev_ctx = dev_ctxes_[p]; RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { @@ -109,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } } -std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; } +std::string AllReduceOpHandle::Name() const { return "all_reduce"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h similarity index 68% rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.h rename to paddle/fluid/framework/details/all_reduce_op_handle.h index a0c321843e3fc5abcbd1ef2ce2e153250269aa7d..fdd250b0d3eb166249271a95f7592b9fadee5265 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,17 +20,23 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { namespace details { -struct NCCLAllReduceOpHandle : public OpHandleBase { - NCCLAllReduceOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs); - +struct AllReduceOpHandle : public OpHandleBase { +#ifdef PADDLE_WITH_CUDA + AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs); +#else + AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places); +#endif std::string Name() const override; // Delay and buffer nccl_all_reduce together can significantly increase @@ -41,9 +47,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { void RunImpl() override; private: - const std::vector &local_scopes_; - const std::vector &places_; - const platform::NCCLContextMap &nccl_ctxs_; + std::vector local_scopes_; + std::vector places_; +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; +#endif }; } // namespace details diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 629aa00cb817c4b1446e7b750ca62a7c6b1db670..8036f756b6d6506684c109ab881d546f38176a10 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase { void RunImpl() override; private: - const std::vector &local_scopes_; - const std::vector &places_; + std::vector local_scopes_; + std::vector places_; #ifdef PADDLE_WITH_CUDA const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 91bdfe6134ffbd1404336c9d6d1222a505084b2b..64e83acb4dc1995800c4ca3caf81668b24a7c9fe 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -14,6 +14,8 @@ #pragma once +#include + namespace paddle { namespace framework { namespace details { @@ -29,6 +31,8 @@ struct BuildStrategy { ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + + std::string debug_graphviz_path_{""}; }; } // namespace details diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index e8d510ec955602b5a3f73ca06caa121886eb150b..716d674fa29bad9321fc20979775c06f26bf4679 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,8 +20,9 @@ namespace details { struct ExecutionStrategy { size_t num_threads_{0}; - bool use_event_{true}; + bool use_cuda_{true}; bool allow_op_delay_{false}; + size_t num_iteration_per_drop_scope_{100}; }; } // namespace details diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..018c9bff71e553d8a3641f06f10b350453676b24 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseVarsOpHandle::RunImpl() { + WaitInputVarGenerated(place_); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); + PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); + + auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); + + auto out_var_handle = out_var_handles[0]; + auto out_var = scope->Var(out_var_handle->name_); + + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); + + int64_t s = 0; + for (size_t i = 1; i < out_var_handles.size(); ++i) { + auto out_name = out_var_handles[i]->name_; + auto out_t = scope->Var(out_name)->GetMutable(); + auto numel = this->inputs_numel_.at(out_name); + out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); + s += numel; + } + this->RunAndRecordEvent([] {}); +} + +std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FuseVarsOpHandle : public OpHandleBase { + public: + FuseVarsOpHandle(Scope *local_scope, const platform::Place &place, + const std::unordered_map &inputs_numel, + const std::type_index &var_type) + : local_scope_(local_scope), + place_(place), + inputs_numel_(inputs_numel), + type_(var_type) { + total_numel_ = 0; + for (auto in_numel : inputs_numel) { + PADDLE_ENFORCE_GT(in_numel.second, 0); + total_numel_ += in_numel.second; + } + } + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + private: + Scope *local_scope_; + const platform::Place place_; + const std::unordered_map inputs_numel_; + const std::type_index type_; + int64_t total_numel_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 17baacd13eecac8f410631fe9e94788da4fff848..78356cb1be3bd089c26dde663275e2c8109df951 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" @@ -26,14 +27,6 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#endif - -DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot", - "the ssa graph path only print with GLOG_v=10," - "default /tmp/graph.dot"); - namespace paddle { namespace framework { namespace details { @@ -93,7 +86,7 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( for (auto *op : program.Block(0).AllOps()) { // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string - if (op->Type() == "send_vars") { + if (op->Type() == "send") { auto op_vars = op->InputArgumentNames(); send_vars.reserve(send_vars.size() + std::distance(op_vars.begin(), op_vars.end())); @@ -247,7 +240,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( CreateReduceOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0); } else { - InsertNCCLAllReduceOp(&result, g_name); + InsertAllReduceOp(&result, g_name); } break; } @@ -277,11 +270,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( */ AddOutputToLeafOps(&result); - if (VLOG_IS_ON(10)) { - std::ofstream fout(FLAGS_ssa_graph_path); - PrintGraphviz(*graph, fout); - } - return std::unique_ptr(graph); } @@ -295,6 +283,19 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient( return false; } +void MultiDevSSAGraphBuilder::SetCommunicationContext( + OpHandleBase *op_handle, const platform::Place &p) const { +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_ == nullptr) { + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + } +#else + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif +} + void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const { @@ -309,15 +310,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { - auto &vars = result->vars_.at(i).at(p_name); auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->vars_.at(i).at(p_name); auto *out_var = new VarHandle(vars.size(), i, p_name, p); vars.emplace_back(out_var); op_handle->AddOutput(out_var); -#ifndef ADDLE_WITH_CUDA - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif } } @@ -329,15 +327,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result, CreateOpHandleIOs(result, op, dev_id); } -void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( - SSAGraph *result, const std::string &og) const { +void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, + const std::string &og) const { #ifdef PADDLE_WITH_CUDA result->ops_.emplace_back( - new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); + new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); +#else + result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_)); +#endif auto *op_handle = result->ops_.back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; + SetCommunicationContext(op_handle, p); auto &vars = result->vars_[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); @@ -347,9 +349,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( vars.emplace_back(var); op_handle->AddOutput(var); } -#else - PADDLE_ENFORCE("Not implemented"); -#endif } bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( @@ -388,7 +387,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA - auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]); + auto *communication_dev_ctx = + nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i]) + : platform::DeviceContextPool::Instance().Get(places_[i]); #else auto *communication_dev_ctx = platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); @@ -433,12 +434,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, auto *op_handle = result->ops_.back().get(); for (size_t i = 0; i < places_.size(); ++i) { - auto &vars = result->vars_[i][og]; -#ifndef PADDLE_WITH_CUDA auto &p = places_[i]; - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif + SetCommunicationContext(op_handle, p); + auto &vars = result->vars_[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); @@ -473,22 +471,21 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op) const { - auto &p = places_[0]; - auto *s = local_scopes_[0]; - result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type())); + result->ops_.emplace_back( + new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send_vars"); + ConnectOp(result, result->ops_.back().get(), "send"); } else if (op.Type() == "recv") { ConnectOp(result, result->ops_.back().get(), "send_barrier"); } else if (op.Type() == "fetch_barrier") { ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send_vars") { + } else if (op.Type() == "send") { // do nothing } else { PADDLE_THROW( "rpc op should be in [" - "send_vars, send_barrier. recv, fetch_barrier]"); + "send, send_barrier. recv, fetch_barrier]"); } // TODO(Yancey1989): schedule rpc op on different place may diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 544cbe585c7423b5f3eb98ee698ca5668376f1ca..78581755fe4890800636944d6cd89875a852cc19 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -100,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector> &var_name_on_devices, const OpDesc &op) const; - void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; @@ -111,6 +111,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: BuildStrategy strategy_; + + void SetCommunicationContext(OpHandleBase *op_handle, + const platform::Place &p) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 6b064650b4f09737836bda4a43fa421720077929..f79565fe71c4aef140475c922cbbf5a1e0b7fe03 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() { #endif } -void OpHandleBase::Run(bool use_event) { +void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_event) { + if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); @@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) { } } #else - PADDLE_ENFORCE(!use_event); + PADDLE_ENFORCE(!use_cuda); #endif RunImpl(); @@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { } } +size_t OpHandleBase::NoDummyInputSize() const { + size_t cnt = 0; + for (auto *in : inputs_) { + if (dynamic_cast(in) == nullptr) { + ++cnt; + } + } + return cnt; +} + bool OpHandleBase::NeedWait(VarHandleBase *in_var) { return in_var && in_var->generated_op_; } diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 8f94206a87dbae8a81727ca48718886bbabbe25c..fbd90a3296bca92b097cab925b218b91e7f4752f 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -36,7 +36,7 @@ class OpHandleBase { virtual std::string Name() const = 0; - void Run(bool use_event); + void Run(bool use_cuda); virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx); @@ -80,6 +80,8 @@ class OpHandleBase { const std::vector &Outputs() const { return outputs_; } + size_t NoDummyInputSize() const; + protected: void RunAndRecordEvent(const std::function &callback); diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 2b95a284990da8f9b7c16d6e4221eb1ed061f74b..a6ffb37313a88120bc9e8d5ce326f60aeebdff69 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -37,7 +37,9 @@ struct ReduceLoDTensor { PADDLE_ENFORCE_NE(t0.numel(), 0); dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - std::copy(t0.data(), t0.data() + t0.numel(), dst); + if (dst != t0.data()) { + std::copy(t0.data(), t0.data() + t0.numel(), dst); + } for (size_t i = 1; i < src_tensors_.size(); ++i) { auto &t = *src_tensors_[i]; diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index c652a2f4eb0f9b73cb19ebbd9d0809210b280ad3..4d14334cdfe06e2e805c2577458d6689e6324cc7 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -32,8 +32,8 @@ namespace framework { namespace details { struct ReduceOpHandle : public OpHandleBase { - const std::vector &local_scopes_; - const std::vector &places_; + std::vector local_scopes_; + std::vector places_; #ifdef PADDLE_WITH_CUDA const platform::NCCLContextMap *nccl_ctxs_; diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 7f4da4c01de1010467d839ee5490c5e0d02d8c24..586465f99fd94117c821be2952bffda385fbcf75 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -19,12 +19,12 @@ namespace framework { namespace details { RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, - const Scope *local_scope, const platform::Place &place, - const std::string &name) + const Scope *local_scope, const std::string &name, + const platform::Place &place) : op_(framework::OpRegistry::CreateOp(op_desc)), local_scope_(local_scope), - place_(place), - name_(name) {} + name_(name), + place_(place) {} void RPCOpHandle::RunImpl() { // TODO(wuyi): need further analysis whether wait VarDummyHandle. diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index d28b7721720d808a8d81701c3811eae16121fb41..ae38c7fe19e102a330455d89a1068414a7835fab 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -29,7 +29,7 @@ namespace details { struct RPCOpHandle : public OpHandleBase { RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const platform::Place& place, const std::string& name); + const std::string& name, const platform::Place& place); std::string Name() const override; @@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase { private: std::unique_ptr op_; const Scope* local_scope_; - const platform::Place& place_; const std::string name_; + platform::Place place_; }; } // namespace details diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb4e7ec52f907f9403e21ec2734d61824f51a58b --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include +#include +#include "paddle/fluid/framework/executor.h" + +namespace paddle { +namespace framework { +namespace details { +ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( + ExecutionStrategy strategy, std::vector local_scopes, + std::vector var_infos, std::vector places, + std::unique_ptr &&underlying_executor) + : strategy_(std::move(strategy)), + underlying_executor_(std::move(underlying_executor)), + local_scopes_(std::move(local_scopes)), + var_infos_(std::move(var_infos)), + places_(std::move(places)) {} + +FeedFetchList ScopeBufferedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + if (drop_scope_counter_ == 0) { + // Create local scopes. + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + auto &scope = *it; + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &info : var_infos_) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } + } + } + } + + auto fetch_data = underlying_executor_->Run(fetch_tensors); + drop_scope_counter_ += 1; + if (!fetch_tensors.empty() || + drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : local_scopes_) { + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + } + } + return fetch_data; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..20df7a4722d589ffd168f842e927cff8411096bb --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" +namespace paddle { +namespace framework { +namespace details { + +struct VariableInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + +class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { + public: + ScopeBufferedSSAGraphExecutor( + ExecutionStrategy strategy, std::vector local_scopes, + std::vector var_infos, std::vector places, + std::unique_ptr&& underlying_executor); + FeedFetchList Run(const std::vector& fetch_tensors) override; + + private: + size_t drop_scope_counter_{0}; + + ExecutionStrategy strategy_; + std::unique_ptr underlying_executor_; + std::vector local_scopes_; + std::vector var_infos_; + std::vector places_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 6a567527550883add08031e50aa8de2b204cf13d..88a21f48879a15450051ad94ed76e1c48bf23014 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include namespace paddle { namespace framework { @@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, op_handle->AddOutput(var); } -template -void IterAllVar(const SSAGraph &graph, Callback callback) { - for (auto &each : graph.vars_) { - for (auto &pair1 : each) { - for (auto &pair2 : pair1.second) { - callback(*pair2); - } - } - } - - for (auto &var : graph.dep_vars_) { - callback(*var); - } -} - -void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) { - size_t var_id = 0; - std::unordered_map vars; - - sout << "digraph G {\n"; - - IterAllVar(graph, [&](const VarHandleBase &var) { - auto *var_ptr = &var; - auto *var_handle_ptr = dynamic_cast(var_ptr); - auto *dummy_ptr = dynamic_cast(var_ptr); - - size_t cur_var_id = var_id++; - vars[var_ptr] = cur_var_id; - - if (var_handle_ptr) { - sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ - << "\\n" - << var_handle_ptr->place_ << "\\n" - << var_handle_ptr->version_ << "\"]" << std::endl; - } else if (dummy_ptr) { - sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; - } - }); - - size_t op_id = 0; - for (auto &op : graph.ops_) { - std::string op_name = "op_" + std::to_string(op_id++); - sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" - << std::endl; - for (auto in : op->Inputs()) { - std::string var_name = "var_" + std::to_string(vars[in]); - sout << var_name << " -> " << op_name << std::endl; - } - - for (auto out : op->Outputs()) { - std::string var_name = "var_" + std::to_string(vars[out]); - sout << op_name << " -> " << var_name << std::endl; - } - } - - sout << "}\n"; -} - void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { for (auto &op : graph->ops_) { if (!op->Outputs().empty()) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 64e5d93081eb76c56898bbeb530e37364619fdbb..5fc12a44b51fae26e5a8f5fdba952d3879e82d0f 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -55,8 +55,6 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); static void AddOutputToLeafOps(SSAGraph *graph); - - static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout); }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4b49d3de6da2e5fd7836668619e42d10bb6b35a --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" +#include +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" +#include "paddle/fluid/framework/details/ssa_graph_printer.h" + +namespace paddle { +namespace framework { +namespace details { +std::unique_ptr SSAGraphBuilderFactory::Create() { + std::unique_ptr res( +#ifdef PADDLE_WITH_CUDA + new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, + local_scopes_, nccl_ctxs_, strategy_) +#else + new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, + local_scopes_, strategy_) +#endif + ); // NOLINT + + if (!strategy_.debug_graphviz_path_.empty()) { + std::unique_ptr fout( + new std::ofstream(strategy_.debug_graphviz_path_)); + PADDLE_ENFORCE(fout->good()); + std::unique_ptr graphviz_printer( + new GraphvizSSAGraphPrinter()); + res.reset(new SSAGraghBuilderWithPrinter( + std::move(fout), std::move(graphviz_printer), std::move(res))); + } + res.reset(new SSAGraghBuilderWithChecker(std::move(res))); + + return res; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..91a119de83ed3d1573803e48faf86c874eed98d6 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +class Scope; +namespace details { + +class SSAGraphBuilderFactory { + public: + SSAGraphBuilderFactory(const std::vector& places, + const std::string& loss_var_name, + const std::unordered_set& param_names, + const std::vector& local_scopes, + const BuildStrategy& strategy) + : places_(places), + loss_var_name_(loss_var_name), + param_names_(param_names), + local_scopes_(local_scopes), + strategy_(strategy) { +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_ = nullptr; +#endif + } + +#ifdef PADDLE_WITH_CUDA + void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { + nccl_ctxs_ = nccl_ctxs; + } +#endif + + std::unique_ptr Create(); + + private: + std::vector places_; + std::string loss_var_name_; + std::unordered_set param_names_; + std::vector local_scopes_; + BuildStrategy strategy_; + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap* nccl_ctxs_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc new file mode 100644 index 0000000000000000000000000000000000000000..da5428946ee588e8eac1f78929dc0432df532975 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph.h" +#include +#include "paddle/fluid/framework/details/ssa_graph_checker.h" + +namespace paddle { +namespace framework { +namespace details { + +bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h new file mode 100644 index 0000000000000000000000000000000000000000..304b221e7e4c414a0ab562a1b99836d3b7c02efb --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +struct SSAGraph; + +class SSAGraghBuilderWithChecker : public SSAGraphBuilder { + public: + explicit SSAGraghBuilderWithChecker( + std::unique_ptr&& builder) + : builder_(std::move(builder)) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } + + bool IsValidGraph(const SSAGraph* graph) const; + + private: + std::unique_ptr builder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc index 8da6ca889b89999e0f6f974503cea476c9de97f3..09b97bd0d98dc4ad1124dcbc495cff921bf03efc 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -17,10 +17,6 @@ namespace paddle { namespace framework { namespace details { - -SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr &&graph) - : graph_(std::move(graph)) {} - SSAGraphExecutor::~SSAGraphExecutor() {} } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index a8833b7388ab907020a260d356f1484ffd227658..958086033607a4ed8fb840f5b14fe5779625bd82 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -28,15 +28,11 @@ class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); public: - // Steal graph inside - explicit SSAGraphExecutor(std::unique_ptr &&graph); + SSAGraphExecutor() {} virtual ~SSAGraphExecutor(); virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; - - protected: - std::unique_ptr graph_; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc new file mode 100644 index 0000000000000000000000000000000000000000..22a40ca4b25cdd8ed9856b6c71bffc79561edcac --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_printer.h" +#include +#include "paddle/fluid/framework/details/ssa_graph.h" + +namespace paddle { +namespace framework { +namespace details { + +template +static inline void IterAllVar(const SSAGraph &graph, Callback callback) { + for (auto &each : graph.vars_) { + for (auto &pair1 : each) { + for (auto &pair2 : pair1.second) { + callback(*pair2); + } + } + } + + for (auto &var : graph.dep_vars_) { + callback(*var); + } +} + +void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, + std::ostream &sout) const { + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + IterAllVar(graph, [&](const VarHandleBase &var) { + auto *var_ptr = &var; + auto *var_handle_ptr = dynamic_cast(var_ptr); + auto *dummy_ptr = dynamic_cast(var_ptr); + + size_t cur_var_id = var_id++; + vars[var_ptr] = cur_var_id; + + if (var_handle_ptr) { + sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ + << "\\n" + << var_handle_ptr->place_ << "\\n" + << var_handle_ptr->version_ << "\"]" << std::endl; + } else if (dummy_ptr) { + sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; + } + }); + + size_t op_id = 0; + for (auto &op : graph.ops_) { + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" + << std::endl; + for (auto in : op->Inputs()) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : op->Outputs()) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h new file mode 100644 index 0000000000000000000000000000000000000000..b4c90013789759d17646d95efdc81fc6a0a4f3e7 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +struct SSAGraph; +class SSAGraphPrinter { + public: + virtual ~SSAGraphPrinter() {} + virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0; +}; + +class GraphvizSSAGraphPrinter : public SSAGraphPrinter { + public: + void Print(const SSAGraph& graph, std::ostream& sout) const override; +}; + +class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { + public: + SSAGraghBuilderWithPrinter(std::ostream& sout, + std::unique_ptr&& printer, + std::unique_ptr&& builder) + : printer_(std::move(printer)), + builder_(std::move(builder)), + stream_ref_(sout) {} + + SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, + std::unique_ptr&& printer, + std::unique_ptr&& builder) + : printer_(std::move(printer)), + builder_(std::move(builder)), + stream_ptr_(std::move(sout)), + stream_ref_(*stream_ptr_) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + printer_->Print(*graph, stream_ref_); + return graph; + } + + private: + std::unique_ptr printer_; + std::unique_ptr builder_; + std::unique_ptr stream_ptr_; + std::ostream& stream_ref_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 815f739371e77d953a28be99b38ec1b8ff26506c..6c5098ce85b784a3edcf8f48d2cc828aabd8e161 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -21,7 +21,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::unique_ptr &&graph) - : SSAGraphExecutor(std::move(graph)), + : graph_(std::move(graph)), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), local_scopes_(local_scopes), @@ -185,12 +185,15 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ready_vars->Push(var); } } + void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - op->Run(strategy_.use_event_); + if (VLOG_IS_ON(10)) { + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); + } + op->Run(strategy_.use_cuda_); VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 1f7f88d75218e757e4555ad093f3cd6558f624dd..4a2075f1cccb3211316567197da56c01d26f35ce 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: + std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 863053c32b190f4e8497b16f3edd76cb2f76168b..e15232a77bb9c3e325b55737ea7abc55e3121708 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,10 +20,14 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/detail/grpc_client.h" +#endif #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); namespace paddle { namespace framework { @@ -43,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { Executor::Executor(const platform::Place& place) : place_(place) {} +#ifdef PADDLE_WITH_DISTRIBUTE +void Executor::Complete() { + ::paddle::operators::detail::RPCClient::GetInstance< + ::paddle::operators::detail::GRPCClient>() + ->SendComplete(); +} +#endif + void InitializeVariable(Variable* var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); @@ -115,6 +127,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); + if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -214,16 +227,18 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& feed_holder_name, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); + if (FLAGS_use_mkldnn) EnableMKLDNN(program); bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name); ProgramDesc* copy_program = const_cast(&program); + std::unique_ptr unique_ptr_of_copy_program; if (!has_feed_ops || !has_fetch_ops) { - copy_program = std::unique_ptr(new ProgramDesc(program)).get(); + unique_ptr_of_copy_program.reset(new ProgramDesc(program)); + copy_program = unique_ptr_of_copy_program.get(); } - auto* global_block = copy_program->MutableBlock(0); if (!has_feed_ops) { @@ -315,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } for (auto& op : ctx->ops_) { - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); + // NOTE! Please do not delete this line, it's usefull because the debug + // string before and after op.run are different, after run the output + // will have right shape which is usefull for debug. + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " @@ -376,5 +395,19 @@ void Executor::RunPreparedContext( } } +void Executor::EnableMKLDNN(const ProgramDesc& program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + auto* block = const_cast(program).MutableBlock(bid); + for (auto* op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 0c3c23611d95e0da67cabfb8fb2755a4a52c991b..67a0761dac2a9adcdd0ce2b218c4aa505d688d56 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -44,6 +44,13 @@ class Executor { explicit Executor(const platform::Place& place); +#ifdef PADDLE_WITH_DISTRIBUTE + /* + * Sending signal to pserver to mark current trainer stop. + */ + void Complete(); +#endif + /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope * @@ -81,6 +88,8 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + void EnableMKLDNN(const ProgramDesc& program); + private: const platform::Place place_; }; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index d35125fe8c3c8018c38650dc87b2b1474ded6058..68fcc104d48b2b39929ed2198a2dd2eabae10e94 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -71,6 +71,7 @@ message OpProto { optional bool duplicable = 3 [ default = false ]; optional bool intermediate = 4 [ default = false ]; optional bool dispensable = 5 [ default = false ]; + optional string reuse = 6; } // AttrProto describes the C++ type Attribute. diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc index 85beae775b96c3b7e08a2795bcd0ec79b24faeb4..a1094976f6c0965ac0a601d7e37575969146fdab 100644 --- a/paddle/fluid/framework/init.cc +++ b/paddle/fluid/framework/init.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" @@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); +#ifndef PADDLE_WITH_MKLDNN + operators::math::SetNumThreads(1); +#endif } void InitGLOG(const std::string &prog_name) { diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index b99e82f8c4358b60a014c6fc7c61c9bbb8683834..f1261dee0319440995951d1bee145404186a8ad4 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -17,12 +17,11 @@ limitations under the License. */ namespace paddle { namespace framework { -static OpInfoMap* g_op_info_map = nullptr; - +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex OpInfoMap& OpInfoMap::Instance() { - if (g_op_info_map == nullptr) { - g_op_info_map = new OpInfoMap(); - } + static OpInfoMap* g_op_info_map = new OpInfoMap(); return *g_op_info_map; } } // namespace framework diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index fab20d75f5a45257f243333c1998d7b2549a25f9..f51a184e7bae2283f335fe9462a77b9c5fb831a5 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) { } inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { - return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; + bool ret = + (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); +#ifdef PADDLE_WITH_MKLDNN + // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa + ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN); + ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN); +#endif + return ret; } inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index ae9f4efd44acdcdff2806deea6826e4089459a78..001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -21,6 +21,7 @@ namespace framework { void OpProtoAndCheckerMaker::Validate() { validated_ = true; CheckNoDuplicatedInOutAttrs(); + CheckReuseVars(); } OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( @@ -56,6 +57,24 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { } } +void OpProtoAndCheckerMaker::CheckReuseVars() { + std::unordered_set names; + for (auto& input : proto_->inputs()) { + names.insert(input.name()); + } + auto checker = [&](const std::string& name, const std::string& reused) { + PADDLE_ENFORCE( + names.count(reused), + "Output [%s] reuse Input [%s], but the input is not registered.", name, + reused); + }; + for (auto& output : proto_->outputs()) { + if (output.has_reuse()) { + checker(output.name(), output.reuse()); + } + } +} + void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, OpAttrChecker* attr_checker) { proto_ = proto; diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 8493b9d8b326c71a33b95bf95e5fc1743c686eb7..92f86bb5de520878d0a7b8d7214620580242c061 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include + #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" @@ -64,6 +66,11 @@ class OpProtoAndCheckerMaker { var_->set_dispensable(true); return *this; } + + VariableBuilder &Reuse(const std::string &name) { + var_->set_reuse(name); + return *this; + } }; VariableBuilder AddInput(const std::string &name, const std::string &comment); @@ -89,6 +96,8 @@ class OpProtoAndCheckerMaker { void CheckNoDuplicatedInOutAttrs(); void Validate(); + void CheckReuseVars(); + proto::OpProto *proto_; OpAttrChecker *op_checker_; bool validated_{false}; diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index a8030d377fdb4d4aef74b315e21792dad10fac96..58f70cb39c0d96ed3b9ff35ea132ba75a37f5405 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -47,3 +47,23 @@ TEST(ProtoMaker, DuplicatedInOut) { ASSERT_THROW(proto_maker(&op_proto, &op_checker), paddle::platform::EnforceNotMet); } + +class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + AddOutput("NoOut", "output of test op").Reuse("NotExists"); + } +}; + +TEST(ProtoMaker, InplaceOutput) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestInplaceProtoMaker proto_maker; + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); + // proto_maker(&op_proto, &op_checker); + // proto_maker.Make(); + // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 748317438b44bc4af84f13b25f8e4f88386388fb..43ab227a9478707445892c14723801992d0041aa 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), - DataLayout::kAnyLayout, StringToLibraryType(library_type)); + StringToDataLayout(data_layout), + StringToLibraryType(library_type)); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); constexpr auto size = std::tuple_size>::value; @@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const {} }; -// User can register many kernel in one place. The data type could be different. +// User can register many kernel in one place. The data type could be +// different. template class OpKernelRegistrar : public Registrar { public: @@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register OperatorKernel. */ -#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + __reg_op_kernel_##op_type##_##library_type##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ - #LIBRARY_TYPE); \ - int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ return 0; \ } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f87d5521492418d2daf5b7fba1500c4bb31e10f5..122ee1dab35b8c7d42392a983b5b15b7c1be7869 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static int GetRowSize(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return -1; + } + + if (var->IsType()) { + return var->Get().rows().size(); + } + + return -1; +} + static LoD GetLoD(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); auto default_lod = LoD({{}}); @@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { + VLOG(10) << "- " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } RunImpl(scope, place); + VLOG(10) << "+ " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; if (scope) { + int row_size = GetRowSize(*scope, input.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } @@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; if (scope) { + int row_size = GetRowSize(*scope, output.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } ss << "[" << GetDims(*scope, output.second[i], true) << "]"; ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } @@ -293,6 +316,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { } } +bool ExecutionContext::HasInput(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; + } + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto arg = ins[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + +bool ExecutionContext::HasOutput(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; + } + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto arg = outs[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); @@ -444,10 +499,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); - // TODO(dzhwinter) : reuse ShareLoD in most operators. - // Need to call ShareLayout explicitly in sequence related ops. - // Shall we have a better method to shared info between in/out Tensor? - out_tensor->set_layout(in_tensor.layout()); +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); } void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, @@ -646,8 +716,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); - PADDLE_ENFORCE(tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same.", Type()); + PADDLE_ENFORCE( + tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same. Get %d != %d", Type(), + data_type, tmp); data_type = tmp; } } @@ -665,7 +737,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( OpKernelType OperatorWithKernel::GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place()); + return OpKernelType(expected_kernel_type.data_type_, tensor.place(), + tensor.layout()); } } // namespace framework diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2f480e00c100d579e100de17d3feb957f5ef6167..b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,9 +191,9 @@ class ExecutionContext { return op_.Attr(name); } - bool HasInput(const std::string& name) const { return op_.HasInputs(name); } + bool HasInput(const std::string& name) const; - bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); } + bool HasOutput(const std::string& name) const; size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 50c3468d556bfe05d6c41906cf35cb671f711b1e..9406c6155da860c90739bddac1e81403b094e619 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -22,7 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -42,9 +43,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif - - std::vector> var_types_; - bool own_local_scope; + bool own_local_scope_; + bool use_cuda_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -61,65 +61,78 @@ ParallelExecutor::ParallelExecutor( size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; + member_->use_cuda_ = exec_strategy.use_cuda_; // Step 1. Bcast the params to devs. // Create local scopes if (local_scopes.empty()) { - member_->own_local_scope = true; + member_->own_local_scope_ = true; member_->local_scopes_.emplace_back(member_->global_scope_); for (size_t i = 1; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&scope->NewScope()); } } else { - member_->own_local_scope = false; + member_->own_local_scope_ = false; PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); for (size_t i = 0; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); } } + if (member_->use_cuda_) { // Bcast Parameters to all GPUs #ifdef PADDLE_WITH_CUDA - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; - if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); - } - member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); + ncclUniqueId *nccl_id = nullptr; + if (nccl_id_var != nullptr) { + nccl_id = nccl_id_var->GetMutable(); + } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( + member_->places_, nccl_id, num_trainers, trainer_id)); +#else + PADDLE_THROW("Not compiled with CUDA"); #endif - if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 && - local_scopes.empty()) { // Is CUDA + } + + if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToGPUs(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp -#ifdef PADDLE_WITH_CUDA - details::MultiDevSSAGraphBuilder builder( + // Step 2. Create vars in each scope; + std::vector var_infos; + for (auto *var : main_program.Block(0).AllVars()) { + var_infos.emplace_back(); + var_infos.back().name_ = var->Name(); + var_infos.back().type_ = var->GetType(); + var_infos.back().persistable_ = var->Persistable(); + } + + // Step 3. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + + details::SSAGraphBuilderFactory builder_factory( member_->places_, loss_var_name, params, member_->local_scopes_, - member_->nccl_ctxs_.get(), build_strategy); + build_strategy); + if (member_->use_cuda_) { +#ifdef PADDLE_WITH_CUDA + builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); #else - details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_, - build_strategy); + PADDLE_THROW("Not compiled with CUDA"); #endif - auto graph = builder.Build(main_program); + } member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, + builder_factory.Create()->Build(main_program))); - // Step 3. Create vars in each scope; - for (auto *var : main_program.Block(0).AllVars()) { - member_->var_types_.emplace_back(var->Name(), var->GetType(), - var->Persistable()); - } + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, std::move(var_infos), + member_->places_, std::move(member_->executor_))); } void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { -#ifdef PADDLE_WITH_CUDA auto *main_scope = member_->local_scopes_[0]; for (auto &var : vars) { @@ -131,9 +144,10 @@ void ParallelExecutor::BCastParamsToGPUs( auto &main_tensor = main_var->Get(); auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { +#ifdef PADDLE_WITH_CUDA + std::vector buffers; size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; @@ -145,10 +159,24 @@ void ParallelExecutor::BCastParamsToGPUs( t->Resize(dims); buffer = t->mutable_data(place, main_tensor.type()); } - auto &nccl_ctx = member_->nccl_ctxs_->at(place); - platform::dynload::ncclBcast(buffer, numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + buffers.push_back(buffer); } + + PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + "variables' buffer size to bcast NOT equal to places"); + { + platform::NCCLGroupGuard guard; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } + member_->nccl_ctxs_->WaitAll(); + } + +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif } else { platform::CPUPlace cpu; for (size_t i = 1; i < member_->places_.size(); ++i) { @@ -159,52 +187,15 @@ void ParallelExecutor::BCastParamsToGPUs( paddle::framework::TensorCopy(main_tensor, cpu, t); } } - member_->nccl_ctxs_->WaitAll(); } -#else - PADDLE_THROW("Not compiled with CUDA"); -#endif } void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - // Create local scopes. - for (auto it = member_->local_scopes_.rbegin(); - it != member_->local_scopes_.rend(); ++it) { - auto &scope = *it; - Scope &local_scope = scope->NewScope(); - *scope->Var(details::kLocalExecScopeName)->GetMutable() = - &local_scope; - - for (auto &name_type_pair : member_->var_types_) { - if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) { - continue; - } - - if (std::get<2>(name_type_pair)) { // Persistable - InitializeVariable(scope->Var(std::get<0>(name_type_pair)), - std::get<1>(name_type_pair)); - } else { - InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)), - std::get<1>(name_type_pair)); - } - } - } - auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; - - // Wait All computational streams - for (auto p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &scope : member_->local_scopes_) { - auto &local_scope = - *scope->Var(details::kLocalExecScopeName)->GetMutable(); - scope->DeleteScope(local_scope); - } } void ParallelExecutor::FeedTensorsIntoLocalScopes( @@ -242,7 +233,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { - if (member_->own_local_scope) { + if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { member_->global_scope_->DeleteScope(member_->local_scopes_[i]); } diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 3a413941df964c8d9454fafc6030c377c10f9fb1..64d4ceab624312ed366d7e835072899f1f033a88 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -35,14 +35,15 @@ class ReaderBase { class DecoratedReader : public ReaderBase { public: - explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { + explicit DecoratedReader(const std::shared_ptr& reader) + : ReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } void ReInit() override { reader_->ReInit(); } protected: - ReaderBase* reader_; + std::shared_ptr reader_; }; class FileReader : public ReaderBase { @@ -64,7 +65,7 @@ class ReaderHolder { public: void Reset(ReaderBase* reader) { reader_.reset(reader); } - ReaderBase* Get() const { return reader_.get(); } + std::shared_ptr Get() const { return reader_; } void ReadNext(std::vector* out) { PADDLE_ENFORCE_NOT_NULL(reader_); @@ -76,7 +77,7 @@ class ReaderHolder { } private: - std::unique_ptr reader_; + std::shared_ptr reader_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 9091713158c8071d5386f14250e3c546284e7fd0..bb2d866c824e0fec1b241caea407a38c88a3cb51 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -34,13 +34,7 @@ DEFINE_bool( namespace paddle { namespace framework { -Scope::~Scope() { - DropKids(); - for (auto& kv : vars_) { - VLOG(3) << "Destroy variable " << kv.first; - delete kv.second; - } -} +Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { std::unique_lock lock(mutex_); @@ -49,10 +43,13 @@ Scope& Scope::NewScope() const { } Variable* Scope::Var(const std::string& name) { + // acquire the lock when new var under this scope + std::unique_lock lock(mutex_); auto* v = FindVarLocally(name); if (v != nullptr) return v; + v = new Variable(); - vars_[name] = v; + vars_[name].reset(v); VLOG(3) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); return v; @@ -67,22 +64,29 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { + // acquire the lock when find var + std::unique_lock lock(mutex_); + return FindVarInternal(name); +} + +Variable* Scope::FindVarInternal(const std::string& name) const { auto var = FindVarLocally(name); if (var != nullptr) { return var; } - return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); + return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name); } const Scope* Scope::FindScope(const Variable* var) const { for (auto& kv : vars_) { - if (kv.second == var) { + if (kv.second.get() == var) { return this; } } return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); } void Scope::DropKids() { + std::unique_lock lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } @@ -110,10 +114,10 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { + std::unique_lock lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { - delete it->second; it = vars_.erase(it); } else { ++it; @@ -129,7 +133,7 @@ void Scope::Rename(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name] = origin_it->second; + vars_[new_name].reset(origin_it->second.release()); vars_.erase(origin_it); } @@ -141,7 +145,7 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::FindVarLocally(const std::string& name) const { auto it = vars_.find(name); - if (it != vars_.end()) return it->second; + if (it != vars_.end()) return it->second.get(); return nullptr; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index abc82e452d732638a2f7315022074850f299a7ea..95b4f7c5f66a4161058955c7666be34414f5074c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -47,15 +47,18 @@ class Scope { Scope& NewScope() const; /// Create a variable with given name if it doesn't exist. + /// Caller doesn't own the returned Variable. Variable* Var(const std::string& name); /// Create a variable with a scope-unique name. + /// Caller doesn't own the returned Variable. Variable* Var(std::string* name = nullptr); void EraseVars(const std::vector& var_names); /// Find a variable in the scope or any of its ancestors. Returns /// nullptr if cannot find. + /// Caller doesn't own the returned Variable. Variable* FindVar(const std::string& name) const; const Scope* parent() const { return parent_; } @@ -78,13 +81,22 @@ class Scope { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; - Variable* FindVarLocally(const std::string& name) const; + protected: + mutable std::unordered_map> vars_; private: // Call Scope::NewScope for a sub-scope. explicit Scope(Scope const* parent) : parent_(parent) {} - mutable std::unordered_map vars_; + // Called by FindVar recursively. + // Caller doesn't own the returned Variable. + Variable* FindVarInternal(const std::string& name) const; + + // Called by FindVarInternal and Var. + // Caller doesn't own the returned Variable. + Variable* FindVarLocally(const std::string& name) const; + + // Scope in `kids_` are owned by this class. mutable std::list kids_; Scope const* parent_{nullptr}; diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index e97ada06f06d0538f17160220e3aa3f4ffc55520..c7286dacf01659f3af0927a71856e5a6496cb877 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -15,5 +15,102 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" namespace paddle { -namespace framework {} +namespace framework { +extern size_t SizeOfType(std::type_index type); +void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +size_t Tensor::memory_size() const { + return holder_ == nullptr ? 0UL : holder_->size() - offset_; +} + +void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_ENFORCE_GE(numel(), 0, + "When calling this method, the Tensor's numel must be " + "equal or larger than zero. " + "Please check Tensor::Resize has been called first."); + int64_t size = numel() * SizeOfType(type); + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_gpu_place(place) || + platform::is_cuda_pinned_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW( + "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); + } +#else + if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_cuda_pinned_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing."); + return mutable_data(place, holder_->type()); +} + +Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +Tensor Tensor::Slice(int begin_idx, int end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +const DDim& Tensor::dims() const { return dims_; } + +int64_t Tensor::numel() const { return product(dims_); } +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6f878541e6de1deec1829145b1b325ecd176a034..ef224d68f1fc561f45e9d7a81425e62655457648 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -34,6 +34,28 @@ namespace framework { class LoDTensor; class Tensor { +#ifdef PADDLE_WITH_MKLDNN + + public: + inline mkldnn::memory::format format() const { return format_; } + + inline void set_format(const mkldnn::memory::format format) { + format_ = format; + } + + protected: + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; +#endif + public: template friend struct EigenTensor; @@ -54,26 +76,24 @@ class Tensor { /*! Return a pointer to mutable memory block. */ template - inline T* data(); + T* data(); /*! Return a pointer to constant memory block. */ template - inline const T* data() const; + const T* data() const; - inline bool IsInitialized() const; - - inline void switch_place(platform::Place new_place); + bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. */ template - inline T* mutable_data(platform::Place place); + T* mutable_data(platform::Place place); - inline void* mutable_data(platform::Place place, std::type_index type); + void* mutable_data(platform::Place place, std::type_index type); - inline void* mutable_data(platform::Place place); + void* mutable_data(platform::Place place); /** * @brief Return a pointer to mutable memory block. @@ -84,19 +104,19 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(DDim dims, platform::Place place); + T* mutable_data(DDim dims, platform::Place place); /*! Return the dimensions of the memory block. */ - inline const DDim& dims() const; + const DDim& dims() const; /*! Return the numel of the memory block. */ - inline int64_t numel() const; + int64_t numel() const; /*! Resize the dimensions of the memory block. */ - inline Tensor& Resize(const DDim& dims); + Tensor& Resize(const DDim& dims); /*! The internal of two tensors share the same memory block. */ - inline Tensor& ShareDataWith(const Tensor& src); + Tensor& ShareDataWith(const Tensor& src); /** * @brief Return a sub-tensor of the given tensor. @@ -106,7 +126,7 @@ class Tensor { * @param[in] end_idx The index of the end row(exclusive) to slice. * The index number begins from 0. */ - inline Tensor Slice(int begin_idx, int end_idx) const; + Tensor Slice(int begin_idx, int end_idx) const; platform::Place place() const { PADDLE_ENFORCE_NOT_NULL( @@ -123,11 +143,11 @@ class Tensor { // memory size returns the holding memory size in byte. size_t memory_size() const; - inline void check_memory_size() const; + void check_memory_size() const; - inline DataLayout layout() const { return layout_; } + DataLayout layout() const { return layout_; } - inline void set_layout(const DataLayout layout) { layout_ = layout; } + void set_layout(const DataLayout layout) { layout_ = layout; } private: /** @@ -197,8 +217,10 @@ class Tensor { * N,C,H,W for respectively the batch size, the number of * feature maps, the height. */ - - DataLayout layout_ = DataLayout::kNHWC; + // Fix me: here just change the default layout to kNCHW + // it doesn't fix the real issue, i.e. feeder should set up tensor layout + // according to actual input data + DataLayout layout_ = DataLayout::kNCHW; /** * @brief A PlaceHolder may be shared by more than one tensor. @@ -210,15 +232,6 @@ class Tensor { size_t offset_; }; -inline void Tensor::switch_place(platform::Place new_place) { - if (holder_->place() == new_place) { - return; - } - - // TODO(tonyyang-svail): do memcpy here. - PADDLE_THROW("Not Implemented"); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 2f19ec0f0a9338e2b96d1f64eac45387bae4d1eb..96114678a9992f2975c4173c7cc003114f04d8df 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -20,21 +20,6 @@ limitations under the License. */ namespace paddle { namespace framework { -extern size_t SizeOfType(std::type_index type); -inline void Tensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()), memory_size(), - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory.\n" - "or maybe the required data-type mismatches the data already stored."); -} - -inline size_t Tensor::memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; -} - template inline const T* Tensor::data() const { check_memory_size(); @@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) { return reinterpret_cast(mutable_data(place, typeid(T))); } -inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { - if (holder_ != nullptr) { - holder_->set_type(type); - } - PADDLE_ENFORCE_GE(numel(), 0, - "When calling this method, the Tensor's numel must be " - "equal or larger than zero. " - "Please check Tensor::Resize has been called first."); - int64_t size = numel() * SizeOfType(type); - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_) { - if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_gpu_place(place) || - platform::is_cuda_pinned_place(place)) { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW( - "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); - } -#else - if (platform::is_gpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_cuda_pinned_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } - } -#endif - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -inline void* Tensor::mutable_data(platform::Place place) { - PADDLE_ENFORCE(this->holder_ != nullptr, - "Cannot invoke mutable data if current hold nothing."); - return mutable_data(place, holder_->type()); -} - -inline Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); - *this = src; - return *this; -} - -inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { - check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - "The start row index must be greater than 0."); - PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); - PADDLE_ENFORCE_LT( - begin_idx, end_idx, - "The start row index must be lesser than the end row index."); - - if (dims_[0] == 1) { - return *this; - } else { - size_t base = numel() / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - dst.set_layout(layout_); - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); - return dst; - } -} - -inline Tensor& Tensor::Resize(const DDim& dims) { - dims_ = dims; - return *this; -} - -inline const DDim& Tensor::dims() const { return dims_; } - -inline int64_t Tensor::numel() const { return product(dims_); } - inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; res.ShareDataWith(src); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) { TEST(Tensor, Layout) { framework::Tensor src; - ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW); src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 9faf5bb3036775a2ba0c08d3d6ea17ffa73753c6..50835784440bfa177e38f9760bb4a47ad335a9e1 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -15,3 +15,9 @@ cc_test(test_subgraph_splitter DEPS analysis paddle_fluid tensor ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec) + +cc_test(test_dfg_graphviz_draw_pass + SRCS dfg_graphviz_draw_pass_tester.cc + DEPS analysis + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) +set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec) diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..41d4475382befa1bdaf7473520d64005a472a459 --- /dev/null +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file create an DFG_GraphvizDrawPass which helps to draw a data flow + * graph's structure using graphviz. + */ + +#pragma once + +#include +#include +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Output a dot file and write to some place. + */ +class DFG_GraphvizDrawPass : public DataFlowGraphPass { + public: + DFG_GraphvizDrawPass(const std::string& dir, const std::string& id) + : dir_(dir), id_(id) {} + + bool Initialize() override { return Pass::Initialize(); } + void Run(DataFlowGraph* graph) override { + auto content = Draw(graph); + std::ofstream file(GenDotPath()); + file.write(content.c_str(), content.size()); + file.close(); + LOG(INFO) << "draw dot to " << GenDotPath(); + } + + bool Finalize() override { return Pass::Finalize(); } + + Pass* CreatePrinterPass(std::ostream& os, + const std::string& banner) const override { + return nullptr; + } + + private: + // Path of the dot file to output. + std::string GenDotPath() const { + return dir_ + "/" + "graph_" + id_ + ".dot"; + } + + std::string Draw(DataFlowGraph* graph) { return graph->DotString(); } + + std::string dir_; + std::string id_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..3fc1cc18b855440c54c1ed6a9ab49a104c8c21f0 --- /dev/null +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" + +#include +#include +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { + auto dfg = ProgramDescToDFG(desc); + DFG_GraphvizDrawPass pass("./", "test"); + pass.Initialize(); + pass.Run(&dfg); + + // test content + std::ifstream file("./graph_test.dot"); + ASSERT_TRUE(file.is_open()); + + std::string line; + int no{0}; + while (std::getline(file, line)) { + no++; + } + ASSERT_EQ(no, 82); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 153dca576bd6734d62f00c4a7cb9b503506b33e2..58eb0e715cb71d87179f3240de55021603cd7423 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -18,6 +18,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -107,6 +109,13 @@ class OrderedRegistry { std::vector> data_; }; +template +T &GetFromScope(const framework::Scope &scope, const std::string &name) { + framework::Variable *var = scope.FindVar(name); + PADDLE_ENFORCE(var != nullptr); + return *var->GetMutable(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 65db7c7b5008dcb301e741ec17c3623715e10bb8..6b03ac7119b117e442e6af34c719c8a4f736bde9 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,16 +20,20 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); DEFINE_bool(init_p2p, false, "Whether to init p2p."); +DEFINE_int32(math_num_threads, 1, + "Number of threads used to run math functions."); namespace paddle { namespace inference { void Init(const std::vector argv) { framework::InitGflags(argv); + operators::math::SetNumThreads(FLAGS_math_num_threads); // init devices std::vector devices; std::string token; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 23ca8bfac84f35ebdca2e2a1a8538d366358ca8b..748f5a084e8c880df215a60fe51c835ba5cd3110 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,12 +1,15 @@ # Add TRT tests -nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine) -# This test is not stable -# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 -#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc -# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine -# SERIAL) +nv_library(tensorrt_converter + SRCS mul_op.cc conv2d_op.cc fc_op.cc + DEPS tensorrt_engine mul_op) + +nv_test(test_op_converter SRCS test_op_converter.cc DEPS + ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) + nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 79d01b640a214ed5eb86173a36d5e85a6626066f..e1cace9cc1b06f036f52e82b7b86c99a02d50f50 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { @@ -21,7 +22,8 @@ namespace tensorrt { class ReluOpConverter : public OpConverter { public: ReluOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); @@ -32,12 +34,17 @@ class ReluOpConverter : public OpConverter { nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); - engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0)); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } } }; -REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle + +REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 668d344f1bba1c012dcb42c71b996209b4703d78..8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -22,14 +22,14 @@ class Conv2dOpConverter : public OpConverter { public: Conv2dOpConverter() {} void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) override { + const framework::Scope& scope, bool test_mode) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; } }; -REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle + +REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 45b079559754a8f5c3fe39781b5700a75f425e99..bb603efaf30bb72d74b5583abc45d01a16c076a3 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -56,7 +56,7 @@ void ReorderCKtoKC(TensorRTEngine::Weight& iweights, class FcOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) override { + const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); @@ -106,14 +106,16 @@ class FcOpConverter : public OpConverter { n_output, weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); - engine_->DeclareOutput(layer, 0, output_name); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; -REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle +REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 6bb07709c7ee1c6b29c46425849a4f472d3df59d..3c342957360ad4192d838147bf37e84d233c2629 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -23,9 +23,8 @@ namespace tensorrt { */ class MulOpConverter : public OpConverter { public: - MulOpConverter() {} void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) override { + const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); @@ -37,12 +36,18 @@ class MulOpConverter : public OpConverter { engine_, MatrixMultiply, *const_cast(input1), false, *const_cast(input2), false); - engine_->DeclareOutput(layer, 0, op_desc.Output("Out")[0]); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } } }; -REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle + +USE_OP(mul); +REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 3beafeefd06f24ec50b0e61c1fabe13d7e53f242..c7a5a49dd02d0db022fabff5c3ae1c7800bac25c 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -34,12 +35,15 @@ class OpConverter { // Converter logic for an op. virtual void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope) {} + const framework::Scope& scope, + bool test_mode = false) {} - // Convert a single fluid operaotr and add the corresponding layer to TRT. + // Convert a single fluid operator and add the corresponding layer to TRT. + // test_mode: whether the instance executes in an unit test. void ConvertOp(const framework::proto::OpDesc& op, const std::unordered_set& parameters, - const framework::Scope& scope, TensorRTEngine* engine) { + const framework::Scope& scope, TensorRTEngine* engine, + bool test_mode = false) { framework::OpDesc op_desc(op, nullptr); OpConverter* it{nullptr}; @@ -57,7 +61,7 @@ class OpConverter { PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_desc.Type()); it->SetEngine(engine); - (*it)(op, scope); + (*it)(op, scope, test_mode); } // convert fluid block to tensorrt network @@ -77,6 +81,9 @@ class OpConverter { // TensorRT engine TensorRTEngine* engine_{nullptr}; + protected: + bool test_mode_; + private: // registered op converter map, whose key is the fluid op type, and value is // the pointer position of corresponding OpConverter class. @@ -85,13 +92,24 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; -#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ - struct trt_##op_type__##_converter { \ - trt_##op_type__##_converter() { \ - Registry::Register(#op_type__); \ - } \ - }; \ - trt_##op_type__##_converter trt_##op_type__##_converter__; +#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ + struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ + trt_##op_type__##_converter() { \ + ::paddle::inference:: \ + Registry::Register< \ + ::paddle::inference::tensorrt::Converter__>(#op_type__); \ + } \ + }; \ + trt_##op_type__##_converter trt_##op_type__##_converter__; \ + int TouchConverterRegister_##op_type__() { \ + trt_##op_type__##_converter__.Touch(); \ + return 0; \ + } + +#define USE_TRT_CONVERTER(op_type__) \ + extern int TouchConverterRegister_##op_type__(); \ + static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ + TouchConverterRegister_##op_type__(); } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 86ca2ca08eb14265e1bfe7abd5eb6af5c83b8a5c..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -1,106 +1,47 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" - -USE_OP(relu); +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" namespace paddle { namespace inference { namespace tensorrt { -void Compare(const std::string op_type, float input, float expect) { +TEST(ReluOpConverter, main) { framework::Scope scope; - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - - // init fluid op and variable - auto x_var = scope.Var("X"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({1, 1}); - x_tensor->mutable_data(place); - std::vector init; - init.push_back(input); - framework::TensorFromVector(init, ctx, x_tensor); - - auto out_var = scope.Var("Out"); - auto out_tensor = out_var->GetMutable(); - out_tensor->Resize({1, 1}); - out_tensor->mutable_data(place); - - framework::OpDesc op_desc; - op_desc.SetType(op_type); - op_desc.SetInput("X", {"X"}); - op_desc.SetOutput("Out", {"Out"}); - - auto op = framework::OpRegistry::CreateOp(*op_desc.Proto()); - - // run fluid op - op->Run(scope, place); - // get fluid output - std::vector out1; - framework::TensorToVector(*out_tensor, ctx, &out1); - - // init tensorrt op - cudaStream_t stream; - ASSERT_EQ(0, cudaStreamCreate(&stream)); - TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream); - engine->InitNetwork(); - engine->DeclareInput("X", nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 1, 1}); - // convert op - OpConverter op_converter; - op_converter.ConvertOp(*op_desc.Proto(), engine); - - engine->DeclareOutput("Out"); - engine->FreezeNetwork(); - - // convert LoDTensor to ITensor - size_t size = x_tensor->memory_size(); - EngineIOConverter::ConvertInput(op_type, *x_tensor, - engine->buffer("X").buffer, size, &stream); - // run tensorrt Outp - engine->Execute(1); - // convert ITensor to LoDTensor - EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer, - out_tensor, size, &stream); - // get tensorrt output - std::vector out2; - framework::TensorToVector(*out_tensor, ctx, &out2); - - // compare - ASSERT_EQ(out1[0], out2[0]); - ASSERT_EQ(out1[0], expect); - - delete engine; - cudaStreamDestroy(stream); -} - -TEST(OpConverter, ConvertRelu) { - Compare("relu", 1, 1); // relu(1) = 1 - Compare("relu", -5, 0); // relu(-5) = 0 + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); + validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("relu"); + desc.SetInput("X", {"relu-X"}); + desc.SetOutput("Out", {"relu-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(10); } } // namespace tensorrt } // namespace inference } // namespace paddle -USE_OP(activation); +USE_OP(relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 1d3f5eabb2f839b2acfa9da6527589df1ec3767f..9b79f86b0edba983019bd932f52b08711ff36d41 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -36,3 +36,5 @@ TEST(OpConverter, ConvertBlock) { } // namespace tensorrt } // namespace inference } // namespace paddle + +USE_TRT_CONVERTER(conv2d) diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index d7e05dd5b5b235b7b166b22c5b094dc364e28dfc..3b1f531adc5d756259df1c350f7f44bf71ee1f93 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { @@ -63,7 +64,8 @@ class TRTConvertValidation { TRTConvertValidation(int batch_size, const std::unordered_set& parameters, - framework::Scope& scope, int workspace_size = 1 << 10) + framework::Scope& scope, // NOLINT + int workspace_size = 1 << 10) : parameters_(parameters), scope_(scope) { // create engine. engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); @@ -104,8 +106,8 @@ class TRTConvertValidation { void SetOp(const framework::proto::OpDesc& desc) { op_ = framework::OpRegistry::CreateOp(desc); - OpConverter op_converter; - op_converter.ConvertOp(desc, parameters_, scope_, engine_.get()); + Singleton::Global().ConvertOp( + desc, parameters_, scope_, engine_.get(), true /*test_mode*/); engine_->FreezeNetwork(); @@ -150,7 +152,8 @@ class TRTConvertValidation { // Compare two output ASSERT_FALSE(fluid_out.empty()); for (size_t i = 0; i < fluid_out.size(); i++) { - EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6); + // Loose the threshold for CI in different machine model. + EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } } } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 3d75fefc1a735168131a6c67ac073e80aba32945..596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -43,9 +43,10 @@ void TensorRTEngine::Execute(int batch_size) { } TensorRTEngine::~TensorRTEngine() { + cudaStreamSynchronize(*stream_); // clean buffer for (auto& buf : buffers_) { - if (buf.buffer != nullptr) { + if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); buf.buffer = nullptr; buf.max_size = 0; @@ -80,6 +81,8 @@ void TensorRTEngine::FreezeNetwork() { auto& buf = buffer(item.first); CHECK(buf.buffer == nullptr); // buffer should be allocated only once. PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); + VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " + << buf.buffer; buf.size = buf.max_size = item.second; buf.device = DeviceType::GPU; } @@ -96,6 +99,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE(input->isNetworkInput()); TensorRTEngine::SetITensor(name, input); return input; } @@ -109,7 +113,9 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); + PADDLE_ENFORCE(!output->isNetworkInput()); infer_network_->markOutput(*output); + PADDLE_ENFORCE(output->isNetworkOutput()); // output buffers' size can only be decided latter, set zero here to mark this // and will reset latter. buffer_sizes_[name] = 0; @@ -122,6 +128,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) { auto* output = TensorRTEngine::GetITensor(name); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); + PADDLE_ENFORCE(!output->isNetworkInput()); infer_network_->markOutput(*output); // output buffers' size can only be decided latter, set zero here to mark this // and will reset latter. diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index fabcfd9e80cc0ef2637201a1499ebbe2d6adfd8c..b60f00de9fa5fc8f8f4537379bf9ee9c8bb6f31c 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { @@ -131,7 +132,11 @@ class TensorRTEngine : public EngineBase { // TensorRT related internal members template struct Destroyer { - void operator()(T* x) { x->destroy(); } + void operator()(T* x) { + if (x) { + x->destroy(); + } + } }; template using infer_ptr = std::unique_ptr>; @@ -155,6 +160,27 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); +/* + * Helper to control the TensorRT engine's creation and deletion. + */ +class TRT_EngineManager { + public: + TensorRTEngine* Create(int max_batch, int max_workspace, + cudaStream_t* stream) { + engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream)); + return engines_.back().get(); + } + + void DeleteALl() { + for (auto& ptr : engines_) { + ptr.reset(nullptr); + } + } + + private: + std::vector> engines_; +}; + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index dbb81462b8273bd701e9c9f530eaf69817abd6a1..2fa5a9540ba1311c7f87e6675a53044b23dd8276 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -38,3 +38,11 @@ inference_test(recommender_system) #inference_test(rnn_encoder_decoder) #inference_test(understand_sentiment ARGS conv) inference_test(word2vec) + +# This is an unly work around to make this test run +# TODO(TJ): clean me up +cc_test(test_inference_nlp + SRCS test_inference_nlp.cc + DEPS paddle_fluid + ARGS + --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 987da18116cc6f4902bd66ae317f2470a8bc5057..60c761c5281e2f535aab0200c93fb738addcdb87 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_bool(skip_cpu, false, "Skip the cpu test"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); TEST(inference, image_classification) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { @@ -59,10 +58,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "Batch size is " << FLAGS_batch_size; - LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn; TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined, - FLAGS_use_mkldnn); + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); LOG(INFO) << output1.dims(); } diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dcd79c3bb9ed713ff0f12024969cc5798750988 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -0,0 +1,227 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include // NOLINT +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" +#ifdef PADDLE_WITH_MKLML +#include +#include +#endif + +DEFINE_string(model_path, "", "Directory of the inference model."); +DEFINE_string(data_file, "", "File of input index data."); +DEFINE_int32(repeat, 100, "Running the inference program repeat times"); +DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); +DEFINE_int32(num_threads, 1, "Number of threads should be used"); + +inline double GetCurrentMs() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; +} + +// This function just give dummy data for recognize_digits model. +size_t DummyData(std::vector* out) { + paddle::framework::LoDTensor input; + SetupTensor(&input, {1, 1, 28, 28}, -1.f, 1.f); + out->emplace_back(input); + return 1; +} + +// Load the input word index data from file and save into LodTensor. +// Return the size of words. +size_t LoadData(std::vector* out, + const std::string& filename) { + if (filename.empty()) { + return DummyData(out); + } + + size_t sz = 0; + std::fstream fin(filename); + std::string line; + out->clear(); + while (getline(fin, line)) { + std::istringstream iss(line); + std::vector ids; + std::string field; + while (getline(iss, field, ' ')) { + ids.push_back(stoi(field)); + } + if (ids.size() >= 1024) { + // Synced with NLP guys, they will ignore input larger then 1024 + continue; + } + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, ids.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(ids.size()), 1}, paddle::platform::CPUPlace()); + memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t)); + out->emplace_back(words); + sz += ids.size(); + } + return sz; +} + +// Split input data samples into small pieces jobs as balanced as possible, +// according to the number of threads. +void SplitData( + const std::vector& datasets, + std::vector>* jobs, + const int num_threads) { + size_t s = 0; + jobs->resize(num_threads); + while (s < datasets.size()) { + for (auto it = jobs->begin(); it != jobs->end(); it++) { + it->emplace_back(&datasets[s]); + s++; + if (s >= datasets.size()) { + break; + } + } + } +} + +void ThreadRunInfer( + const int tid, paddle::framework::Scope* scope, + const std::vector>& jobs) { + // maybe framework:ProgramDesc is not thread-safe + auto& sub_scope = scope->NewScope(); + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + auto inference_program = + paddle::inference::Load(&executor, scope, FLAGS_model_path); + + auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0); + executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0); + + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + std::map feed_targets; + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + + auto& inputs = jobs[tid]; + auto start_ms = GetCurrentMs(); + for (size_t i = 0; i < inputs.size(); ++i) { + feed_targets[feed_target_names[0]] = inputs[i]; + executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets, + &fetch_targets, false /*create_local_scope*/); + } + auto stop_ms = GetCurrentMs(); + scope->DeleteScope(&sub_scope); + LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() + << " samples, avg time per sample: " + << (stop_ms - start_ms) / inputs.size() << " ms"; +} + +TEST(inference, nlp) { + if (FLAGS_model_path.empty()) { + LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model"; + } + if (FLAGS_data_file.empty()) { + LOG(WARNING) << "No data file provided, will use dummy data!" + << "Note: if you use nlp model, please provide data file."; + } + LOG(INFO) << "Model Path: " << FLAGS_model_path; + LOG(INFO) << "Data File: " << FLAGS_data_file; + + std::vector datasets; + size_t num_total_words = LoadData(&datasets, FLAGS_data_file); + LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size(); + LOG(INFO) << "Total number of words: " << num_total_words; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + std::unique_ptr scope( + new paddle::framework::Scope()); + +#ifdef PADDLE_WITH_MKLML + // only use 1 thread number per std::thread + omp_set_dynamic(0); + omp_set_num_threads(1); + mkl_set_num_threads(1); +#endif + + double start_ms = 0, stop_ms = 0; + if (FLAGS_num_threads > 1) { + std::vector> jobs; + SplitData(datasets, &jobs, FLAGS_num_threads); + std::vector> threads; + start_ms = GetCurrentMs(); + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads.emplace_back( + new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs))); + } + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads[i]->join(); + } + stop_ms = GetCurrentMs(); + } else { + // 1. Define place, executor, scope + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, + /*model combined*/ false); + // always prepare context + std::unique_ptr ctx; + ctx = executor.Prepare(*inference_program, 0); + if (FLAGS_prepare_vars) { + executor.CreateVariables(*inference_program, scope.get(), 0); + } + // preapre fetch + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + // prepare feed + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + std::map feed_targets; + + // feed data and run + start_ms = GetCurrentMs(); + for (size_t i = 0; i < datasets.size(); ++i) { + feed_targets[feed_target_names[0]] = &(datasets[i]); + executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets, + &fetch_targets, !FLAGS_prepare_vars); + } + stop_ms = GetCurrentMs(); + LOG(INFO) << "Tid: 0, process " << datasets.size() + << " samples, avg time per sample: " + << (stop_ms - start_ms) / datasets.size() << " ms"; + } + LOG(INFO) << "Total inference time with " << FLAGS_num_threads + << " threads : " << (stop_ms - start_ms) / 1000.0 + << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000); +} diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be662da22fe15a79cd9abfe5fa92c9577..44c36b1683b037832a218df02184e7cd2ba143e9 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(use_mkldnn); + template void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, T lower, T upper) { @@ -133,24 +135,11 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } -void EnableMKLDNN( - const std::unique_ptr& program) { - for (size_t bid = 0; bid < program->Size(); ++bid) { - auto* block = program->MutableBlock(bid); - for (auto* op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -} - template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, - const int repeat = 1, const bool is_combined = false, - const bool use_mkldnn = false) { + const int repeat = 1, const bool is_combined = false) { // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); @@ -182,9 +171,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - if (use_mkldnn) { - EnableMKLDNN(inference_program); - } } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, @@ -210,7 +196,10 @@ void TestInference(const std::string& dirname, fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; } - // 6. Run the inference program + // 6. If export Flags_use_mkldnn=True, use mkldnn related ops. + if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program); + + // 7. Run the inference program { if (!CreateVars) { // If users don't want to create and destroy variables every time they diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index de6ff29c6f8edbcf930546ff157a1c226e1311db..d6a36eff09c7f70803d3be619b26d16660da1ec2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -166,8 +166,6 @@ function(op_library TARGET) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "reduce") - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") else() @@ -188,19 +186,23 @@ endif() add_subdirectory(detail) if(WITH_DISTRIBUTE) - - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + + set(DISTRIBUTE_DEPS "") + if(WITH_GRPC) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + else() + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + endif() + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -210,15 +212,18 @@ if(WITH_DISTRIBUTE) # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op - listen_and_serv_op executor SERIAL) - op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) + if(WITH_GRPC) + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + else() + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc) + endif() set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) @@ -227,6 +232,8 @@ op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine) + nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc + DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index b892ac77d9ed60210ddadaecb1a4f214e5a25180..46ed99bcf2234f7621d9f00eb48c846d8a355795 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -222,35 +222,35 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor { }; template -using ReluMkldnnFunctor = +using ReluMKLDNNFunctor = MKLDNNActivationFunc; template -using TanhMkldnnFunctor = +using TanhMKLDNNFunctor = MKLDNNActivationFunc; template -using SqrtMkldnnFunctor = +using SqrtMKLDNNFunctor = MKLDNNActivationFunc; template -using AbsMkldnnFunctor = +using AbsMKLDNNFunctor = MKLDNNActivationFunc; template -using ReluMkldnnGradFunctor = +using ReluMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using TanhMkldnnGradFunctor = +using TanhMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using SqrtMkldnnGradFunctor = +using SqrtMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using AbsMkldnnGradFunctor = +using AbsMKLDNNGradFunctor = MKLDNNActivationGradFunc; } // namespace operators } // namespace paddle @@ -265,9 +265,9 @@ namespace ops = paddle::operators; ops::MKLDNNActivationGradKernel>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \ - __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \ - __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \ - __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); + __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ + __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ + __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ + __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index dd71c66a75a039429f6e4b1771bb31508bb6b56d..af1d85047e519df6766b2139a0445ae9dc5945e2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -24,12 +24,12 @@ namespace operators { : public ::paddle::framework::OpProtoAndCheckerMaker { \ public: \ void Make() override { \ - AddInput("X", "Input of " #OP_NAME "operator"); \ - AddOutput("Out", "Output of" #OP_NAME "operator"); \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \ AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ - AddComment(#OP_COMMENT); \ + AddComment(OP_COMMENT); \ } \ } @@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; + + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( framework::ToDataType(ctx.Input(name)->type()), ctx.GetPlace(), layout, library); diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 99b0239855d6241b064a5883c2be3d58078b3b61..6ee73c3000fb45b4e1cd5bbb730da7d61b494b6f 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -89,9 +89,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("Moment1Out", "(Tensor) Output first moment"); - AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param"); + AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1"); + AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2"); AddAttr("beta1", "(float, default 0.9) " diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8174d3735859b1fac40cd4c07545f34874d31ab7 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMaxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a147d77a9e9c577984028e1a6ed9582dda622069 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h new file mode 100644 index 0000000000000000000000000000000000000000..6cbdaefeda099c36a864289ef8195c20d09c55e6 --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +enum ArgMinMaxType { kArgMin, kArgMax }; + +template +struct ArgMinMaxFunctor {}; + +#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ + template \ + struct ArgMinMaxFunctor { \ + void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ + framework::LoDTensor* out, int64_t axis) { \ + auto in_eigen = framework::EigenTensor::From(in); \ + auto out_eigen = framework::EigenTensor::From(*out); \ + out_eigen.device(*(ctx.eigen_device())) = \ + in_eigen.eigen_op_type(axis).template cast(); \ + } \ + } + +DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); +DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); + +template +class ArgMinMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMinMaxFunctor \ + functor##rank; \ + functor##rank(dev_ctx, x, &out, axis) + + switch (x.dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_THROW( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + } +}; + +template +using ArgMinKernel = + ArgMinMaxKernel; + +template +using ArgMaxKernel = + ArgMinMaxKernel; + +class ArgMinMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + int64_t axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(), + "'axis' must be inside [-Rank(X), Rank(X))"); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + + std::vector vec; + for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); + for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); + ctx->SetOutputDim("Out", framework::make_ddim(vec)); + } +}; + +class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + virtual const char* OpName() const = 0; + virtual const char* Name() const = 0; + + public: + void Make() override { + AddInput("X", "Input tensor."); + AddOutput("Out", "Output tensor."); + AddAttr("axis", "The axis in which to compute the arg indics."); + AddComment(string::Sprintf(R"DOC( + %s Operator. + + Computes the indices of the %s elements of the input tensor's element + along the provided axis. +)DOC", + OpName(), Name())); + } +}; + +class ArgMinOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMin"; } + const char* Name() const override { return "min"; } +}; + +class ArgMaxOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMax"; } + const char* Name() const override { return "max"; } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..41f188029f17dbe8717afc0ca0760a39edc24b54 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMinOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4d020508505a6ebac8be41ce1e4f99d436b67ab5 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 0e4a56d4a45a732cfcf43b09228bc0c44df5924c..8206cc9890160da756efb13c991020f09b20126a 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -19,10 +19,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using batch_norm_bwd = mkldnn::batch_normalization_backward; +using batch_norm_fwd = mkldnn::batch_normalization_forward; +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; -using mkldnn::memory; +using platform::to_void_cast; template using EigenArrayMap = @@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) { mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } -template -inline void *cast_const_to_void(const T *t) { - return static_cast(const_cast(t)); -} } // namespace template class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for Input x tensor"); + + const T *x_data = x->data(); + const T *mean_data = mean->data(); + const T *variance_data = variance->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + T *mean_out_data = mean_out->mutable_data(ctx.GetPlace()); + T *variance_out_data = variance_out->mutable_data(ctx.GetPlace()); + T *batch_mean_data = nullptr; + T *batch_variance_data = nullptr; if (!is_test) { - batch_mean->mutable_data(ctx.GetPlace()); - batch_variance->mutable_data(ctx.GetPlace()); + batch_mean_data = batch_mean->mutable_data(ctx.GetPlace()); + batch_variance_data = batch_variance->mutable_data(ctx.GetPlace()); } auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring : mkldnn::prop_kind::forward_training; - auto dims = paddle::framework::vectorize2int(x->dims()); - - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - - auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine}; - - auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data())}; - auto dst = mkldnn::memory{dst_pd, y->data()}; + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + const unsigned int ic = scale_tz[0]; unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); + + // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = - bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ + propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; + std::shared_ptr batch_norm_fwd_pd = + std::shared_ptr( + new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, + mkldnn_engine)); - const unsigned int ic = dims[1]; + // Save the pd to be used in backward pass + const std::string key = ctx.op().Output("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale->data(), scale->data() + ic, shift->data(), shift->data() + ic, &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // crate mkldnn memory for weights(scale/shift) + auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), + scaleshift_data.data()); - if (is_test) { - auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(mean->data())}; + // create mkldnn memory for output y tensor + auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + if (is_test) { + // create mkldnn memory for stats (as input) + auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), + to_void_cast(mean_data)); auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(variance->data())}; + memory(batch_norm_fwd_pd->variance_primitive_desc(), + to_void_cast(variance_data)); run_batch_norm_op( - batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, + *batch_norm_fwd_pd, src_memory, + (const mkldnn::primitive::at &)mean_memory, (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst); + dst_memory); } else { + // create mkldnn memory for stats (as output) auto mean_memory = - mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; + memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); + auto variance_memory = memory( + batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - run_batch_norm_op(batch_norm_fwd_pd, src, - scaleshift_memory, dst, + run_batch_norm_op(*batch_norm_fwd_pd, src_memory, + scaleshift_memory, dst_memory, mean_memory, variance_memory); } if (!is_test) { - const unsigned int in = dims[0]; - const unsigned int sample_size = x->numel() / in / ic; - - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - batch_mean->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap saved_variance_e( - batch_variance->mutable_data(ctx.GetPlace()), ic); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - const unsigned int x_arr_size = in * ic; - ConstEigenArrayMap x_arr(x->data(), sample_size, x_arr_size); - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_mean_e(nc % ic) += x_arr.col(nc).sum(); - } - saved_mean_e /= in * sample_size; - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_variance_e(nc % ic) += - (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm(); - } - saved_variance_e /= in * sample_size; - - ConstEigenVectorArrayMap mean_arr{mean->data(), ic}; - ConstEigenVectorArrayMap variance_arr{variance->data(), ic}; - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), ic); + // mkldnn only compute stats for current batch + // so we need compute momentum stats via Eigen lib + EigenVectorArrayMap batch_mean_e(batch_mean_data, ic); + EigenVectorArrayMap batch_variance_e(batch_variance_data, ic); + ConstEigenVectorArrayMap mean_e(mean_data, ic); + ConstEigenVectorArrayMap variance_e{variance_data, ic}; + + EigenVectorArrayMap running_mean_e(mean_out_data, ic); + EigenVectorArrayMap running_variance_e(variance_out_data, ic); auto one_minus_momentum = 1. - momentum; - running_mean_arr = - mean_arr * momentum + saved_mean_e * one_minus_momentum; - running_var_arr = - variance_arr * momentum + saved_variance_e * one_minus_momentum; + running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum; + running_variance_e = + variance_e * momentum + batch_variance_e * one_minus_momentum; } + + y->set_layout(DataLayout::kMKLDNN); + y->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -217,11 +212,6 @@ template class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - auto &dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); @@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); - diff_x->mutable_data(ctx.GetPlace()); - diff_scale->mutable_data(ctx.GetPlace()); - diff_shift->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && + diff_y->format() != memory::format::format_undef, + "Wrong layout/format set for Input diff_y tensor"); + + const T *x_data = x->data(); + const T *diff_y_data = diff_y->data(); + const T *batch_mean_data = batch_mean->data(); + const T *batch_variance_data = batch_variance->data(); + const T *scale_data = scale->data(); + const T *shift_data = shift->data(); + T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); + T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); + + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto diff_src_tz = src_tz; + auto dst_tz = src_tz; + auto diff_dst_tz = dst_tz; + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + + const unsigned int ic = scale_tz[0]; + + // Retrieve bn_fwd_pd from device context + const std::string key = ctx.op().Input("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - auto dims = paddle::framework::vectorize2int(x->dims()); - unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats; + using bn_bwd_types = bn_type_traits; - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + // create mkldnn memory from input diff_y tensor + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, + mkldnn_engine}, + to_void_cast(diff_y_data)); - using bn_bwd_types = bn_type_traits; - using bn_fwd_types = bn_type_traits; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + // create primitive descriptor for batch norm backward + unsigned flags = mkldnn::use_scale_shift; auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; + mkldnn::prop_kind::backward, diff_dst_md, + src_memory.get_primitive_desc().desc(), epsilon, flags}; auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; - - auto src = mkldnn::memory{{src_md, mkldnn_engine}, - cast_const_to_void(x->data())}; - - auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance = - mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; - - auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine}, - cast_const_to_void(diff_y->data())}; + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = user_diff_dst_memory; + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = memory(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); + is_diff_dst_reordered = true; + } - const unsigned int ic = dims[1]; + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; std::vector scaleshift_data; scaleshift_data.reserve(scaleshift_size); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, + &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // create mkldnn memory for input tensors (scale/shift) + auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), + scaleshift_data.data()); + // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - copy_to_weights(diff_scale->data(), diff_scale->data() + ic, - diff_shift->data(), diff_shift->data() + ic, - &diff_scaleshift_data); - auto diff_scaleshift_memory = - mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()}; - - auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, - static_cast(diff_x->data())}; - - run_batch_norm_op( - batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, - diff_src, diff_scaleshift_memory); - + memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward primitive + auto batch_norm_bwd_prim = + batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, + variance_memory, diff_dst_memory, scaleshift_memory, + diff_src_memory, diff_scaleshift_memory); + + // execute optional reorder and batch_norm backward primitive + std::vector pipeline; + if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); + pipeline.push_back(batch_norm_bwd_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // copy back diff sacle/shift to output tensors (diff scale/shift) + diff_scaleshift_data.resize(scaleshift_size); auto it = std::begin(diff_scaleshift_data); - std::copy(it, std::next(it, ic), diff_scale->data()); + std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), - diff_shift->data()); + diff_shift_data); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() + .desc() + .data.format); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNOpKernel); -REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 6ec8c9d18b466142acdb46b0f46826a2aca7a47e..625ca2d7c4c70d1098b0fb28380d8d1eb24cb338 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -110,17 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel { ctx.Input("Variance")->type()), "Variance input should be of float type"); - framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library_); + library); } }; @@ -149,13 +151,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Variance", "The global variance (for training) " "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization"); + AddOutput("Y", "result after normalization").Reuse("X"); AddOutput("MeanOut", "Share memory with Mean. " - "Store the global mean when training"); + "Store the global mean when training") + .Reuse("Mean"); AddOutput("VarianceOut", "Share memory with Variance. " - "Store the global Variance when training"); + "Store the global Variance when training") + .Reuse("Variance"); AddOutput("SavedMean", "Mean of the current mini batch, " "will apply to output when training") @@ -366,18 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } - framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout, library_); + layout, library); } }; diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index 483c9f8c2191fa4eb98b91112f9d6753e2fbddc3..fc15d56891cf7af10a91ca22a09c84fa2e52d465 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() final { - AddInput("Input", - "(Tensor) Tensor " - "whose input_dim_idx'th dimension specifies the batch_size"); + AddInput( + "Input", + "Tensor whose input_dim_idx'th dimension specifies the batch_size"); AddOutput("Out", - "(Tensor) Tensor of specified shape will be filled " + "Tensor of specified shape will be filled " "with the specified value"); - AddAttr>("shape", "(vector) The shape of the output"); + AddAttr>("shape", "The shape of the output"); AddAttr("input_dim_idx", - "(int, default 0) The index of input's batch size dimension") + "default 0. The index of input's batch size dimension") .SetDefault(0); AddAttr("output_dim_idx", - "(int, default 0) The index of output's batch size dimension") + "default 0. The index of output's batch size dimension") .SetDefault(0); Apply(); } diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 3321adf2743c28f6eeca8b5cc91ef89beed6b97c..2572e813d656353a2187c29da89266733a32f3ce 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor) The input tensor of bilinear interpolation, " + "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of (N x C x h x w)"); AddInput("OutSize", - "(Tensor) This is a 1-D tensor with two number. " + "This is a 1-D tensor with two number. " "The first number is height and the second number is width.") .AsDispensable(); - AddOutput("Out", - "(Tensor) The dimension of output is (N x C x out_h x out_w]"); + AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); - AddAttr("out_h", "(int) output height of bilinear interpolation op."); - AddAttr("out_w", "(int) output width of bilinear interpolation op."); + AddAttr("out_h", "output height of bilinear interpolation op."); + AddAttr("out_w", "output width of bilinear interpolation op."); AddComment(R"DOC( Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 7a7b8b76e43b1f91a3ba2767c217993cc39f26b6..1828be57b5a54005a0066b18ebebdb740726f67a 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -DEFINE_bool(cudnn_algo_use_autotune, true, +DEFINE_bool(cudnn_deterministic, true, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " "false, the algorithm is deterministic."); @@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - if (FLAGS_cudnn_algo_use_autotune) { + if (FLAGS_cudnn_deterministic) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (FLAGS_cudnn_algo_use_autotune) { + if (FLAGS_cudnn_deterministic) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 63d371310d2a26a1460e527fc51923dfd6e0b8bc..6b06913d1c83f4534238ac3dd22ac4035c0f0fbf 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -18,6 +18,17 @@ namespace paddle { namespace operators { +using conv_bwd_data = mkldnn::convolution_backward_data; +using conv_bwd_weights = mkldnn::convolution_backward_weights; +using conv_fwd = mkldnn::convolution_forward; +using framework::DataLayout; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; +using platform::GetMKLDNNFormat; + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + // Get unique name for index + const std::string key = ctx.op().Output("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); - // Get an unique name from "argument" name of "Output" variable - // This name will be used as key when saving info into device context - const std::string key = ctx.op().Output("Output"); - const std::string key_conv_pd = key + "@conv_pd"; + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); - auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); - - std::shared_ptr conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine); - - // save conv_pd into global device context to be referred in backward path - dev_ctx.SetBlob(key_conv_pd, conv_pd); + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + + // create a conv primitive descriptor and save it for usage in backward + std::shared_ptr conv_pd = ConvFwdPrimitiveDesc( + src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_pd->src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_pd->src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_pd->weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_pd->weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + // create memory primitive for conv dst + auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); // create convolution op primitive - auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, - weights_memory, dst_memory); + auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory); // push primitive to stream and wait until it's executed - std::vector pipeline{conv_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_weights_reordered) pipeline.push_back(reorder_weights); + pipeline.push_back(conv_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); } private: - std::unique_ptr - ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, - const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& dst, - const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine) const { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto conv_desc = mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, - dst, stride_dims, padding_dims, padding_dims, - mkldnn::padding_kind::zero); - - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); - - return std::unique_ptr( - p_conv_pd); + std::unique_ptr ConvFwdPrimitiveDesc( + const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = + conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, + src, weights, dst, stride_dims, padding_dims, + padding_dims, mkldnn::padding_kind::zero); + + auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); + + return std::unique_ptr(p_conv_pd); } }; @@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN && + output->format() != memory::format::format_undef, + "Wrong layout/format set for Output tensor"); + PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN && + output_grad->format() != memory::format::format_undef, + "Wrong layout/format set for output_grad tensor"); + if (!input_grad && !filter_grad) return; // Get an unique name from "argument" name of "Output" variable @@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto diff_src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - // create memory - auto diff_dst_memory = mkldnn::memory( - {diff_weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(output_grad_data))); + // create mkldnn memory from input tensors (input/weights/output_grad) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + auto user_diff_dst_memory = + memory({{{dst_tz}, memory::data_type::f32, output_grad->format()}, + mkldnn_engine}, + to_void_cast(output_grad_data)); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + // Retrieve conv_pd from device context - auto conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); + auto conv_pd = std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); // create backward conv primitive for weights if (filter_grad) { - // create primitive descriptor - mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = - ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, - strides, paddings, *conv_pd, - mkldnn_engine); - - // create memory + // create backward convolution primitive descriptor + auto conv_bwd_weights_desc = conv_bwd_weights::desc( + mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc( + conv_bwd_weights_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_bwd_weights_pd.src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + + auto diff_dst_memory_4filter = user_diff_dst_memory; + primitive reorder_diff_dst_4filter; + bool is_diff_dst_reordered_4filter = false; + if (memory::primitive_desc( + conv_bwd_weights_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4filter = + memory(conv_bwd_weights_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4filter = + reorder(user_diff_dst_memory, diff_dst_memory_4filter); + is_diff_dst_reordered_4filter = true; + } + + // create mkldnn memory for output (i.e. diff weights) auto diff_weights_memory = - mkldnn::memory({diff_weights_md, mkldnn_engine}, - reinterpret_cast(filter_grad_data)); - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); + memory(conv_bwd_weights_pd.diff_weights_primitive_desc(), + reinterpret_cast(filter_grad_data)); // create backward conv primitive for weights - auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( - conv_bwd_weights_pd, src_memory, diff_dst_memory, - diff_weights_memory); + auto conv_bwd_weights_prim = + conv_bwd_weights(conv_bwd_weights_pd, src_memory, + diff_dst_memory_4filter, diff_weights_memory); // push primitive and execute it - std::vector pipeline{conv_bwd_weights_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_diff_dst_reordered_4filter) + pipeline.push_back(reorder_diff_dst_4filter); + pipeline.push_back(conv_bwd_weights_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory)); } if (input_grad) { - // create primitive descriptor - mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = - ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, - strides, paddings, *conv_pd, mkldnn_engine); - - // create memory - auto diff_src_memory = mkldnn::memory( - {diff_src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_grad_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); + // create backward convolution primitive descriptor + auto conv_bwd_data_desc = conv_bwd_data::desc( + mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_data_pd = conv_bwd_data::primitive_desc( + conv_bwd_data_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + auto diff_dst_memory_4data = user_diff_dst_memory; + primitive reorder_diff_dst_4data; + bool is_diff_dst_reordered_4data = false; + if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4data = + memory(conv_bwd_data_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4data = + reorder(user_diff_dst_memory, diff_dst_memory_4data); + is_diff_dst_reordered_4data = true; + } + + // create mkldnn memory for output (i.e. diff src) + auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(), + reinterpret_cast(input_grad_data)); // create backward conv primitive for data - auto conv_bwd_data_prim = mkldnn::convolution_backward_data( - conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); + auto conv_bwd_data_prim = + conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory, + diff_src_memory); - // push primitive to stream and wait until it's executed - std::vector pipeline{conv_bwd_data_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + // push primitive and execute it + std::vector pipeline; + if (is_weights_reordered) pipeline.push_back(reorder_weights); + if (is_diff_dst_reordered_4data) + pipeline.push_back(reorder_diff_dst_4data); + pipeline.push_back(conv_bwd_data_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + input_grad->set_layout(DataLayout::kMKLDNN); + input_grad->set_format(GetMKLDNNFormat(diff_src_memory)); } } // Compute() - - private: - mkldnn::convolution_backward_weights::primitive_desc - ConvBwdWeightsPrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( - mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc( - conv_bwd_weights_desc, engine, conv_pd); - } - - mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( - mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, - engine, conv_pd); - } }; } // namespace operators diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 697d91484257984b104a13b0572cf19b16f8d37e..37153d58439a90190eb2ad82d5dcc145e22dfa48 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -75,6 +75,10 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; @@ -84,6 +88,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif @@ -99,9 +104,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library); } @@ -122,7 +124,8 @@ void Conv2DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); + "The format of output tensor is also NCHW.") + .Reuse("Input"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -217,7 +220,8 @@ void Conv3DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator." - "The format of output tensor is also NCDHW."); + "The format of output tensor is also NCDHW.") + .Reuse("Input"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " @@ -309,6 +313,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -318,12 +326,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 669b3bbe9df4cae1aa381184092dfa51157ab6a3..5b5a220cf90e7813f914ae35733e7a4103391b2d 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", y_dim); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; class CropOpMaker : public framework::OpProtoAndCheckerMaker { @@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker { "The input used as reference for cropping, " "which is of the same dimensions as X.") .AsDispensable(); + AddInput("Offsets", + "The input used to describe offsets in runtime, which is a " + "1-D vector whose size equals to the rank of input 'X'. The " + "elements data type must be int.") + .AsDispensable(); AddOutput("Out", "The output of crop op, " "which is of the same dimensions as X."); AddAttr>("offsets", "A list describing offsets to be cropped. " "The size of offsets list should be the same as " - "the dimension size of input X."); + "the dimension size of input X.") + .SetDefault(std::vector()); AddAttr>("shape", "A list describing the shape of output. " "The size of shape list should be the same as " @@ -77,6 +90,17 @@ Crop Operator. Crop input into output, as specified by offsets and shape. +There are two ways to set the offsets: +1. In runtime: Using the input 'Offsets', which is a Vairbale and can be + output of other operators. This way is suitable for + dynamic offsets. +2. In network configuration: Using the attribute 'offsets', which will be + set in Python configure script. This way is + suitable for fixed offsets. +You CANNOT use these two ways at the same time. An exception will be raised +if input 'Offset' is configured and meanwhile the attribute 'offsets' is +not empty. + There are two ways to set shape: 1. reference input: crop input X into the same shape as reference input. The dimension of reference input should @@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index f05c2e23284e3a24cf48442996f671ec6084c391..91cfbbda7352c9b1676aae99e2bd57ccc9e10069 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -27,6 +27,37 @@ template ; using framework::Tensor; +static std::vector GetOffsets(const framework::ExecutionContext& ctx) { + std::vector res; + int rank = ctx.Input("X")->dims().size(); + if (ctx.HasInput("Offsets")) { + PADDLE_ENFORCE(ctx.Attr>("offsets").empty(), + "Input 'Offsets' and attribute 'offsets' should not be used " + "at the same time."); + const auto* offsets_tensor = ctx.Input("Offsets"); + PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1); + PADDLE_ENFORCE_EQ( + rank, offsets_tensor->dims()[0], + "Offsets size should be equal to dimension size of input tensor."); + const int* offsets_data; + framework::Tensor cpu_tmp_tensor; + if (platform::is_cpu_place(offsets_tensor->place())) { + offsets_data = offsets_tensor->data(); + } else { + framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(), + &cpu_tmp_tensor); + offsets_data = cpu_tmp_tensor.data(); + } + res = std::vector(offsets_data, offsets_data + rank); + } else { + res = ctx.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + rank, res.size(), + "Offsets size should be equal to dimension size of input tensor."); + } + return res; +} + template class CropKernel : public framework::OpKernel { public: @@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); auto x_stride = framework::stride(x->dims()); auto out_stride = framework::stride(out->dims()); - auto offsets = context.Attr>("offsets"); - PADDLE_ENFORCE_EQ( - x->dims().size(), static_cast(offsets.size()), - "Offsets size should be equal to dimension size of input tensor."); + auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { offset += (x_stride[i] * offsets[i]); @@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) { if (d_x != nullptr) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(context.GetPlace()); - auto offsets = context.Attr>("offsets"); + auto offsets = GetOffsets(context); Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a3bec3da45136bca5cb2763e7ffd6b67703a1813..d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -124,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss."); + "[N x 1]. The cross entropy loss.") + .Reuse("X"); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index cf20530513cf6cd420e56b2f6378225f73c2bc8b..abc5aad0430e71928a441c9488dda16dfdd63b9c 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,12 +1,38 @@ -if(WITH_DISTRIBUTE) +if(NOT WITH_DISTRIBUTE) + return() +endif() + + +if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc SERIAL) - cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc + cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) + return() endif() + + +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) + +find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) + + +find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) + +cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc + brpc protobuf leveldb gflags glog + protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/detail/brpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a4e410f1d83e93883438fae116c38eb60787673 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_client.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace operators { +namespace detail { + +DEFINE_int32(brpc_channel_num, 24, + "Number of channels to send requests connected to one server"); +DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); + +BRPCClient::~BRPCClient() { Wait(); } + +void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; +} + +bool BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + google::protobuf::Closure* done = + brpc::NewCallback(&HandleSendResponse, cntl, response); + + sendrecv::VariableMessage request; + ch_ctx->stub->SendVariable(cntl, &request, response, done); + }); + req_count_++; + + return true; +} + +void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; + + // framework::Variable* outvar = nullptr; + // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); +} + +bool BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + + req_count_++; + + return true; +} + +bool BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] {}); + + req_count_++; + return true; +} + +void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + } + + ChannelQueuePtr q(new framework::BlockingQueue()); + + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "pooled"; + options.connect_timeout_ms = 100; + options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; + options.max_retry = FLAGS_max_retry; + for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + std::shared_ptr c(new ChannelContext()); + if (c->channel.Init(ep.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return nullptr; + } + + c->stub.reset(new sendrecv::SendRecvService_Stub( + static_cast(&c->channel))); + q->Push(c); + } + + { + std::lock_guard guard(chan_mutex_); + channels_[ep] = q; + } + + return q; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/detail/brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..1e953ea431d51a9586bfd0b352c7f27d079ff1a8 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_client.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace detail { + +struct ChannelContext { + brpc::Channel channel; + std::shared_ptr stub; +}; + +typedef std::shared_ptr ChannelContextPtr; +typedef std::shared_ptr> + ChannelQueuePtr; + +class BRPCClient : public RPCClient { + public: + BRPCClient() {} + virtual ~BRPCClient(); + + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + void AsyncSendBatchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; + + void AsyncSendFetchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; + + void Wait() override; + + private: + void Proceed(); + ChannelQueuePtr GetChannel(const std::string& ep); + + private: + std::unordered_map channels_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(BRPCClient); +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/detail/brpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..2170abe679f9ededff3b53e3139e56f8aad227cb --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_server.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/brpc_server.h" +#include "paddle/fluid/operators/detail/request_handler.h" + +namespace sendrecv { + +typedef std::unordered_map + HandlerMap; + +class BRPCServiceImpl : public SendRecvService { + public: + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) + : request_send_h_(nullptr), + request_get_h_(nullptr), + request_prefetch_h_(nullptr) { + auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + if (it != rpc_call_map.end()) { + request_send_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + if (it != rpc_call_map.end()) { + request_get_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); + if (it != rpc_call_map.end()) { + request_prefetch_h_ = it->second; + } + } + + virtual ~BRPCServiceImpl() {} + + void SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_send_h_ != nullptr, + "RequestSend handler should be registed first!"); + brpc::ClosureGuard done_guard(done); + + paddle::framework::Scope* local_scope = request_send_h_->scope(); + paddle::framework::Variable* outvar = nullptr; + paddle::framework::Variable* invar = nullptr; + + std::string varname = request->varname(); + + if (!request_send_h_->sync_mode()) { + local_scope = &request_send_h_->scope()->NewScope(); + invar = local_scope->Var(varname); + } else { + invar = local_scope->FindVar(varname); + } + + request_send_h_->Handle(varname, local_scope, invar, &outvar); + + if (!request_send_h_->sync_mode()) { + request_send_h_->scope()->DeleteScope(local_scope); + } + } + + void GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_get_h_ != nullptr, + "RequestGet handler should be registed first!"); + } + + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_prefetch_h_ != nullptr, + "kRequestPrefetch handler should be registed first!"); + } + + private: + paddle::operators::detail::RequestHandler* request_send_h_; + paddle::operators::detail::RequestHandler* request_get_h_; + paddle::operators::detail::RequestHandler* request_prefetch_h_; +}; +} // namespace sendrecv + +namespace paddle { +namespace operators { +namespace detail { + +void AsyncBRPCServer::StartServer() { + // Instance of your service. + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + + // Add the service into server. Notice the second parameter, because the + // service is put on stack, we don't want server to delete it, otherwise + // use brpc::SERVER_OWNS_SERVICE. + if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(FATAL) << "Fail to add service"; + return; + } + + brpc::ServerOptions options; + options.idle_timeout_sec = idle_timeout_s_; + options.max_concurrency = max_concurrency_; + if (server_.Start(bind_address_.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to start EchoServer" << bind_address_; + return; + } + + butil::EndPoint ep = server_.listen_address(); + selected_port_ = ep.port; + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + server_.Join(); +} + +void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } + +void AsyncBRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; +} + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/detail/brpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..0105c8074a46849031d8fa9c21a5507a982ec3c3 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_server.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include // NOLINT +#include + +#include "brpc/server.h" +#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +class AsyncBRPCServer final : public RPCServer { + public: + explicit AsyncBRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncBRPCServer() {} + void StartServer() override; + void WaitServerReady() override; + + private: + void ShutDownImpl() override; + + brpc::Server server_; + + static constexpr int idle_timeout_s_ = -1; + static constexpr int max_concurrency_ = 0; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index da9ca1a0c1d55018141f0e4285fe35d7c437fd55..02ffe3651e1deefcf6981c3d304d64b9a01661bf 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -19,32 +19,43 @@ limitations under the License. */ #include #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace detail { -std::once_flag RPCClient::init_flag_; +void GRPCClient::InitImpl() { InitEventLoop(); } -std::unique_ptr RPCClient::rpc_client_(nullptr); +void GRPCClient::InitEventLoop() { + // start the client process thread + // TODO(wuyi): can make this in a threadpool + client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); +} -RPCClient* RPCClient::GetInstance() { - std::call_once(init_flag_, &RPCClient::Init); - return rpc_client_.get(); +void GRPCClient::SendComplete() { + for (auto& it : channels_) { + this->AsyncSendComplete(it.first); + } } -void RPCClient::Init() { - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new RPCClient()); +GRPCClient::~GRPCClient() { + Wait(); + cq_.Shutdown(); + { + std::lock_guard guard(chan_mutex_); + for (auto& it : channels_) { + it.second.reset(); + } } + client_thread_->join(); } -bool RPCClient::AsyncSendVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { +bool GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; @@ -94,11 +105,10 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { result->Swap(&tmp); } -bool RPCClient::AsyncGetVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { +bool GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; @@ -136,12 +146,12 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, return true; } -bool RPCClient::AsyncPrefetchVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +bool GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; @@ -179,7 +189,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, return true; } -void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { +void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); @@ -192,7 +203,8 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { req_count_++; } -void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { +void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); s->Prepare(time_out); @@ -204,70 +216,50 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { req_count_++; } -bool RPCClient::Wait() { - VLOG(3) << "RPCClient begin Wait()" - << " req_count_:" << req_count_; - if (req_count_ <= 0) { - return true; - } - const size_t kReqCnt = req_count_; - bool a[kReqCnt]; - std::vector> waits(req_count_); - std::mutex mu; - - for (int i = 0; i < req_count_; i++) { - waits[i] = framework::AsyncIO([i, &a, &mu, this] { - bool ret = Proceed(); - std::lock_guard l(mu); - a[i] = ret; - }); - } - - for (int i = 0; i < req_count_; i++) { - waits[i].wait(); - } +void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); - int last_req_count = req_count_; - req_count_ = 0; + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); - for (int i = 0; i < last_req_count; i++) { - if (!a[i]) { - return false; - } - } + sendrecv::VariableMessage req; + req.set_varname(COMPLETE_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} - return true; +void GRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); } -bool RPCClient::Proceed() { - void* tag = NULL; +void GRPCClient::Proceed() { + void* tag = nullptr; bool ok = false; - // request counts. - if (!cq_.Next(&tag, &ok)) { - LOG(ERROR) << "Get meets CompletionQueue error"; - return false; - } - - GPR_ASSERT(ok); - PADDLE_ENFORCE(tag); - - // TODO(gongwb): add more retries. - BaseProcessor* c = static_cast(tag); - if (!c->status_.ok()) { - LOG(ERROR) << "proc param error:" << c->var_h_.String() - << " grpc error:" << c->status_.error_message(); + while (cq_.Next(&tag, &ok)) { + BaseProcessor* c = static_cast(tag); + GPR_ASSERT(ok); + PADDLE_ENFORCE(c); + if (c->status_.ok()) { + c->Process(); + } else { + LOG(ERROR) << "var: " << c->var_h_.String() + << " grpc error:" << c->status_.error_message(); + } delete c; - return false; + { + std::lock_guard lk(sync_mutex_); + req_count_--; + } + sync_cond_.notify_all(); } - - c->Process(); - delete c; - return true; } -std::shared_ptr RPCClient::GetChannel(const std::string& ep) { + +std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { // TODO(Yancey1989): make grpc client completely thread-safe - std::unique_lock lock(mutex_); + std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { return it->second; diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h index 449d5105afb8c02294a0ef57610e7de1b1631b35..44000c028b499d9ad1a0e0dd40a5e287cd61d143 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -16,15 +16,18 @@ limitations under the License. */ #include -#include // NOLINT +#include // NOLINT +#include // NOLINT #include #include #include #include #include // NOLINT #include +#include // NOLINT #include +#include "grpc++/channel.h" #include "grpc++/generic/generic_stub.h" #include "grpc++/grpc++.h" #include "grpc++/support/byte_buffer.h" @@ -35,6 +38,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -161,53 +165,65 @@ class FetchBarrierProcessor : public BaseProcessor { std::unique_ptr stub_; }; -class RPCClient { +class GRPCClient : public RPCClient { public: - RPCClient() {} + GRPCClient() {} + virtual ~GRPCClient(); - static RPCClient* GetInstance(); + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; - bool AsyncSendVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = 600 * 1000); + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; - bool AsyncGetVariable(const std::string& ep, + bool AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = 600 * 1000); + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = RPCClient::rpc_time_out) override; - bool AsyncPrefetchVariable(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = 600 * 1000); + void AsyncSendBatchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = 600 * 1000); + void AsyncSendFetchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = 600 * 1000); + void Wait() override; - bool Wait(); + void SendComplete() override; + + protected: + void InitImpl() override; private: - bool Proceed(); + // InitEventLoop should only be called by Init() + void InitEventLoop(); + + void Proceed(); + + void AsyncSendComplete(const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out); + std::shared_ptr GetChannel(const std::string& ep); - // Init is called by GetInstance. - static void Init(); private: grpc::CompletionQueue cq_; - std::map> channels_; + std::unordered_map> channels_; + std::unique_ptr client_thread_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; std::atomic req_count_{0}; - std::mutex mutex_; - static std::unique_ptr rpc_client_; - static std::once_flag init_flag_; - DISABLE_COPY_AND_ASSIGN(RPCClient); + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(GRPCClient); }; } // namespace detail diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/grpc_serde_test.cc similarity index 100% rename from paddle/fluid/operators/detail/serde_test.cc rename to paddle/fluid/operators/detail/grpc_serde_test.cc diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index e73756d89004bc48339c0aa31dd0857c2ca6722d..5a87258901c6563fe793d4041f344011a56d9a01 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -41,11 +41,22 @@ class RequestBase { virtual ~RequestBase() {} virtual void Process() = 0; - CallStatus Status() { return status_; } - void SetStatus(CallStatus status) { status_ = status; } + CallStatus Status() const { + std::lock_guard l(status_mu_); + return status_; + } + + template + void Finish(const T& reply, ServerAsyncResponseWriter* responder) { + std::lock_guard l(status_mu_); + status_ = FINISH; + responder->Finish(reply, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); + } virtual std::string GetReqName() = 0; protected: + mutable std::mutex status_mu_; ::grpc::ServerContext ctx_; GrpcService::AsyncService* service_; ::grpc::ServerCompletionQueue* cq_; @@ -68,9 +79,7 @@ class RequestSend final : public RequestBase { method_id, &ctx_, request_.get(), &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } void Process() override { @@ -82,10 +91,7 @@ class RequestSend final : public RequestBase { framework::Variable* outvar = nullptr; request_handler_->Handle(varname, scope, invar, &outvar); - - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -125,10 +131,7 @@ class RequestGet final : public RequestBase { SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), &reply_); } - - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -159,21 +162,21 @@ class RequestPrefetch final : public RequestBase { void Process() override { // prefetch process... - std::string varname = request_->OutVarname(); - VLOG(3) << "RequestPrefetch " << varname; + std::string in_var_name = request_->Varname(); + std::string out_var_name = request_->OutVarname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; + auto invar = scope->FindVar(in_var_name); + // out var must be created in local scope! + framework::Variable* outvar = scope->Var(out_var_name); - request_handler_->Handle(varname, scope, invar, &outvar); + request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), &reply_); - - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -287,7 +290,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); } else { - PADDLE_ENFORCE(false, "not surpported rpc"); + PADDLE_ENFORCE(false, "not supported rpc"); } reqs[req_id] = b; diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index d1fcbc414f123c5c4810d9cecf807a406aa2c405..f1db7590f6f14d5d44acc12453861a446e278cd2 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -53,6 +53,7 @@ class AsyncGRPCServer final : public RPCServer { void StartServer() override; private: + // HandleRequest needs to be thread-safe. void HandleRequest( ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, std::function TryToRegisterNewOne); @@ -71,8 +72,6 @@ class AsyncGRPCServer final : public RPCServer { std::unique_ptr<::grpc::Server> server_; // condition of the sub program - std::mutex barrier_mutex_; - mutable int barrier_cond_step_; std::condition_variable barrier_condition_; std::mutex mutex_ready_; diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..da1de72dad00db3ffe609e17bd198ef0a56bbfcd --- /dev/null +++ b/paddle/fluid/operators/detail/macros.h @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#define RPCSERVER_T detail::AsyncGRPCServer +#define RPCCLIENT_T detail::GRPCClient +#else +#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/operators/detail/brpc_server.h" +#define RPCSERVER_T detail::AsyncBRPCServer +#define RPCCLIENT_T detail::BRPCClient +#endif diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h index 4bc5e7f10ee2a8939d230fe96517bd9f56c13933..a2d08747d59220d30a5b8fd56074fd2739ae3bab 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/detail/request_handler.h @@ -28,7 +28,6 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { @@ -38,6 +37,11 @@ constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" +#define COMPLETE_MESSAGE "COMPLETE@RECV" + class RPCServer; class RequestHandler { @@ -57,9 +61,12 @@ class RequestHandler { void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } void SetProgram(framework::ProgramDesc* program) { program_ = program; } void SetExecutor(framework::Executor* executor) { executor_ = executor; } + + // Used for dist lookup table prefetch void SetPrefetchPreparedCtx( - std::unique_ptr prepared) { - prefetch_ctx_.reset(prepared.release()); + std::unordered_map< + std::string, std::shared_ptr>* g) { + prefetch_var_name_to_prepared_ctx_ = g; } // Used for async. @@ -75,12 +82,8 @@ class RequestHandler { bool sync_mode() { return sync_mode_; } framework::Scope* scope() { return scope_; } const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ExecutorPrepareContext* prefetch_ctx() { - return prefetch_ctx_.get(); - } framework::ProgramDesc* program() { return program_; } framework::Executor* executor() { return executor_; } - std::vector& sparse_vars() { return sparse_vars_; } // This function processes user's rpc request. // The implemention is in request_handler_impl. @@ -97,8 +100,8 @@ class RequestHandler { // *request_handler_->dev_ctx(), &reply_); // } virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, - framework::Variable** outvar) = 0; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") = 0; protected: const bool sync_mode_; @@ -107,19 +110,18 @@ class RequestHandler { framework::Executor* executor_; framework::Scope* scope_; framework::ProgramDesc* program_; - std::unique_ptr prefetch_ctx_; + + // used for distribute lookup table prefetch + std::unordered_map>* + prefetch_var_name_to_prepared_ctx_; // Used for async. std::unordered_map>* grad_to_prepared_ctx_; - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars_; RPCServer* rpc_server_; - - std::mutex sparse_var_mutex_; }; } // namespace detail diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index f16c06d52f4fb86d51083a8b3b98d05a64c1af74..7425bee798cd9ba0af8cd777a6db63862c8a4031 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -16,15 +16,12 @@ #include #include -#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" namespace paddle { namespace operators { @@ -33,7 +30,8 @@ namespace detail { bool RequestSendHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestSendHandler:" << varname; // Async @@ -52,6 +50,9 @@ bool RequestSendHandler::Handle(const std::string& varname, if (varname == BATCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv batch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestSend); + } else if (varname == COMPLETE_MESSAGE) { + VLOG(3) << "sync: recv complete message"; + rpc_server_->DecreaseClientNum(); } else { VLOG(3) << "sync: received var_name: " << varname; if (sync_mode_) { @@ -63,20 +64,27 @@ bool RequestSendHandler::Handle(const std::string& varname, PADDLE_THROW("sync: Can not find server side var"); return false; } - if (invar->IsType()) { - std::unique_lock lock(sparse_var_mutex_); + std::unique_lock lock(mutex_sparse_vars_); sparse_vars_.push_back(invar); } } - return true; } +void RequestSendHandler::ResetSparseVarRecorder() { + std::unique_lock lock(mutex_sparse_vars_); + for (auto* var : sparse_vars_) { + var->GetMutable()->mutable_rows()->clear(); + } + sparse_vars_.clear(); +} + bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestGetHandler:" << varname; if (varname != FETCH_BARRIER_MESSAGE) { @@ -99,13 +107,14 @@ bool RequestGetHandler::Handle(const std::string& varname, bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestPrefetchHandler " << varname; - auto var_desc = program_->Block(0).FindVar(varname); - *outvar = scope->FindVar(varname); + auto var_desc = program_->Block(0).FindVar(out_var_name); InitializeVariable(*outvar, var_desc->GetType()); - executor_->RunPreparedContext(prefetch_ctx_.get(), scope); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); return true; } diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h index 8d0c62232b68ad6c05e751c25103802ee12db57e..3f77c09a9598b431d747f1b824615e49d939098e 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/detail/request_handler_impl.h @@ -29,7 +29,6 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { @@ -40,7 +39,13 @@ class RequestSendHandler final : public RequestHandler { explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; + void ResetSparseVarRecorder(); + + private: + std::mutex mutex_sparse_vars_; + std::vector sparse_vars_; }; class RequestGetHandler final : public RequestHandler { @@ -48,7 +53,8 @@ class RequestGetHandler final : public RequestHandler { explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestGetHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; }; class RequestPrefetchHandler final : public RequestHandler { @@ -56,7 +62,8 @@ class RequestPrefetchHandler final : public RequestHandler { explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestPrefetchHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; }; } // namespace detail diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/detail/rpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a791403e3d6b99c5d4de5183e83e1af655d7d4c --- /dev/null +++ b/paddle/fluid/operators/detail/rpc_client.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/rpc_client.h" + +namespace paddle { +namespace operators { +namespace detail { + +std::once_flag RPCClient::init_flag_; +std::unique_ptr RPCClient::rpc_client_(nullptr); + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..47c6ffb4fd7a002fc0bd8053fb3314a2fbf18fd3 --- /dev/null +++ b/paddle/fluid/operators/detail/rpc_client.h @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace operators { +namespace detail { + +class RPCClient { + public: + RPCClient() {} + virtual ~RPCClient() {} + virtual bool AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = rpc_time_out) = 0; + + virtual bool AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = rpc_time_out) = 0; + + virtual bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = rpc_time_out) = 0; + + virtual void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = rpc_time_out) = 0; + + virtual void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = rpc_time_out) = 0; + + // SendComplete tells all the server that current trainer have no more data + // to train, so that the pserver can reduce it's barrier count, and continue + // to train with other trainers. + virtual void SendComplete() = 0; + + virtual void Wait() = 0; + + static constexpr int64_t rpc_time_out = 120 * 1000; + + template + static RPCClient* GetInstance() { + std::call_once(init_flag_, &RPCClient::Init); + return rpc_client_.get(); + } + + // Init is called by GetInstance. + template + static void Init() { + if (rpc_client_.get() == nullptr) { + rpc_client_.reset(new T()); + rpc_client_->InitImpl(); + } + } + + protected: + virtual void InitImpl() {} + + private: + static std::once_flag init_flag_; + static std::unique_ptr rpc_client_; +}; +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc index 448763372a8c224cc68319a4a444915896b68234..cd0fe96e2301ee3304fe9a2967df58b9f7072d8d 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/detail/rpc_server.cc @@ -43,7 +43,7 @@ void RPCServer::SavePort() const { void RPCServer::WaitBarrier(const std::string& rpc_name) { std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [=] { + barrier_cond_.wait(lock, [this, &rpc_name] { return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); }); @@ -53,19 +53,23 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; - { - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - } - - VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name - << ", barrier_count:" << b << ", fan_in" << client_num_; - + std::unique_lock lock(mutex_); + b = ++barrier_counter_[rpc_name]; if (b >= client_num_) { + lock.unlock(); barrier_cond_.notify_all(); + lock.lock(); } } +void RPCServer::DecreaseClientNum() { + { + std::unique_lock lock(mutex_); + client_num_--; + } + barrier_cond_.notify_all(); +} + void RPCServer::ResetBarrierCounter() { VLOG(3) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h index c2e7ae706c9dc6776e09b25e424b30f110c3855d..2e3342428cb56c34abaca655d5906668cda8f140 100644 --- a/paddle/fluid/operators/detail/rpc_server.h +++ b/paddle/fluid/operators/detail/rpc_server.h @@ -60,6 +60,7 @@ class RPCServer { void SetCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); + void DecreaseClientNum(); void ResetBarrierCounter(); protected: @@ -78,8 +79,7 @@ class RPCServer { std::string bind_address_; std::atomic exit_flag_; int selected_port_; - - const int client_num_; + int client_num_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc similarity index 83% rename from paddle/fluid/operators/detail/grpc_server_test.cc rename to paddle/fluid/operators/detail/rpc_server_test.cc index f97f638701cfb263f28dddbdc3bc80fb16468744..463a7b80cfac280de5afe91ee85caaaf074cef32 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/rpc_server_test.cc @@ -17,14 +17,14 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/rpc_server.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -32,7 +32,7 @@ namespace detail = paddle::operators::detail; USE_OP(lookup_table); -std::unique_ptr g_rpc_service; +std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { @@ -98,11 +98,17 @@ void StartServer() { framework::Executor exe(place); platform::CPUDeviceContext ctx(place); auto* block = AppendPrefetchBlcok(&program); - auto prepared = exe.Prepare(program, block->ID()); + std::string in_var_name("ids"); + std::vector prefetch_block_ids{block->ID()}; + auto prepared = exe.Prepare(program, prefetch_block_ids); InitTensorsOnServer(&scope, &place, 10); + std::unordered_map> + prefetch_var_name_to_prepared; + prefetch_var_name_to_prepared[in_var_name] = prepared[0]; g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(std::move(prepared)); + g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetDevCtx(&ctx); g_req_handler->SetScope(&scope); g_req_handler->SetExecutor(&exe); @@ -111,23 +117,19 @@ void StartServer() { g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); - // FIXME(gongwb): don't use hard time. - sleep(10); - LOG(INFO) << "got nccl id and stop server..."; - g_rpc_service->ShutDown(); server_thread.join(); } TEST(PREFETCH, CPU) { g_req_handler.reset(new detail::RequestPrefetchHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + detail::RPCClient* client = detail::RPCClient::GetInstance(); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); - detail::RPCClient client; int port = g_rpc_service->GetSelectedPort(); std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); @@ -141,8 +143,8 @@ TEST(PREFETCH, CPU) { std::string in_var_name("ids"); std::string out_var_name("out"); - client.AsyncPrefetchVariable(ep, ctx, scope, in_var_name, out_var_name); - client.Wait(); + client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); + client->Wait(); auto var = scope.Var(out_var_name); auto value = var->GetMutable()->value(); auto ptr = value.mutable_data(place); @@ -152,6 +154,7 @@ TEST(PREFETCH, CPU) { } } + g_rpc_service->ShutDown(); server_thread.join(); LOG(INFO) << "begin reset"; g_rpc_service.reset(nullptr); diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index a244afc46f3247c7e6e8481b09b5c729a2a569f7..54cb93e04d18b3784be187c9c8885bbccc55488b 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -14,6 +14,8 @@ limitations under the License. */ syntax = "proto3"; package sendrecv; +// option cc_generic_services = true; + service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index c72e1bd076f670458f3915072154847db6205092..bd16bf1dab8d933ffd18b6d6d9e3ce1c7d73029b 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -32,16 +32,6 @@ namespace paddle { namespace operators { namespace detail { -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" - -static int64_t GetTimestamp() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000 + tp.tv_usec / 1000; -} - typedef void (*DestroyCallback)(void*); void SerializeToByteBuffer(const std::string& name, framework::Variable* var, diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 74848005d0bea6e5c818ff999727aa2b8ad51d84..8c4b4321b7582a5cfad89f23e3d298ed16162d99 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -22,21 +22,21 @@ class BoxCoderOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("PriorBox"), "Input(PriorBox) of BoxCoderOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), - "Input(PriorBoxVar) of BoxCoderOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("TargetBox"), "Input(TargetBox) of BoxCoderOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), "Output(OutputBox) of BoxCoderOp should not be null."); auto prior_box_dims = ctx->GetInputDim("PriorBox"); - auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); auto target_box_dims = ctx->GetInputDim("TargetBox"); PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, "The rank of Input of PriorBoxVar must be 2"); PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); - PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + if (ctx->HasInput("PriorBoxVar")) { + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + } auto code_type = GetBoxCodeType(ctx->Attrs().Get("code_type")); if (code_type == BoxCodeType::kEncodeCenterSize) { @@ -71,9 +71,11 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "of the coordinate system. [xmax, ymax] is the right bottom " "coordinate of the anchor box."); AddInput("PriorBoxVar", - "(Tensor, default Tensor) " + "(Tensor, default Tensor, optional) " "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " - "of variance."); + "of variance. PriorBoxVar will set all elements to 1 by " + "default.") + .AsDispensable(); AddInput( "TargetBox", "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape " @@ -91,6 +93,10 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "the code type used with the target box") .SetDefault("encode_center_size") .InEnum({"encode_center_size", "decode_center_size"}); + AddAttr("box_normalized", + "(bool, default true) " + "whether treat the priorbox as a noramlized box") + .SetDefault(true); AddOutput("OutputBox", "(LoDTensor or Tensor) " "When code_type is 'encode_center_size', the output tensor of " @@ -127,5 +133,6 @@ width and height. namespace ops = paddle::operators; REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, - ops::BoxCoderKernel); +REGISTER_OP_CPU_KERNEL( + box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 8cef8e03439df4ca5b0fa94176a21a36f9eb9f70..a7af111f63d654319dd1d90d2032956951dfe49e 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -20,15 +20,16 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, const T* prior_box_var_data, const T* target_box_data, const int row, const int col, const int len, - T* output) { + const bool normalized, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < row * col) { const int row_idx = idx / col; const int col_idx = idx % col; - T prior_box_width = - prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; - T prior_box_height = - prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_width = prior_box_data[col_idx * len + 2] - + prior_box_data[col_idx * len] + (normalized == false); + T prior_box_height = prior_box_data[col_idx * len + 3] - + prior_box_data[col_idx * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; T prior_box_center_y = (prior_box_data[col_idx * len + 3] + @@ -41,20 +42,24 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, T target_box_center_y = (target_box_data[row_idx * len + 3] + target_box_data[row_idx * len + 1]) / 2; - T target_box_width = - target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; - T target_box_height = - target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + T target_box_width = target_box_data[row_idx * len + 2] - + target_box_data[row_idx * len] + (normalized == false); + T target_box_height = target_box_data[row_idx * len + 3] - + target_box_data[row_idx * len + 1] + + (normalized == false); - output[idx * len] = (target_box_center_x - prior_box_center_x) / - prior_box_width / prior_box_var_data[col_idx * len]; - output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / - prior_box_height / - prior_box_var_data[col_idx * len + 1]; - output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / - prior_box_var_data[col_idx * len + 2]; - output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / - prior_box_var_data[col_idx * len + 3]; + output[idx * len] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[idx * len + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); + if (prior_box_var_data) { + output[idx * len] /= prior_box_var_data[col_idx * len]; + output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3]; + } } } @@ -63,42 +68,56 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data, const T* prior_box_var_data, const T* target_box_data, const int row, const int col, const int len, - T* output) { + const bool normalized, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < row * col) { const int col_idx = idx % col; - T prior_box_width = - prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; - T prior_box_height = - prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_width = prior_box_data[col_idx * len + 2] - + prior_box_data[col_idx * len] + (normalized == false); + T prior_box_height = prior_box_data[col_idx * len + 3] - + prior_box_data[col_idx * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; T prior_box_center_y = (prior_box_data[col_idx * len + 3] + prior_box_data[col_idx * len + 1]) / 2; - - T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + T target_box_width, target_box_height; + T target_box_center_x, target_box_center_y; + if (prior_box_var_data) { + target_box_width = exp(prior_box_var_data[col_idx * len + 2] * target_box_data[idx * len + 2]) * prior_box_width; - T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_height = exp(prior_box_var_data[col_idx * len + 3] * target_box_data[idx * len + 3]) * prior_box_height; - T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_center_x = prior_box_var_data[col_idx * len] * target_box_data[idx * len] * prior_box_width + prior_box_center_x; - T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_center_y = prior_box_var_data[col_idx * len + 1] * target_box_data[idx * len + 1] * prior_box_height + prior_box_center_y; + } else { + target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width; + target_box_height = + exp(target_box_data[idx * len + 3]) * prior_box_height; + target_box_center_x = + target_box_data[idx * len] * prior_box_width + prior_box_center_x; + target_box_center_y = target_box_data[idx * len + 1] * prior_box_height + + prior_box_center_y; + } output[idx * len] = target_box_center_x - target_box_width / 2; output[idx * len + 1] = target_box_center_y - target_box_height / 2; - output[idx * len + 2] = target_box_center_x + target_box_width / 2; - output[idx * len + 3] = target_box_center_y + target_box_height / 2; + output[idx * len + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[idx * len + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); } } -template +template class BoxCoderCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -109,6 +128,11 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto* target_box = context.Input("TargetBox"); auto* output_box = context.Output("OutputBox"); + const T* prior_box_data = prior_box->data(); + const T* target_box_data = target_box->data(); + const T* prior_box_var_data = nullptr; + if (prior_box_var) prior_box_var_data = prior_box_var->data(); + if (target_box->lod().size()) { PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, "Only support 1 level of LoD."); @@ -120,22 +144,19 @@ class BoxCoderCUDAKernel : public framework::OpKernel { int grid = (row * col + block - 1) / block; auto& device_ctx = context.cuda_device_context(); - const T* prior_box_data = prior_box->data(); - const T* prior_box_var_data = prior_box_var->data(); - const T* target_box_data = target_box->data(); - output_box->mutable_data({row, col, len}, context.GetPlace()); T* output = output_box->data(); auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - output); + normalized, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { DecodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - output); + normalized, output); } } }; @@ -144,5 +165,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, - ops::BoxCoderCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + box_coder, + ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 77fc6c2b62af42e6526b889aeef2d9bab795baec..5ed8520acddfa8fe2105a7c1615bcb3243cb130f 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -28,26 +28,28 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) { PADDLE_THROW("Not support type %s.", type); } -template +template class BoxCoderKernel : public framework::OpKernel { public: - void EncodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, - T* output) const { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); + void EncodeCenterSize(const framework::Tensor* target_box, + const framework::Tensor* prior_box, + const framework::Tensor* prior_box_var, + const bool normalized, T* output) const { + int64_t row = target_box->dims()[0]; + int64_t col = prior_box->dims()[0]; + int64_t len = prior_box->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + const T* prior_box_var_data = nullptr; + if (prior_box_var) prior_box_var_data = prior_box_var->data(); for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { - T prior_box_width = - prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_width = prior_box_data[j * len + 2] - + prior_box_data[j * len] + (normalized == false); + T prior_box_height = prior_box_data[j * len + 3] - + prior_box_data[j * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; T prior_box_center_y = @@ -57,67 +59,89 @@ class BoxCoderKernel : public framework::OpKernel { (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; T target_box_center_y = (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; - T target_box_width = - target_box_data[i * len + 2] - target_box_data[i * len]; - T target_box_height = - target_box_data[i * len + 3] - target_box_data[i * len + 1]; + T target_box_width = target_box_data[i * len + 2] - + target_box_data[i * len] + (normalized == false); + T target_box_height = target_box_data[i * len + 3] - + target_box_data[i * len + 1] + + (normalized == false); size_t offset = i * col * len + j * len; - output[offset] = (target_box_center_x - prior_box_center_x) / - prior_box_width / prior_box_var_data[j * len]; - output[offset + 1] = (target_box_center_y - prior_box_center_y) / - prior_box_height / prior_box_var_data[j * len + 1]; + output[offset] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[offset + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; output[offset + 2] = - std::log(std::fabs(target_box_width / prior_box_width)) / - prior_box_var_data[j * len + 2]; + std::log(std::fabs(target_box_width / prior_box_width)); output[offset + 3] = - std::log(std::fabs(target_box_height / prior_box_height)) / - prior_box_var_data[j * len + 3]; + std::log(std::fabs(target_box_height / prior_box_height)); + if (prior_box_var) { + output[offset] /= prior_box_var_data[j * len]; + output[offset + 1] /= prior_box_var_data[j * len + 1]; + output[offset + 2] /= prior_box_var_data[j * len + 2]; + output[offset + 3] /= prior_box_var_data[j * len + 3]; + } } } } - void DecodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, - T* output) const { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); + void DecodeCenterSize(const framework::Tensor* target_box, + const framework::Tensor* prior_box, + const framework::Tensor* prior_box_var, + const bool normalized, T* output) const { + int64_t row = target_box->dims()[0]; + int64_t col = prior_box->dims()[0]; + int64_t len = prior_box->dims()[1]; + + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + const T* prior_box_var_data = nullptr; + if (prior_box_var) prior_box_var_data = prior_box_var->data(); for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - T prior_box_width = - prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_width = prior_box_data[j * len + 2] - + prior_box_data[j * len] + (normalized == false); + T prior_box_height = prior_box_data[j * len + 3] - + prior_box_data[j * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; T prior_box_center_y = (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; - T target_box_center_x = prior_box_var_data[j * len] * + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + if (prior_box_var) { + target_box_center_x = prior_box_var_data[j * len] * target_box_data[offset] * prior_box_width + prior_box_center_x; - T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_center_y = prior_box_var_data[j * len + 1] * target_box_data[offset + 1] * prior_box_height + prior_box_center_y; - T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_width = std::exp(prior_box_var_data[j * len + 2] * target_box_data[offset + 2]) * prior_box_width; - T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_height = std::exp(prior_box_var_data[j * len + 3] * target_box_data[offset + 3]) * prior_box_height; + } else { + target_box_center_x = + target_box_data[offset] * prior_box_width + prior_box_center_x; + target_box_center_y = target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(target_box_data[offset + 2]) * prior_box_width; + target_box_height = + std::exp(target_box_data[offset + 3]) * prior_box_height; + } output[offset] = target_box_center_x - target_box_width / 2; output[offset + 1] = target_box_center_y - target_box_height / 2; - output[offset + 2] = target_box_center_x + target_box_width / 2; - output[offset + 3] = target_box_center_y + target_box_height / 2; + output[offset + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[offset + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); } } } @@ -139,11 +163,14 @@ class BoxCoderKernel : public framework::OpKernel { output_box->mutable_data({row, col, len}, context.GetPlace()); auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); T* output = output_box->data(); if (code_type == BoxCodeType::kEncodeCenterSize) { - EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, + output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, + output); } } }; diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index f4cec8ad971abebe8d6dff1a384c8414269148a5..12364fff96c03c5f9dff23c7c00ceedd043803a6 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -59,47 +59,48 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op."); + AddOutput("Out", "The output of elementwise op.").Reuse("X"); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") .SetDefault(-1) .EqualGreaterThan(-1); AddComment(string::Sprintf(R"DOC( -Limited Elementwise %s Operator. +Limited Elementwise %s Operator The equation is: $$%s$$ -$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be -smaller than or equal to the dimensions of $X$. +- $X$: a tensor of any dimension. +- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: -1. The shape of $Y$ is same with $X$; -2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions - of size 1 for $Y$ will be ignored for the consideration of subsequence. +1. The shape of $Y$ is the same with $X$. +2. The shape of $Y$ is a continuous subsequence of $X$. For case 2: -$Y$ will be broadcasted to match the shape of $X$ and axis should be -set to index of the start dimension to broadcast $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. +2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of + subsequence, such as shape(Y) = (2, 1) => (2). -If axis is -1, it is treated as axis=rank(X)-rank(Y). +For example: -For example .. code-block:: python shape(X) = (2, 3, 4, 5), shape(Y) = (,) shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details) -information. However, the output only shares the LoD information with input $X$. +The inputs $X$ and $Y$ can carry the different LoD information. +But the output only shares the LoD information with the input $X$. )DOC", GetName(), GetEquation())); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 8843a1c44b7004ba5d7935f75d3c99d9c30fc6c0..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), @@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 79ec02f52094121d01c6bda2a5d99d2211893e89..98b051afb551f373009d2bd3df1a8daa64b7e6c7 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -43,15 +42,16 @@ class FetchBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + detail::RPCClient* rpc_client = + detail::RPCClient::GetInstance(); - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); for (auto& ep : eps) { VLOG(3) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } }; diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 1ae78675a0cac8a72aeaef1227b631a41e4a10b2..453a1b32a0171a2ca88879ab3287e89c4d3c7759 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp { class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") + AddAttr( + "dtype", + "It could be numpy.dtype. Output data type. Default is float32") .SetDefault(framework::proto::VarType::FP32); - AddAttr("value", "(float, default 0) The value to be filled") + AddAttr("value", "default 0. The value to be filled") .SetDefault(0.0f); AddComment(R"DOC( -FillConstantBatchSizeLike Operator. - -Fill up a variable with specified constant value. +This function creates a tensor of specified *shape*, *dtype* and batch size, +and initializes this with a constant supplied in *value*. The batch size is +obtained from the `input` tensor. )DOC"); } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 9c0561b016fdbfa8e48535eaa673a3f85bc936e5..f6b156eb30dae154395b34dcfc26319cd89edbca 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -43,7 +43,8 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::operators::CPUGather(ctx, *src, *index, output); - + delete cpu_place; + cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 4bce2d322d825110a446c9bc5eccdacf0ba3c943..f824eee4e7d1ef19c9a38fd5d3369265f9c549a0 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -21,8 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -61,12 +60,17 @@ class GenNCCLIdOp : public framework::OperatorBase { std::vector endpoint_list = Attr>("endpoint_list"); - detail::RPCClient client; + detail::RPCClient* client = detail::RPCClient::GetInstance(); + for (auto& ep : endpoint_list) { VLOG(3) << "sending nccl id to " << ep; - client.AsyncSendVariable(ep, dev_ctx, *scope, NCCL_ID_VARNAME); + client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); + } + client->Wait(); + for (auto& ep : endpoint_list) { + client->AsyncSendBatchBarrier(ep); } - client.Wait(); + client->Wait(); VLOG(3) << "sending completed..."; } @@ -77,9 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase { // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. detail::RequestSendHandler rpc_h(true); - detail::AsyncGRPCServer rpc_service(endpoint, 1); - rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h); - rpc_h.SetRPCServer(&rpc_service); + std::unique_ptr rpc_service( + new RPCSERVER_T(endpoint, 1)); + + rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); + rpc_h.SetRPCServer(rpc_service.get()); framework::ProgramDesc empty_program; framework::Executor executor(dev_ctx.GetPlace()); @@ -89,12 +95,13 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service)); - rpc_service.SetCond(detail::kRequestSend); + std::bind(&detail::RPCServer::StartServer, rpc_service.get())); + + rpc_service->SetCond(detail::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - rpc_service.WaitBarrier(detail::kRequestSend); + rpc_service->WaitBarrier(detail::kRequestSend); VLOG(3) << "got nccl id and stop server..."; - rpc_service.ShutDown(); + rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; server_thread.join(); } diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e38525cd7f44de020f364ffd16e71a439048347f..a711da362771353891f900f544d97e64510dc0ba 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "mini-batch. Note: S is equal to the sequence number in a mini-batch. " "The output is no longer a LoDTensor."); AddComment(R"DOC( -LinearChainCRF Operator. - Conditional Random Field defines an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. CRF learns the conditional probability $P(Y|X)$, where diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 71e75c25321812c849e205460217b174d80654be..4d12278799f66f2fb92b7580ba0c43e845aa4d3a 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" + #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" @@ -89,28 +90,34 @@ void ListenAndServOp::SavePort() const { rpc_service_->SavePort(); } -void ListenAndServOp::RunSyncLoop(framework::Executor *executor, - framework::ProgramDesc *program, - framework::Scope *recv_scope, - framework::BlockDesc *prefetch_block) const { +static int64_t GetTimestamp() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + +void ListenAndServOp::RunSyncLoop( + framework::Executor *executor, framework::ProgramDesc *program, + framework::Scope *recv_scope, + const std::vector &prefetch_block_id_list) const { size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector block_list; - for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - block_list.push_back(blkid); + std::vector optimize_block_id_list; + for (int blkid = 1; blkid < num_blocks; ++blkid) { + if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(), + blkid) == prefetch_block_id_list.end()) { + optimize_block_id_list.push_back(blkid); + } } - auto optimize_prepared = executor->Prepare(*program, block_list); + auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars; while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. @@ -130,34 +137,29 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, int32_t last_parent_blkid = program->Block(1).Parent(); std::vector parallel_blkids; parallel_blkids.push_back(1); - double ts = detail::GetTimestamp(); - for (size_t blkid = 2; blkid < num_blocks; ++blkid) { - if (blkid != static_cast(prefetch_block->ID())) { - if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, - program, recv_scope); - parallel_blkids.clear(); - last_parent_blkid = program->Block(blkid).Parent(); - } - parallel_blkids.push_back(blkid); + double ts = GetTimestamp(); + for (size_t i = 1; i < optimize_block_id_list.size(); ++i) { + // skip the first optimize block because it is already in the + // parallel_blkids. + int blkid = optimize_block_id_list[i]; + if (program->Block(blkid).Parent() != last_parent_blkid) { + ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, + program, recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); } + parallel_blkids.push_back(blkid); } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; - - // Reset the received sparse variables, the sum operator would not - // sum the input sparse variables which rows is empty at the next - // mini-batch. - // TODO(Yancey1989): move the reset action into an operator, we couldn't - // have any hide logic in the operator. - for (framework::Variable *var : sparse_vars) { - var->GetMutable()->mutable_rows()->clear(); - } + VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->ResetBarrierCounter(); + // reset received sparse vars to avoid reuse it in the next mini-batch + dynamic_cast(request_send_handler_.get()) + ->ResetSparseVarRecorder(); } // while(true) } @@ -212,18 +214,19 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } // while(true) } -static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope, - platform::DeviceContext *dev_ctx, - framework::Executor *executor, - framework::ProgramDesc *program, - framework::ExecutorPrepareContext *prefetch_ctx, - detail::RPCServer *rpc_server) { +static void FillRequestCtx( + detail::RequestHandler *h, framework::Scope *scope, + platform::DeviceContext *dev_ctx, framework::Executor *executor, + framework::ProgramDesc *program, + std::unordered_map> + *prefetch_ctx, + detail::RPCServer *rpc_server) { h->SetScope(scope); h->SetDevCtx(dev_ctx); h->SetExecutor(executor); h->SetProgram(program); - h->SetPrefetchPreparedCtx(std::move( - std::unique_ptr(prefetch_ctx))); + h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetRPCServer(rpc_server); } @@ -244,8 +247,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in << ", end_point:" << endpoint; - // request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode)); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in)); + rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); + request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); request_prefetch_handler_.reset( @@ -257,17 +260,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, request_prefetch_handler_.get()); auto *optimize_block = Attr(kOptimizeBlock); - auto *prefetch_block = Attr(kPrefetchBlock); auto *program = optimize_block->Program(); framework::Executor executor(dev_place); // prepare for prefetch - VLOG(3) << "prefetch block id is " << prefetch_block->ID(); - auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + std::vector prefetch_block_id_list; + std::unordered_map block_id_to_prefetch_var_name; + + auto prefetch_var_name_to_block_id_str = + Attr>(kPrefetchVarNameToBlockId); + for (const auto &prefetch_var_name_and_id : + prefetch_var_name_to_block_id_str) { + std::vector pieces; + split(prefetch_var_name_and_id, ':', &pieces); + VLOG(3) << "after split, prefetch_var = " << pieces[0] + << ", id=" << pieces[1]; + PADDLE_ENFORCE_EQ(pieces.size(), 2); + + int block_id = std::stoi(pieces[1]); + prefetch_block_id_list.push_back(block_id); + block_id_to_prefetch_var_name[block_id] = pieces[0]; + } + + auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list); + + std::unordered_map> + prefetch_var_name_to_prepared_ctx; + for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) { + auto block_id = prefetch_block_id_list[i]; + auto prefetch_var_name = block_id_to_prefetch_var_name[block_id]; + prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; + } auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, - &dev_ctx, &executor, program, prefetch_prepared.release(), - rpc_service_.get()); + &dev_ctx, &executor, program, + &prefetch_var_name_to_prepared_ctx, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); @@ -285,7 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block); + RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list); } else { RunAsyncLoop(&executor, program); } @@ -311,8 +339,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("sync_mode", "if works at sync_mode or not").SetDefault(true); AddAttr(kOptimizeBlock, "BlockID to run on server side."); - AddAttr(kPrefetchBlock, - "prefetch block to run on server side."); + AddAttr>(kPrefetchVarNameToBlockId, + "prefetch blocks to run on server side.") + .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 87952cb0e683596b2b0395890b6e25b15f74d7e2..46c3a19e20b3f2dd970a672bb99f98e83d3e25bf 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -30,7 +31,7 @@ namespace paddle { namespace operators { constexpr char kOptimizeBlock[] = "OptimizeBlock"; -constexpr char kPrefetchBlock[] = "PrefetchBlock"; +constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; void RunServer(std::shared_ptr service); @@ -46,7 +47,7 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, - framework::BlockDesc* prefetch_block) const; + const std::vector& prefetch_block_id_list) const; void RunAsyncLoop(framework::Executor* executor, framework::ProgramDesc* program) const; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 93f45cff8a26201b1fbb1c44141e125a67c44037..8f4b5049271c9592d2db268ea7ff2f5c8abc28b6 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddOutput("Out", "The tensor need to be loaded"); AddAttr( "load_as_fp16", - "(boolean, default false)" "If true, the tensor will be first loaded and then " "converted to float16 data type. Otherwise, the tensor will be " - "directly loaded without data type conversion.") + "directly loaded without data type conversion. Default is false.") .SetDefault(false); AddAttr("file_path", - "(string) " - "Variable will be loaded from \"file_path\".") + R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment(R"DOC( -Load Operator. - -Load operator will load a tensor variable from disk file. - -)DOC"); + AddComment("Load operator will load a tensor variable from disk file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b9cd7fb7019b738098a8649f23277afd40e938..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -124,16 +124,17 @@ namespace { framework::OpKernelType GetExpectedLRNKernel( const framework::ExecutionContext& ctx) { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 1a37cb39d56066b8380338b9710a441e41518c39..6207d14ecdc922cbca2d05d20e4b8a9da9b9d627 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -20,13 +20,16 @@ #ifdef PADDLE_WITH_MKLML #include #include +#include #include #endif #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { @@ -46,6 +49,18 @@ namespace paddle { namespace operators { namespace math { +static void SetNumThreads(int num_threads) { +#ifdef PADDLE_USE_OPENBLAS + int real_num_threads = num_threads > 1 ? num_threads : 1; + openblas_set_num_threads(real_num_threads); +#elif defined(PADDLE_WITH_MKLML) + int real_num_threads = num_threads > 1 ? num_threads : 1; + mkl_set_num_threads(real_num_threads); +#else + PADDLE_ENFORCE(false, "To be implemented."); +#endif +} + /** * Matrix Descriptor of a memory buffer. * diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index d4b0e17ed44da61e2633b9bd97faeb62f9967c3c..8b296b6a07ca222ddc08fedfd2eed423b46dc5c3 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -21,8 +21,10 @@ limitations under the License. */ #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 3719a264e90ea7d1a99eb9589ce4fd0d8e074781..b545671b43d3a453ab03e4774427179617f62db0 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) { paddle::platform::CPUDeviceContext context(*cpu_place); GetBlas(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); + delete cpu_place; + cpu_place = NULL; EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index 8e508b68eeab69a4595904dcc3ea0a541d9ab6e6..b1e69f375d3274aade3184af02f7f914dba5db71 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("RankTable", "The lod_rank_table."); - AddOutput("Out", "The max sequence length."); - AddComment( - R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + AddInput("RankTable", "Input variable which is a LoDRankTable object"); + AddOutput("Out", "The max sequence length"); + AddComment(R"DOC( + Given a LoDRankTable object, this layer returns the max length of + a batch of sequences. In fact, a LoDRankTable object contains a list of + tuples() and the list is already sorted by + sequence length in descending order, so the operator just returns the + sequence length of the first tuple element +)DOC"); } }; diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a60f245f53e342fd9c1382fdda33a011a7fb06d6 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mean_iou_op.h" + +namespace paddle { +namespace operators { + +class MeanIoUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predictions"), + "Input (Predictions) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input (labels) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"), + "Output (OutMeanIou) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutWrong"), + "Output (OutWrong) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"), + "Output (OutWrong) of MeanIoU op should not be null."); + + int64_t num_classes = + static_cast(ctx->Attrs().Get("num_classes")); + + ctx->SetOutputDim("OutMeanIou", {1}); + ctx->SetOutputDim("OutWrong", {num_classes}); + ctx->SetOutputDim("OutCorrect", {num_classes}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Predictions")->type()), + ctx.GetPlace()); + } +}; + +class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Predictions", + "(Tensor), A Tensor of prediction results for semantic labels" + " with type int32 or int64. The rank should be greater than 1."); + AddInput( + "Labels", + "(Tensor), A Tensor of ground truth labels with type int32 or int64." + "Its shape should be the same as Input(Predictions)."); + AddInput("InWrongs", + "(vector), A list of Tensor with shape " + "[num_classes]. They are used to collect wrong number among " + "batches. Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddInput( + "InCorrects", + "(vector), A list of Tensor with shape " + "[num_classes]. They are used to collect correct number among batches. " + "Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddInput("InMeanIou", + "(vector), A list of Tensor that Output(mean_iou) should " + "be added to. Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddOutput("OutMeanIou", + "(vector), A Tensor representing the" + " mean intersection-over-union with shape [1]."); + AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. "); + AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. "); + AddAttr("num_classes", "(int), The possible number of labels."); + + AddComment(R"DOC( +mean-IOU Operator. +Mean Intersection-Over-Union is a common evaluation metric for +semantic image segmentation, which first computes the IOU for each +semantic class and then computes the average over classes. +IOU is defined as follows: + IOU = true_positive / (true_positive + false_positive + false_negative). +It is based on pixel level area while "IOU Similarity Operator" +is based on area of rectangle. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel, + ops::MeanIoUKernel, + ops::MeanIoUKernel); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..83bb4dde46fa241affad3788e3381b6ecd8aa098 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/mean_iou_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void CountCUDAKernel(const int num_classes, const int count, + const T* predictions, const T* labels, + int* wrong, int* correct) { + extern __shared__ int blcok_cache[]; + int* wrong_c = blcok_cache; + int* correct_c = blcok_cache + num_classes; + // init cache + for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) { + blcok_cache[i] = 0; + } + __syncthreads(); + + T pred; + T label; + CUDA_1D_KERNEL_LOOP(i, count) { + pred = predictions[i]; + label = labels[i]; + if (pred == label) { + atomicAdd(correct_c + pred, 1); + } else { + atomicAdd(wrong_c + pred, 1); + atomicAdd(wrong_c + label, 1); + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < num_classes; i += blockDim.x) { + atomicAdd(wrong + i, wrong_c[i]); + atomicAdd(correct + i, correct_c[i]); + } +} + +__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong, + int* correct, float* ious, float* iou) { + __shared__ int valid_count_c; + if (threadIdx.x == 0) { + valid_count_c = 0; + } + __syncthreads(); + CUDA_1D_KERNEL_LOOP(i, num_classes) { + int wrong_n = wrong[i]; + int correct_n = correct[i]; + int denominator = wrong_n + correct_n; + if (denominator > 0) { + atomicAdd(&valid_count_c, 1); + ious[i] = static_cast(correct_n) / denominator; + } else { + ious[i] = 0; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + float iou_sum = 0; + for (int i = 0; i < num_classes; ++i) { + iou_sum += ious[i]; + } + iou[0] += iou_sum / valid_count_c; + } +} + +template +class MeanIoUCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context() + .eigen_device(); + // get input and output tensor + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); + int num_classes = static_cast(ctx.Attr("num_classes")); + + // Get data ptr + const T* predictions_data = predictions->data(); + const T* labels_data = labels->data(); + int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); + int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); + float* out_mean_iou_data = + out_mean_iou->mutable_data(ctx.GetPlace()); + + // Get Eigen tensor + auto out_mean_iou_t = EigenTensor::From(*out_mean_iou); + auto out_wrong_t = EigenTensor::From(*out_wrong); + auto out_correct_t = EigenTensor::From(*out_correct); + + // Temporary tensor + Tensor ious; + float* ious_data = ious.mutable_data( + {static_cast(num_classes)}, ctx.GetPlace()); + auto ious_t = EigenTensor::From(ious); + + // Init out_wrong, out_correct and out_mean_iou + out_wrong_t.device(place) = out_wrong_t.constant(0); + out_correct_t.device(place) = out_correct_t.constant(0); + out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f); + + // collect pre wrong, correct and mean_iou + auto in_mean_ious = ctx.MultiInput("InMeanIou"); + for (int i = 0; i < in_mean_ious.size(); ++i) { + out_mean_iou_t.device(place) += + EigenTensor::From(*in_mean_ious[i]); + } + auto in_wrongs = ctx.MultiInput("InWrongs"); + for (int i = 0; i < in_wrongs.size(); ++i) { + out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); + } + auto in_corrects = ctx.MultiInput("InCorrects"); + for (int i = 0; i < in_corrects.size(); ++i) { + out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); + } + // compute + auto stream = ctx.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + int grid = (predictions->numel() + block - 1) / block; + int cache_size = (num_classes * 2 + 1) * sizeof(int); + CountCUDAKernel<<>>( + num_classes, predictions->numel(), predictions_data, labels_data, + out_wrong_data, out_correct_data); + ctx.device_context().Wait(); + ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data, + out_correct_data, ious_data, + out_mean_iou_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel, + ops::MeanIoUCUDAOpKernel, + ops::MeanIoUCUDAOpKernel); diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9fa00e60e05504e0bb8658c6908e4d4ac46b2ca4 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +using EigenTensor = framework::EigenTensor; + +template +class MeanIoUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context() + .eigen_device(); + // get input and output tensor + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); + int num_classes = static_cast(ctx.Attr("num_classes")); + + // get data ptr + const T* predictions_data = predictions->data(); + const T* labels_data = labels->data(); + float* out_mean_iou_data = + out_mean_iou->mutable_data(ctx.GetPlace()); + int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); + int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); + + // get eigen tensor + auto out_mean_iou_t = EigenTensor::From(*out_mean_iou); + auto out_wrong_t = EigenTensor::From(*out_wrong); + auto out_correct_t = EigenTensor::From(*out_correct); + + // Tmp tensor + Tensor denominator; + Tensor valid_count; + Tensor iou_sum; + + // get data ptr of tmp tensor + int* denominator_data = denominator.mutable_data( + {static_cast(num_classes)}, ctx.GetPlace()); + int* valid_count_data = valid_count.mutable_data({1}, ctx.GetPlace()); + float* iou_sum_data = iou_sum.mutable_data({1}, ctx.GetPlace()); + + // get eigen tensor of tmp tensor + auto denominator_t = EigenTensor::From(denominator); + auto valid_count_t = EigenTensor::From(valid_count); + auto iou_sum_t = EigenTensor::From(iou_sum); + + // init out_wrong, out_correct and out_mean_iou + out_wrong_t = out_wrong_t.constant(0); + out_correct_t = out_correct_t.constant(0); + out_mean_iou_t = out_mean_iou_t.constant(0); + + // collect pre wrong, correct and mean_iou + auto in_mean_ious = ctx.MultiInput("InMeanIou"); + for (size_t i = 0; i < in_mean_ious.size(); ++i) { + out_mean_iou_t.device(place) += + EigenTensor::From(*in_mean_ious[i]); + } + auto in_wrongs = ctx.MultiInput("InWrongs"); + for (size_t i = 0; i < in_wrongs.size(); ++i) { + out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); + } + auto in_corrects = ctx.MultiInput("InCorrects"); + for (size_t i = 0; i < in_corrects.size(); ++i) { + out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); + } + + // compute + for (int64_t i = 0; i < predictions->numel(); ++i) { + if (predictions_data[i] == labels_data[i]) { + out_correct_data[predictions_data[i]] += 1; + } else { + out_wrong_data[labels_data[i]] += 1; + out_wrong_data[predictions_data[i]] += 1; + } + } + + denominator_t = out_wrong_t + out_correct_t; + valid_count_t = + (denominator_t > denominator_t.constant(0.0f)).cast().sum(); + + for (int i = 0; i < num_classes; ++i) { + if (denominator_data[i] == 0) { + denominator_data[i] = 1; + } + } + + iou_sum_t = + (out_correct_t.cast() / denominator_t.cast()).sum(); + out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 74477eb439dc202c3f5f17fdf3e1647bc5c23512..4881cff4a368ffae9b030f04b7fff01d6ee7d26e 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "The input of mean op"); - AddOutput("Out", "The output of mean op"); + AddOutput("Out", "The output of mean op").Reuse("X"); AddComment(R"DOC( Mean Operator. diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6ec4ab047d5e91625e646fd26108d2e477cdce5 --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_ids_op.h" + +namespace paddle { +namespace operators { + +class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}"); + AddInput( + "X", + "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the " + "size of embedding table") + .AsDuplicable(); + AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors."); + + AddComment(R"DOC( +Merge multi LoDTensor's into one according to Ids's shard num. + + +split_ids_op -> prefetch_op -> merge_ids_op + + +merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op + will split input Ids into multiple tensors according to Id's shard number. +prefetch_op will send them to parameter server to prefetch embedding value +back. During split, the order of ids is disordered. In merge_ids_op we use +the original Ids to restore the order of the fetched embedding value and + also pass the lod information to the merged output. + + +Example: + + Ids = [1,2,3,4,5,6] # 3 shared + +split_ids_op -> + + Id0 = [3, 6] # id % 3 == 0 + Id1 = [1, 4] # id % 3 == 1 + Id2 = [2, 5] # id % 3 == 2 + +prefetch_op -> + + X0 = [[0.3 0.3] # 3 + [0.6 0.6]] # 6 + X1 = [[0.1 0.1] # 1 + [0.4 0.4]] # 4 + X2 = [[0.2 0.2] # 2 + [0.5 0.5]] # 5 + +merge_ids_op -> + + Out = [[0.1 0.1] # 1 + [0.2 0.2] # 2 + [0.3 0.3] # 3 + [0.4 0.4] # 4 + [0.5 0.5] # 5 + [0.6 0.6]] # 6 +)DOC"); + } +}; + +class MergeIdsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids."); + PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out."); + + auto ids_var_type = ctx->GetInputsVarType("Ids").front(); + auto ids_dims = ctx->GetInputDim("Ids"); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } + auto x_var_type = ctx->GetInputsVarType("X"); + for (auto &var_type : x_var_type) { + PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR, + "input X only support lod tensors"); + } + ctx->ShareLoD("Ids", "Out"); + } + + private: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.MultiInput("X").front()->type()), + ctx.GetPlace()); + } +}; + +class MergeIdsOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(input_var->GetType()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker, + ops::MergeIdsOpInferVarType); +REGISTER_OP_CPU_KERNEL( + merge_ids, ops::MergeIdsOpKernel); diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h new file mode 100644 index 0000000000000000000000000000000000000000..83712a8519c6817151e1922c606c0fdd4682a2db --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeIdsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + if (!platform::is_cpu_place(place)) { + PADDLE_THROW("MergeIds do not support GPU kernel"); + } + VLOG(3) << "run in MergeIdsOpKernel"; + + const auto *ids_var = ctx.InputVar("Ids"); + PADDLE_ENFORCE(ids_var->IsType(), + "only support to merge Ids of LoDTensor"); + + const auto &ids_tensor = ids_var->Get(); + const auto &ids_dims = ids_tensor.dims(); + const int64_t *ids = ids_tensor.data(); + + auto x_tensors = ctx.MultiInput("X"); + + auto *out = ctx.Output("Out"); + + int batch_size = 0; + int embedding_size = 0; + for (auto &input : x_tensors) { + if (framework::product(input->dims()) != 0) { + if (embedding_size == 0) { + embedding_size = input->dims()[1]; + } + PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1], + "embedding size of all input should be the same"); + batch_size += input->dims()[0]; + } + } + PADDLE_ENFORCE_EQ( + batch_size, ids_dims[0], + "the batch size of ids and merged embedding value should be the same"); + + const size_t shard_num = x_tensors.size(); + + if (shard_num == 1) { + VLOG(3) << "only one shard, we can copy the data directly"; + TensorCopy(*x_tensors[0], place, out); + } else { + std::vector in_indexs(shard_num, 0); + auto *out_data = out->mutable_data( + framework::make_ddim({batch_size, embedding_size}), place); + // copy data from ins[shard_num] to out. + for (int i = 0; i < ids_dims[0]; ++i) { + int64_t id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + int index = in_indexs[shard_id]; + memcpy(out_data + embedding_size * i, + x_tensors[shard_id]->data() + index * embedding_size, + sizeof(T) * embedding_size); + in_indexs[shard_id] += 1; + } + + for (size_t i = 0; i < shard_num; ++i) { + PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0], + "after merge, all data in x_tensor should be used"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index cdbc975c02214721ceae3a338741101ef32d7ee9..aa19c62c83648814e86b1e7062424be3693e4b98 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -16,40 +16,34 @@ limitations under the License. */ namespace paddle { namespace operators { -template class NormOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput( - "X", - "(Tensor) The input tensor of norm operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddInput("Scale", - "(Tensor) The input tensor of norm operator. " - "The format of input tensor is C * 1."); - AddAttr("epsilon", - "(float, default 1e-10) Constant " - "for numerical stability.") + AddInput("X", "(Tensor) A tensor of rank >= axis."); + AddAttr("axis", + "The axis on which to apply normalization. If axis < 0, " + "the dimension to normalization is rank(X) + axis. -1 is " + "the last dimension."); + AddAttr("epsilon", + "(float, default 1e-10) The epsilon value is used " + "to avoid division by zero.") .SetDefault(1.0e-10f); - AddOutput("Out", - "(Tensor) The output tensor of norm operator." - "N * M." - "M = C * H * W"); + AddOutput("Norm", + "(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will " + "be used in backward kernel.") + .AsIntermediate(); + AddOutput("Out", "(Tensor) A tensor of the same shape as X."); AddComment(R"DOC( - "Input shape: $(N, C, H, W)$ - Scale shape: $(C, 1)$ - Output shape: $(N, C, H, W)$ - Where - forward - $$ - [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}] - $$ - backward - $$ - \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}} - $$ - )DOC"); + +Given a tensor, apply 2-normalization along the provided axis. + +$$ +y = \frac{x}{ \sqrt{\sum {x^2} + epsion }} +$$ + +where, $\sum {x^2}$ is calculated along the `axis` dimension. + +)DOC"); } }; @@ -58,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of NormOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Scale"), - "Input(Scale) of NormOp" - "should not be null."); + "Input(X) of NormOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of NormOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", in_x_dims); + auto xdim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", xdim); + int axis = ctx->Attrs().Get("axis"); + if (axis < 0) axis = xdim.size() + axis; + xdim[axis] = 1; + ctx->SetOutputDim("Norm", xdim); } }; @@ -84,12 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); -REGISTER_OP_CPU_KERNEL( - norm, ops::NormKernel, - ops::NormKernel); -REGISTER_OP_CPU_KERNEL( - norm_grad, ops::NormGradKernel, - ops::NormGradKernel); +REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu index d1d9be50742b54a3b6f068fd43ec4b16696183bf..1d0021d33ff9ee65c3366183466b94266e6c2999 100644 --- a/paddle/fluid/operators/norm_op.cu +++ b/paddle/fluid/operators/norm_op.cu @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/fluid/operators/norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - norm, ops::NormKernel, - ops::NormKernel); -REGISTER_OP_CUDA_KERNEL( - norm_grad, ops::NormGradKernel, - ops::NormGradKernel); +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 0ad29e8a0385c46a07842930378ed7a040564437..3167bdc8ac718b23435690577e4163826d14a332 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -19,156 +19,110 @@ limitations under the License. */ namespace paddle { namespace operators { -template +inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template class NormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* scale = context.Input("Scale"); - auto* out = context.Output("Out"); - auto epsilon = static_cast(context.Attr("epsilon")); - out->mutable_data(context.GetPlace()); - int batch_size = in_x->dims()[0]; - int channels = in_x->dims()[1]; - int height = in_x->dims()[2]; - int width = in_x->dims()[3]; - int fea_len = height * width; - auto* place = - context.template device_context().eigen_device(); - auto x = - framework::EigenMatrix::From( - *in_x, framework::make_ddim({batch_size, fea_len * channels})); - // get square - framework::Tensor x_square; - x_square.mutable_data(in_x->dims(), context.GetPlace()); - auto x_square_eigen = - framework::EigenMatrix::From( - x_square, framework::make_ddim({batch_size, fea_len * channels})); - x_square_eigen.device(*place) = x.square(); - auto scale_eigen = - framework::EigenVector::Flatten( - *scale); - for (int n = 0; n < batch_size; ++n) { - framework::Tensor in_x_batch = in_x->Slice(n, n + 1); - auto in_x_batch_eigen = - framework::EigenMatrix::From( - in_x_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor x_square_batch = x_square.Slice(n, n + 1); - auto x_square_batch_eigen = - framework::EigenMatrix::From( - x_square_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor out_batch = out->Slice(n, n + 1); - auto out_batch_eigen = - framework::EigenMatrix::From( - out_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor tmp_tensor; - tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), - context.GetPlace()); - auto tmp = framework::EigenVector::Flatten(tmp_tensor); - // get colsum and sqrt , inverse - auto dim = Eigen::array({{0}}); - tmp.device(*place) = x_square_batch_eigen.sum(dim); - tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); - Eigen::array broadcast_dim_col; - broadcast_dim_col[1] = 1; - broadcast_dim_col[0] = channels; - out_batch_eigen.device(*place) = - in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col)); - Eigen::array broadcast_dim_row; - broadcast_dim_row[1] = fea_len; - broadcast_dim_row[0] = 1; - out_batch_eigen.device(*place) = - out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); - } + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* out_y = ctx.Output("Out"); + auto* out_norm = ctx.Output("Norm"); + out_y->mutable_data(ctx.GetPlace()); + out_norm->mutable_data(ctx.GetPlace()); + + auto xdim = in_x->dims(); + auto ndim = out_norm->dims(); + T eps = static_cast(ctx.Attr("epsilon")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto* place = ctx.template device_context().eigen_device(); + + Eigen::DSizes shape(pre, n, post); + Eigen::DSizes norm_shape(pre, post); + + auto x_e = framework::EigenVector::Flatten(*in_x); + auto y_e = framework::EigenVector::Flatten(*out_y); + auto norm_e = framework::EigenVector::Flatten(*out_norm); + auto x = x_e.reshape(shape); + auto y = y_e.reshape(shape); + auto norm = norm_e.reshape(norm_shape); + + Eigen::DSizes rdim(1); + // y = x / sqrt((sum(x * x) + epsilon)) + // norm = sqrt(sum(x * x) + epsilon) + auto sum = x.pow(2).sum(rdim) + eps; + norm.device(*place) = sum.sqrt(); + // y = x / norm + Eigen::DSizes rshape(pre, 1, post); + Eigen::DSizes bcast(1, n, 1); + y.device(*place) = x / norm.reshape(rshape).broadcast(bcast); } }; template class NormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* scale = context.Input("Scale"); - const framework::Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - auto epsilon = static_cast(context.Attr("epsilon")); - framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); - in_x_grad->mutable_data(context.GetPlace()); - int batch_size = in_x->dims()[0]; - int channels = in_x->dims()[1]; - int height = in_x->dims()[2]; - int width = in_x->dims()[3]; - int fea_len = height * width; - auto* place = - context.template device_context().eigen_device(); - - auto scale_eigen = - framework::EigenVector::Flatten( - *scale); - auto x = - framework::EigenMatrix::From( - *in_x, framework::make_ddim({batch_size, fea_len * channels})); - // get square - framework::Tensor x_square; - x_square.mutable_data(in_x->dims(), context.GetPlace()); - auto x_square_eigen = - framework::EigenMatrix::From( - x_square, framework::make_ddim({batch_size, fea_len * channels})); - x_square_eigen.device(*place) = x.square(); - - for (int n = 0; n < batch_size; ++n) { - framework::Tensor in_x_batch = in_x->Slice(n, n + 1); - auto in_x_batch_eigen = - framework::EigenMatrix::From( - in_x_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1); - auto in_g_batch_eigen = - framework::EigenMatrix::From( - in_g_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor x_square_batch = x_square.Slice(n, n + 1); - auto x_square_batch_eigen = - framework::EigenMatrix::From( - x_square_batch, framework::make_ddim({channels, fea_len})); - framework::Tensor outg_batch = out_grad->Slice(n, n + 1); - auto outg_batch_eigen = - framework::EigenMatrix::From( - outg_batch, framework::make_ddim({channels, fea_len})); - - framework::Tensor tmp_tensor; - tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), - context.GetPlace()); - auto tmp_eigen = - framework::EigenVector::Flatten(tmp_tensor); - auto dim = Eigen::array({{0}}); - tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim); - framework::Tensor norm_tmp_tensor; - norm_tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), - context.GetPlace()); - auto norm_tmp_eigen = - framework::EigenVector::Flatten(norm_tmp_tensor); - norm_tmp_eigen.device(*place) = - (x_square_batch_eigen.sum(dim) + epsilon).sqrt(); - Eigen::array broadcast_dim_col; - broadcast_dim_col[1] = 1; - broadcast_dim_col[0] = channels; - in_g_batch_eigen.device(*place) = - in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col); - in_g_batch_eigen.device(*place) = - in_g_batch_eigen / - (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col); - in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen; - // outg_batch_eigen + (in_g_batch_eigen * -1); - in_g_batch_eigen.device(*place) = - in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col); - Eigen::array broadcast_dim_row; - broadcast_dim_row[1] = fea_len; - broadcast_dim_row[0] = 1; - in_g_batch_eigen.device(*place) = - in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); - } + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* in_norm = ctx.Input("Norm"); + auto* in_dy = ctx.Input(framework::GradVarName("Out")); + auto* out_dx = ctx.Output(framework::GradVarName("X")); + out_dx->mutable_data(ctx.GetPlace()); + + auto xdim = in_x->dims(); + int axis = ctx.Attr("axis"); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto* place = ctx.template device_context().eigen_device(); + + auto x_e = framework::EigenVector::Flatten(*in_x); + auto dy_e = framework::EigenVector::Flatten(*in_dy); + auto norm_e = framework::EigenVector::Flatten(*in_norm); + auto dx_e = framework::EigenVector::Flatten(*out_dx); + + Eigen::DSizes shape(pre, n, post); + Eigen::DSizes norm_shape(pre, post); + auto x = x_e.reshape(shape); + auto dy = dy_e.reshape(shape); + auto norm = norm_e.reshape(norm_shape); + auto dx = dx_e.reshape(shape); + + framework::Tensor rsum; + rsum.mutable_data({pre, post}, ctx.GetPlace()); + auto sum = framework::EigenTensor::From(rsum); + + Eigen::DSizes rdim(1); + Eigen::DSizes bcast(1, n, 1); + Eigen::DSizes rshape(pre, 1, post); + + // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)] + // = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x)) + // = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x)) + // 1. sum = sum(x*dy) + sum.device(*place) = (x * dy).sum(rdim); + // 2. dx = x * sum + dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x; + // 3. dx / (sum(x*x) + e) + // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward. + dx.device(*place) = dx / norm.pow(2).broadcast(bcast); + // 4. [dy - dx] / sqrt(sum(x*x)) + dx.device(*place) = (dy - dx) / norm.broadcast(bcast); } }; } // namespace operators diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index d60a99994edc926456706ff6a3ba998a3e5e7dd5..be55bc43b14f1e6211f71b4080d1676838ad508c 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -135,7 +135,11 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { PoolingMode pooling_mode; if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; + if (FLAGS_cudnn_deterministic) { + pooling_mode = PoolingMode::kMaximumDeterministic; + } else { + pooling_mode = PoolingMode::kMaximum; + } } else { pooling_mode = PoolingMode::kAverage; } diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 60e936298defe7c6ce8a33bdc7de05b52eb950e7..5341187d1ce9400ac34750ab691608e76158ae0d 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,16 +18,24 @@ limitations under the License. */ namespace paddle { namespace operators { -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::pooling_forward; +using framework::DataLayout; +using mkldnn::memory; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial -static std::string gethash(memory::dims& input_dims, std::string& pooling_type, - std::vector& ksize, std::vector& strides, - std::vector& paddings, std::string suffix) { - auto dims2str = [](memory::dims& operand_dims) { +static std::string gethash(const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string& suffix) { + auto dims2str = [](const memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { dstr += std::to_string(operand_dims[i]) + "-"; @@ -52,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when saving info into device context + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -79,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + const std::string key = gethash(src_tz, pooling_type, ksize, strides, paddings, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; @@ -91,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { - // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); - auto src_md = - platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); - std::shared_ptr pool_pd = + std::shared_ptr pool_pd = CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, pooling_type, mkldnn_engine); @@ -113,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // save pool_workspace_memory to be referred in backward path dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - auto pool_src_memory_p = std::make_shared( - memory::primitive_desc{src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); - auto pool_dst_memory_p = std::make_shared( - memory::primitive_desc{dst_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + pool_p = std::make_shared(*pool_pd, *(src_memory.get()), + *(dst_memory.get()), + *workspace_memory); - pool_p = std::make_shared( - *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()), - *workspace_memory); dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; } else { // Primitives already exist auto pool_src_memory_p = @@ -137,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); PADDLE_ENFORCE(pool_dst_memory_p != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; } // push primitive to stream and wait until it's executed std::vector pipeline{*(pool_p.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } private: @@ -191,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN && + in_x->format() != memory::format::format_undef, + "Wrong layout/format set for Input X tensor"); + PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN && + out_grad->format() != memory::format::format_undef, + "Wrong layout/format set for Input output_grad tensor"); + std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -209,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + memory::format in_x_grad_format{memory::format::format_undef}; std::vector diff_src_tz = paddle::framework::vectorize2int(in_x_grad->dims()); @@ -222,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_workspace_memory = key + "@pool_workspace_memory"; + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()}, + mkldnn_engine}, + to_void_cast(out_grad_data)); + + std::shared_ptr diff_src_memory; + std::shared_ptr diff_dst_memory; + auto dst_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find dst_memory in device context"); + + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; auto pool_bwd_p = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_bwd_p)); if (pool_bwd_p == nullptr) { - auto diff_src_md = - platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto diff_dst_md = - platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + // Retrieve src_memory/dst_memory saved in forward pass + auto src_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); // Retrieve pool_pd/pool_workspace_memory from device context auto pool_pd = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_pd)); PADDLE_ENFORCE(pool_pd != nullptr, "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( + auto workspace_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_workspace_memory)); PADDLE_ENFORCE(workspace_memory != nullptr, "Fail to find workspace_memory in device context"); - auto pool_diff_src_memory_p = std::make_shared(memory( - {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); - dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); - - auto pool_diff_dst_memory_p = std::make_shared( - memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data)))); - dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + // create memory descriptors for pooling + auto diff_src_md = src_memory.get()->get_primitive_desc().desc(); + auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc(); auto pool_bwd_desc = mkldnn::pooling_backward::desc( pooling_type == "max" ? mkldnn::algorithm::pooling_max @@ -264,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( pool_bwd_desc, mkldnn_engine, *pool_pd); + // reorder between user_diff_dst and pool diff_dst if needed + diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + diff_src_memory = std::make_shared( + pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data); + + dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); + pool_bwd_p = std::make_shared( - pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, - *(pool_diff_src_memory_p)); + pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, + *(diff_src_memory)); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { // Primitives already exist - auto pool_diff_src_memory_p = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_src_mem_p)); - PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + PADDLE_ENFORCE(diff_src_memory != nullptr, "Fail to find pooling src mem_p in device context"); - auto pool_diff_dst_memory_p = std::static_pointer_cast( + diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); - PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + PADDLE_ENFORCE(diff_dst_memory != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_diff_src_memory_p->set_data_handle( - reinterpret_cast(in_x_grad_data)); - pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + + diff_src_memory->set_data_handle(reinterpret_cast(in_x_grad_data)); + diff_dst_memory->set_data_handle(const_cast(out_grad_data)); + + // reorder between user_diff_dst and pool diff_dst if needed + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } } + in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format; + // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_bwd_p.get())}; + std::vector pipeline; + if (is_diff_dst_reordered) { + pipeline.push_back(reorder_diff_dst); + } + pipeline.push_back(*(pool_bwd_p.get())); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + in_x_grad->set_layout(DataLayout::kMKLDNN); + in_x_grad->set_format(in_x_grad_format); } // Compute() }; } // namespace operators } // namespace paddle +namespace ops = paddle::operators; + REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel); REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNGradOpKernel); + ops::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index f4fb2b132fe8d59cb50f5a1f7359240ac50445fe..6707cdded4020fe3e2b01ba399dfc279a9da677d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); @@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } @@ -147,7 +151,8 @@ void Pool2dOpMaker::Make() { "The format of output tensor is also NCHW, " "where N is batch size, C is the number of channels, " "H is the height of the feature, " - "and W is the width of the feature."); + "and W is the width of the feature.") + .Reuse("X"); AddAttr("pooling_type", "(string), pooling type, can be \"max\" for max-pooling " @@ -240,7 +245,8 @@ void Pool3dOpMaker::Make() { "The format of output tensor is also NCDHW, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " - "width of the feature, respectively."); + "width of the feature, respectively.") + .Reuse("X"); AddAttr("pooling_type", "(string) Pooling type, can be \"max\" for max-pooling " diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index e0a9b24ac8978418a1a4ece62286e022bec8b834..f71ba84b318c1f8b0604310f3db8a0826124e207 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" namespace paddle { @@ -41,19 +41,19 @@ class PrefetchOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - auto rpc_client = detail::RPCClient::GetInstance(); + detail::RPCClient* rpc_client = + detail::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " << outs[i] << " back"; - rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i], - outs[i]); + rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } }; diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b14b559e31dd422f8ebe4002988a9746dfdf28a2..528a6e4a1b68fe611d104f21bafe970762611a03 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -36,11 +35,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Seed", "The random seed."); AddOutput("Out", "The cropped instance batch."); AddOutput("SeedOut", "The random seed after random cropping.") - .AsDispensable(); + .AsIntermediate(); AddAttr>("shape", "The shape of a cropped instance."); AddComment(R"DOC( - This operator takes a batch of instance, and do random cropping on each instance. - It means that cropping positions differs on each instance, which is determined + This operator takes a batch of instance, and do random cropping on each instance. + It means that cropping positions differs on each instance, which is determined by an uniform random generator. All cropped instances have the same shape, which is determined by the operator's attribute 'shape'. )DOC"); diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index 4cc7cbc6e89b0712faf9ad9c51480bce00da15f5..ecbae3894d551186f53625a6cc9cfdb36adc8d2d 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -20,7 +20,7 @@ namespace reader { class BatchReader : public framework::DecoratedReader { public: - BatchReader(ReaderBase* reader, int batch_size) + BatchReader(const std::shared_ptr& reader, int batch_size) : DecoratedReader(reader), batch_size_(batch_size) { buffer_.reserve(batch_size_); } diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 331224a59899b4a7d517ca4f7141fb5b8f4f5168..0a02fcdeaa5a6de97d59ddce4f58ad945aa2572a 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -22,7 +22,8 @@ namespace reader { class CustomReader : public framework::DecoratedReader { public: - CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block, + CustomReader(const std::shared_ptr& reader, + const framework::BlockDesc& sub_block, const std::vector& source_var_names, const std::vector& sink_var_names) : DecoratedReader(reader), diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index bc830a2b72e657f79f4c94e24428d38ff2b7c42e..5f35b9b3eac1d9aab8662833c6e39d12f11a0087 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: explicit DoubleBufferReader( - ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) + const std::shared_ptr& reader, + platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { cpu_tensor_cache_.resize(kCacheSize); gpu_tensor_cache_.resize(kCacheSize); diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 249b0b7c6dbc8b8104bce95562e6e9b2a28c77f8..19b54110b9aeece33b8d6c73612ae0e12dbfafbd 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -21,7 +21,7 @@ namespace reader { class MultiPassReader : public framework::DecoratedReader { public: - MultiPassReader(ReaderBase* reader, int pass_num) + MultiPassReader(const std::shared_ptr& reader, int pass_num) : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} void ReadNext(std::vector* out) override { diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index fd233be945932eee9f9a3c0c578a43d5b7cc83aa..57e8e21214b7c99e52550fe51a67c9b5201cb46f 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -23,7 +23,8 @@ namespace reader { class ShuffleReader : public framework::DecoratedReader { public: - ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0) + ShuffleReader(const std::shared_ptr& reader, size_t buffer_size, + size_t seed = 0) : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { VLOG(10) << "Create shuffle reader of " << reader_; if (seed_ == 0) { diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 1db70f3e9699dba604569c36dc35025dfe2c94fe..3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -21,7 +21,8 @@ namespace reader { class ThreadedReader : public framework::DecoratedReader { public: - explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {} + explicit ThreadedReader(const std::shared_ptr& reader) + : DecoratedReader(reader) {} void ReadNext(std::vector* out) override { std::lock_guard lock(mutex_); diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 8c0dac65dd691954b112bfa61622d399b2b9c3e5..31e5d81e55ed9703eb3a9ef2595fa2a280f1a734 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -26,7 +26,11 @@ class MultiFileReader : public framework::ReaderBase { MultiFileReader(const std::vector& file_names, const std::vector& dims, size_t thread_num, size_t buffer_size) - : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) { + : buffer_size_(buffer_size) { + readers_.reserve(file_names.size()); + for (const std::string& f_name : file_names) { + readers_.emplace_back(CreateReaderByFileName(f_name, dims)); + } prefetchers_.resize(thread_num); StartNewScheduler(); } @@ -40,14 +44,13 @@ class MultiFileReader : public framework::ReaderBase { void StartNewScheduler(); void EndScheduler(); void ScheduleThreadFunc(); - void PrefetchThreadFunc(std::string file_name, size_t thread_idx); + void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx); - std::vector file_names_; - std::vector dims_; + std::vector> readers_; std::thread scheduler_; std::vector prefetchers_; size_t buffer_size_; - reader::BlockingQueue* waiting_file_idx_; + reader::BlockingQueue* waiting_reader_idx_; reader::BlockingQueue* available_thread_idx_; reader::BlockingQueue>* buffer_; }; @@ -65,15 +68,15 @@ void MultiFileReader::ReInit() { void MultiFileReader::StartNewScheduler() { size_t thread_num = prefetchers_.size(); - waiting_file_idx_ = new reader::BlockingQueue(file_names_.size()); + waiting_reader_idx_ = new reader::BlockingQueue(readers_.size()); available_thread_idx_ = new reader::BlockingQueue(thread_num); buffer_ = new reader::BlockingQueue>( buffer_size_); - for (size_t i = 0; i < file_names_.size(); ++i) { - waiting_file_idx_->Send(i); + for (size_t i = 0; i < readers_.size(); ++i) { + waiting_reader_idx_->Send(i); } - waiting_file_idx_->Close(); + waiting_reader_idx_->Close(); for (size_t i = 0; i < thread_num; ++i) { available_thread_idx_->Send(i); } @@ -84,13 +87,13 @@ void MultiFileReader::StartNewScheduler() { void MultiFileReader::EndScheduler() { available_thread_idx_->Close(); buffer_->Close(); - waiting_file_idx_->Close(); + waiting_reader_idx_->Close(); if (scheduler_.joinable()) { scheduler_.join(); } delete buffer_; delete available_thread_idx_; - delete waiting_file_idx_; + delete waiting_reader_idx_; } void MultiFileReader::ScheduleThreadFunc() { @@ -102,12 +105,11 @@ void MultiFileReader::ScheduleThreadFunc() { if (prefetcher.joinable()) { prefetcher.join(); } - size_t file_idx; - if (waiting_file_idx_->Receive(&file_idx)) { + size_t reader_idx; + if (waiting_reader_idx_->Receive(&reader_idx)) { // Still have files to read. Start a new prefetch thread. - std::string file_name = file_names_[file_idx]; - prefetcher = std::thread([this, file_name, thread_idx] { - PrefetchThreadFunc(file_name, thread_idx); + prefetcher = std::thread([this, reader_idx, thread_idx] { + PrefetchThreadFunc(reader_idx, thread_idx); }); } else { // No more file to read. @@ -129,23 +131,22 @@ void MultiFileReader::ScheduleThreadFunc() { VLOG(5) << "MultiFileReader schedule thread terminates."; } -void MultiFileReader::PrefetchThreadFunc(std::string file_name, - size_t thread_idx) { - VLOG(5) << "The prefetch thread of file '" << file_name << "' starts."; - std::unique_ptr reader = - CreateReaderByFileName(file_name, dims_); +void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) { + VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts."; + std::unique_ptr& reader = readers_[reader_idx]; while (true) { std::vector ins; reader->ReadNext(&ins); if (ins.empty()) { + reader->ReInit(); break; } try { buffer_->Send(std::move(ins)); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " - "thread of file '" - << file_name << "' will terminate."; + "thread of file idx '" + << reader_idx << "' will terminate."; break; } } @@ -154,7 +155,8 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name, VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. " "Fail to send thread_idx."; } - VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates."; + VLOG(5) << "The prefetch thread of file idx '" << reader_idx + << "' terminates."; } class OpenFilesOp : public framework::OperatorBase { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index d8ddb7b448910b5e0e6e71742eb2fdc6a225c919..15dfb5469bf51330b98d6699fb3ce708222212ed 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -44,14 +43,15 @@ class RecvOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + detail::RPCClient* rpc_client = + detail::RPCClient::GetInstance(); for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); } if (sync_mode) { - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } } }; @@ -77,9 +77,15 @@ This operator can get variables from server side. } }; +class RecvOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); +REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker, + ops::RecvOpMaker, ops::RecvOpShapeInference); diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..95d3768e1fdf6947659c7b3a1c9d57fad741472a --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_max); +REGISTER_OP_CPU_KERNEL( + reduce_max, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d86b3127e42f7ee14ba57b1c762e8128a0f2d54 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_max, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc258c2496340b47d24dc89f16f7419dbb4b0d95 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_REDUCE_OP(reduce_mean); +REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1359679c4767d2032bf3e3a90849ad2a2ef3e829 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec59f3e71c1c702655a3feed10935b2f5a29d8a8 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_max_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..330a86d2e4237a10d8cf6fd40025540edf08d897 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_min); +REGISTER_OP_CPU_KERNEL( + reduce_min, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..da466f805eff4709dc23471baef03e94052ee6c1 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_min, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc deleted file mode 100644 index e293fd5e410b2a34b3c71ea674607ba9d7654535..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_op.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; - -class ReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReduceOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReduceOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim( - "Out", framework::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - auto out_dims = framework::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } -}; - -class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. - -)DOC", - GetOpType(), GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \ - paddle::framework::DefaultGradOpDescMaker); \ - REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp) - -REGISTER_REDUCE_OP(sum); -REGISTER_REDUCE_OP(mean); -REGISTER_REDUCE_OP(max); -REGISTER_REDUCE_OP(min); -REGISTER_REDUCE_OP(prod); - -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL(reduce_type, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu deleted file mode 100644 index ae29587f55847315b1d84f1344677e753fe01a9b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_op.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/fluid/operators/reduce_op.h" - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type, ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index cd19cc1460a6b4d4201f21f6f27f988c1547b88a..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -14,105 +14,20 @@ limitations under the License. */ #pragma once +#include +#include #include -#include "glog/logging.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/reduce_op_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; -template -using EigenTensor = framework::EigenTensor; -template -using EigenScalar = framework::EigenScalar; -template -using EigenVector = framework::EigenVector; - -struct SumFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); - } -}; - -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -struct MaxFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->maximum(dim); - } -}; - -struct MinFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->minimum(dim); - } -}; - -struct MaxOrMinGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - // If there are multiple minimum or maximum elements, the subgradient of - // each is the set [0, 1], and we pass gradient to all of them here. - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); - } -}; - -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - -struct ProdGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); - } -}; - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - ReduceCompute(context); \ +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + dims, keep_dim); \ } template @@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + if (reduce_all) { // Flatten and reduce 1-D tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input); auto out = EigenScalar::From(*output); auto& place = @@ -133,17 +52,18 @@ class ReduceKernel : public framework::OpKernel { Functor functor; functor(place, &x, &out, reduce_dim); } else { - int ndim = context.Input("X")->dims().size(); - int rdim = context.Attr>("dim").size(); - HANDLE_DIM(6, 5); - HANDLE_DIM(6, 4); - HANDLE_DIM(6, 3); - HANDLE_DIM(6, 2); - HANDLE_DIM(6, 1); - HANDLE_DIM(5, 4); - HANDLE_DIM(5, 3); - HANDLE_DIM(5, 2); - HANDLE_DIM(5, 1); + int ndim = input->dims().size(); + int rdim = dims.size(); + // comments for accelerating compiling temporarily. + // HANDLE_DIM(6, 5); + // HANDLE_DIM(6, 4); + // HANDLE_DIM(6, 3); + // HANDLE_DIM(6, 2); + // HANDLE_DIM(6, 1); + // HANDLE_DIM(5, 4); + // HANDLE_DIM(5, 3); + // HANDLE_DIM(5, 2); + // HANDLE_DIM(5, 1); HANDLE_DIM(4, 3); HANDLE_DIM(4, 2); HANDLE_DIM(4, 1); @@ -153,48 +73,6 @@ class ReduceKernel : public framework::OpKernel { HANDLE_DIM(1, 1); } } - - private: - template - void ReduceCompute(const framework::ExecutionContext& context) const { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - auto x = EigenTensor::From(*input); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto reduce_dim = Eigen::array(); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduce_dim[i] = dims[i]; - } - // construct the squeezed output tensor - bool keep_dim = context.Attr("keep_dim"); - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = vectorize(out_dims); - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = framework::make_ddim(dims_vector); - } - auto& place = - *context.template device_context().eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } - } }; template @@ -202,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + if (reduce_all) { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); auto x_reduce_grad = EigenVector::From(*input2); @@ -220,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel { functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { - int rank = context.Input("X")->dims().size(); + int rank = input0->dims().size(); switch (rank) { case 1: - ReduceGradCompute<1>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 2: - ReduceGradCompute<2>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 3: - ReduceGradCompute<3>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 4: - ReduceGradCompute<4>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 5: - ReduceGradCompute<5>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 6: - ReduceGradCompute<6>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; } } } +}; - private: - template - void ReduceGradCompute(const framework::ExecutionContext& context) const { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - output->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(*input0); - auto x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto x_dims = input0->dims(); - auto reduced_dims_v = vectorize(x_dims); - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - int broad_cats_times = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduced_dims_v[dims[i]] = 1; - broadcast_dim[dims[i]] = x_dims[dims[i]]; - broad_cats_times *= x_dims[dims[i]]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } - auto reduced_dims = framework::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(*input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(*input2, reduced_dims); + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +%s Operator. - auto& place = - *context.template device_context().eigen_device(); +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. - Functor functor; - functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); +)DOC", + GetOpType(), GetName())); } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; }; } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - __macro(reduce_sum, SumFunctor, SumGradFunctor); \ - __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ - __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_prod, ProdFunctor, ProdGradFunctor); +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_OP(op_name) \ + class __##op_name##Maker__ : public ops::ReduceOpMaker { \ + protected: \ + virtual std::string GetName() const { return #op_name; } \ + virtual std::string GetOpType() const { return "Reduce " #op_name; } \ + }; \ + REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ + paddle::framework::DefaultGradOpDescMaker); \ + REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h new file mode 100644 index 0000000000000000000000000000000000000000..3da27bc8ac8d448471b9ff3779ac6aca59fac523 --- /dev/null +++ b/paddle/fluid/operators/reduce_op_function.h @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + auto x_rank = static_cast(x.dimensions().size()); + auto reduce_dim = Eigen::array(); + std::vector dims_ref = dims; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; + reduce_dim[i] = dims_ref[i]; + } + // construct the squeezed output tensor + DDim out_dims = output->dims(); + if (keep_dim && x_rank > 1) { + const int kDelFlag = -2; + auto dims_vector = framework::vectorize(out_dims); + for (size_t i = 0; i < dims_ref.size(); ++i) { + dims_vector[dims_ref[i]] = kDelFlag; + } + dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = framework::make_ddim(dims_vector); + } + auto& place = *context.eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, &x, &out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, out_dims); + functor(place, &x, &out, reduce_dim); + } +} + +template +void ReduceGradFunctor(const DeviceContext& context, + const framework::Tensor& input0, + const framework::Tensor& input1, + const framework::Tensor& input2, + framework::Tensor* output, + const std::vector& dims) { + auto x = EigenTensor::From(input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + auto x_dims = input0.dims(); + auto reduced_dims_v = framework::vectorize(x_dims); + std::vector dims_ref = dims; + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + + int broad_cats_times = 1; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) { + dims_ref[i] = x_rank + dims_ref[i]; + } + reduced_dims_v[dims_ref[i]] = 1; + broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; + broad_cats_times *= x_dims[dims_ref[i]]; + } + auto reduced_dims = framework::make_ddim(reduced_dims_v); + auto x_reduce = EigenTensor::From(input1, reduced_dims); + auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); + + auto& place = *context.eigen_device(); + + Functor functor; + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, + broad_cats_times); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..713728b99757a6f3bb128f665d5576ac64eef8ec --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_REDUCE_OP(reduce_prod); +REGISTER_OP_CPU_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_prod_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d62e677d92cffecf629d1684026b0c7bcfec29e3 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97748113e092719aceed9d806ca6242077111532 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5b5398787b44e658b0f8390162df0e6c3006651 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_REDUCE_OP(reduce_sum); +REGISTER_OP_CPU_KERNEL( + reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e67d7e1da5f0244d2dee346873692a80cbad2fc4 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a20f7d231fa9ea313581ac0629a87fa5f4a88ce5 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reverse_op.h" +#include + +namespace paddle { +namespace operators { + +class ReverseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + const auto& axis = ctx->Attrs().Get>("axis"); + PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty."); + for (int a : axis) { + PADDLE_ENFORCE_LT(a, x_dims.size(), + "The axis must be less than input tensor's rank."); + } + ctx->SetOutputDim("Out", x_dims); + } +}; + +class ReverseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The LoDTensor to be flipped."); + AddOutput("Out", "The LoDTensor after flipping."); + AddAttr>( + "axis", "The axises that along which order of elements is reversed."); + AddComment(R"DOC( + Reverse Operator. + + Reverse the order of elements in the input LoDTensor along given axises. + + Case 1: + Given + X = [[1, 2, 3, 4, 5] + [6, 7, 8, 9, 10] + [11, 12, 13, 14, 15]], + and + axis = [0], + we get: + Out = [[11, 12, 13, 14, 15] + [6, 7, 8, 9, 10] + [1, 2, 3, 4, 5]]. + + Case 2: + Given + X = [[[1, 2, 3, 4] + [5, 6, 7, 8]] + [[9, 10, 11, 12] + [13, 14, 15, 16]]], + and + axis = [0, 2], + we get: + Out = [[[12, 11, 10, 9] + [16, 15, 14, 13]] + [[4, 3, 2, 1] + [8, 7, 6, 5]]], + )DOC"); + } +}; + +class ReverseGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("reverse"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", GetAttr("axis")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker, + ops::ReverseGradMaker); +REGISTER_OPERATOR(reverse_grad, ops::ReverseOp); +REGISTER_OP_CPU_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..635c41529b38f2dd287b00ed2e5659e11f619e78 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reverse_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9063cd59bba5c6307b55a500455908a5fd278390 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +struct ReverseFunctor { + void operator()(const DeviceContext& context, const framework::LoDTensor& in, + framework::LoDTensor* out, const std::vector& axis) { + Eigen::array reverse_axis; + for (int i = 0; i < Rank; ++i) { + reverse_axis[i] = false; + } + for (int a : axis) { + reverse_axis[a] = true; + } + + auto in_eigen = framework::EigenTensor::From(in); + auto out_eigen = framework::EigenTensor::From(*out); + auto* dev = context.eigen_device(); + + out_eigen.device(*dev) = in_eigen.reverse(reverse_axis); + } +}; + +template +class ReverseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + const auto& axis = context.Attr>("axis"); + int rank = x->dims().size(); + auto& dev_ctx = context.template device_context(); + + switch (rank) { + case 1: + ReverseFunctor functor1; + functor1(dev_ctx, *x, out, axis); + break; + case 2: + ReverseFunctor functor2; + functor2(dev_ctx, *x, out, axis); + break; + case 3: + ReverseFunctor functor3; + functor3(dev_ctx, *x, out, axis); + break; + case 4: + ReverseFunctor functor4; + functor4(dev_ctx, *x, out, axis); + break; + case 5: + ReverseFunctor functor5; + functor5(dev_ctx, *x, out, axis); + break; + case 6: + ReverseFunctor functor6; + functor6(dev_ctx, *x, out, axis); + break; + default: + PADDLE_THROW( + "Reserve operator doesn't supports tensors whose ranks are greater " + "than 6."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index bcd8e81609a37cc544f5a5cc4188400c1632a668..c6c975a23ce846464388c72af5d8902144ceb16a 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -44,18 +44,19 @@ class SendBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + detail::RPCClient* rpc_client = + detail::RPCClient::GetInstance(); VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; // need to wait before sending send_barrier message - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); if (sync_mode) { for (auto& ep : eps) { VLOG(3) << "send barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } - PADDLE_ENFORCE(rpc_client->Wait()); + rpc_client->Wait(); } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index a5150f242ca3b0befafa2443f0bc466e2aea85e4..84ec36625314572d16e5c537884b6efec420cc60 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -16,10 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { auto ins = Inputs("X"); - auto outs = Outputs("Out"); - std::vector epmap = Attr>("epmap"); - std::vector endpoints = - Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); + std::vector epmap = Attr>("epmap"); + int sync_send = Attr("sync_mode"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); @@ -49,38 +45,21 @@ class SendOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - auto rpc_client = detail::RPCClient::GetInstance(); + detail::RPCClient* rpc_client = + detail::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + // TODO(Yancey1989): we need to use an IO threadpool which has + // a larger number of threads than the computing threadpool. + rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait()); - - if (sync_mode) { - for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - PADDLE_ENFORCE(rpc_client->Wait()); - } - - if (outs.size() > 0) { - for (size_t i = 0; i < outs.size(); i++) { - VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); - } - PADDLE_ENFORCE(rpc_client->Wait()); - // tell pservers that current trainer have called fetch - for (auto& ep : endpoints) { - VLOG(2) << "send fetch barrier, ep: " << ep; - rpc_client->AsyncSendFetchBarrier(ep); - } - PADDLE_ENFORCE(rpc_client->Wait()); + if (sync_send) { + rpc_client->Wait(); } } }; @@ -88,26 +67,22 @@ class SendOp : public framework::OperatorBase { class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); - AddOutput("Out", "(Tensor) Output tensor to be received from server") + AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); AddComment(R"DOC( Send operator -This operator will send tensor to recv_op at the parameter server. +This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - // TODO(typhoonzero): remove this attr generate de-duplicated vector from - // epmap when initializing. - AddAttr>("endpoints", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints to send variables to.") - .SetDefault({}); + AddAttr("sync_mode", + "(int, default 0)" + "sync send or async send.") + .SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" "Server endpoints in the order of input " "variables for mapping") - .SetDefault({}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); + .SetDefault({"127.0.0.1:6164"}); } }; diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc deleted file mode 100644 index fe839dab6924618c8a4c39868d9bf86056a0be40..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/send_vars_op.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/send_recv_util.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { - -class SendVarsOp : public framework::OperatorBase { - public: - SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto ins = Inputs("X"); - - std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_send"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - - auto rpc_client = detail::RPCClient::GetInstance(); - - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; - } - } - if (sync_send) { - rpc_client->Wait(); - } - } -}; - -class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") - .AsDuplicable(); - AddComment(R"DOC( -Send operator - -This operator will send variables to listen_and_serve op at the parameter server. -)DOC"); - AddAttr("sync_send", - "(int, default 0)" - "sync send or async send.") - .SetDefault(0); - AddAttr>("epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input " - "variables for mapping") - .SetDefault({"127.0.0.1:6164"}); - } -}; - -class SendVarsOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(send_vars, ops::SendVarsOp, - paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, - ops::SendVarsOpShapeInference); diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 7a2bdeac09d61603f437ff10d58d0542bb3c3689..fef230e42d07a5ed73b7a7a6ab682694675bb9d2 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -74,7 +74,8 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Grad", "(Tensor or SelectedRows) Input gradient"); AddOutput("ParamOut", "(Tensor or SelectedRows, same with Param) " - "Output parameter, should share the same memory with Param"); + "Output parameter, should share the same memory with Param") + .Reuse("Param"); AddComment(R"DOC( SGD operator diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index f9e0596191d0b86686e0fa36265806111c774b38..2685ce217ee0f0d3e89f3751e96218dcd19bead4 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -114,7 +114,7 @@ class SGDOpKernel : public framework::OpKernel { int64_t id_index = param.Index(grad.rows()[i]); PADDLE_ENFORCE_GE(id_index, static_cast(0), "id should be in the table"); - for (size_t j = 0; j < grad_row_width; j++) { + for (int64_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; } diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..61bb445e8b4c6a71e9b1a6a0bcf02a31ab271d0a --- /dev/null +++ b/paddle/fluid/operators/slice_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/slice_op.h" +#include +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class SliceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input (Input) of slice op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output (Out) of slice op should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(in_dims.size() < 7, + "The rank of input should be less than 7."); + framework::DDim out_dims(in_dims); + auto axes = ctx->Attrs().Get>("axes"); + auto starts = ctx->Attrs().Get>("starts"); + auto ends = ctx->Attrs().Get>("ends"); + + PADDLE_ENFORCE_EQ(starts.size(), ends.size()); + PADDLE_ENFORCE_EQ(starts.size(), axes.size()); + int dim_value, start, end; + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = out_dims[axes[i]]; + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + start = std::min(start, dim_value); + end = std::min(end, dim_value); + start = std::min(start, end); + out_dims[axes[i]] = end - start; + } + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +class SliceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "Tensor of data to extract slices from."); + AddOutput("Out", "Sliced data tensor."); + + AddAttr>( + "axes", + "(list) Axes that `starts` and `ends` apply to. It's optional." + "If not present, will be treated as [0, 1, ..., len(`starts`) - 1]."); + AddAttr>( + "starts", + "(list) Starting indices of corresponding axis in `axes`"); + AddAttr>( + "ends", + "(list) Starting indices of corresponding axis in `axes`."); + + AddComment(R"DOC( +Slice Operator. + +Produces a slice of the input tensor along multiple axes. Similar to numpy: +https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html +Slice uses `axes`, `starts` and `ends` attributes to specify the start and +end dimension for each axis in the list of axes, it uses this information +to slice the input data tensor. If a negative value is passed for any of +the start or end indices, it represents number of elements before the end +of that dimension. If the value passed to start or end is larger than +the n (the number of elements in this dimension), it represents n. +For slicing to the end of a dimension with unknown size, it is recommended +to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1]. + + Example 1: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [1, 0] + ends = [2, 3] + Then: + result = [ [5, 6, 7], ] + + Example 2: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + starts = [0, 1] + ends = [-1, 1000] + Then: + result = [ [2, 3, 4], ] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c1767c70b19d1386af9610ef3405eb487a39878 --- /dev/null +++ b/paddle/fluid/operators/slice_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/slice_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ba231aee176564b91a642912ce0b32bcdef8cfc1 --- /dev/null +++ b/paddle/fluid/operators/slice_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SliceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int rank = ctx.Input("Input")->dims().size(); + switch (rank) { + case 1: + SliceCompute<1>(ctx); + break; + case 2: + SliceCompute<2>(ctx); + break; + case 3: + SliceCompute<3>(ctx); + break; + case 4: + SliceCompute<4>(ctx); + break; + case 5: + SliceCompute<5>(ctx); + break; + case 6: + SliceCompute<6>(ctx); + break; + } + } + + private: + template + void SliceCompute(const framework::ExecutionContext& context) const { + auto& place = + *context.template device_context().eigen_device(); + auto in = context.Input("Input"); + auto out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + auto out_dims = out->dims(); + auto in_dims = in->dims(); + auto axes = context.Attr>("axes"); + auto starts = context.Attr>("starts"); + + auto offsets = Eigen::array(); + auto extents = Eigen::array(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = out_dims[i]; + } + int start; + for (size_t i = 0; i < axes.size(); ++i) { + start = starts[i]; + if (start < 0) { + start = (start + in_dims[axes[i]]); + } + start = std::max(start, 0); + offsets[axes[i]] = start; + } + auto in_t = + framework::EigenTensor::From( + *in); + auto out_t = + framework::EigenTensor::From( + *out); + out_t.device(place) = in_t.slice(offsets, extents); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cc256aa627bdda0609f496cab93a2dec7d95f348..847b3cbd1bd416ae1326211c98ba9d145c103298 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { "float16 can only be used on GPU place"); } - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::StringToDataLayout(data_format), + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } }; @@ -81,7 +83,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of softmax. " "2-D with shape [batch_size, input_feature_dimensions]."); - AddOutput("Out", "The normalized values with the same shape as X."); + AddOutput("Out", "The normalized values with the same shape as X.") + .Reuse("X"); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") @@ -142,6 +145,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index bcc5e22d4a77349e7cde9a43b83f23d4c867d994..863baba9ea7663d0b21875e0b423dc4a6ce2d59a 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -115,7 +115,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); - AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X"); AddComment(R"DOC( Sum operator. diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 855157e7c4c5c4a43091d28d3a5414e6e386b727..4b1208c4376b48e25866fc510f3a6d2ea06e7610 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -17,23 +17,93 @@ #include "paddle/fluid/operators/tensorrt_engine_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace operators { +using inference::Singleton; +using inference::tensorrt::TRT_EngineManager; + +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + + switch (shape.size()) { + case 2: + return nvinfer1::Dims2(shape[0], shape[1]); + case 3: + return nvinfer1::Dims3(shape[0], shape[1], shape[2]); + case 4: + return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); + default: + return nvinfer1::Dims(); + } + return nvinfer1::Dims(); +} + +} // namespace + template void paddle::operators::TensorRTEngineKernel::Prepare( const framework::ExecutionContext &context) const { + VLOG(4) << "Prepare engine"; // Get the ProgramDesc and pass to convert. - const auto &block = context.Attr("subgraph"); + framework::proto::BlockDesc block_desc; + block_desc.ParseFromString(context.Attr("subgraph")); max_batch_ = context.Attr("max_batch"); auto max_workspace = context.Attr("max_workspace"); - engine_.reset(new inference::tensorrt::TensorRTEngine( - max_batch_, max_workspace, nullptr)); + engine_ = Singleton::Global().Create( + max_batch_, max_workspace, &stream_); + engine_->InitNetwork(); + + framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + // Add inputs + VLOG(4) << "declare inputs"; + for (auto &input : context.Inputs("Xs")) { + VLOG(4) << "declare input " << input; + auto *var = block.FindVar(input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto shape = var->GetShape(); + engine_->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(var->GetShape())); + } + // TODO(Superjomn) parameters should be passed after analysised from outside. inference::Singleton::Global().ConvertBlock( - block, {}, context.scope(), engine_.get()); + block_desc, {}, context.scope(), engine_); + + // Add outputs + VLOG(4) << "declare outputs"; + for (auto &output : context.Outputs("Ys")) { + VLOG(4) << "declare output " << output; + engine_->DeclareOutput(output); + } + engine_->FreezeNetwork(); } @@ -42,7 +112,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); - AddAttr("subgraph", "the subgraph"); + AddAttr("subgraph", "the subgraph."); + AddAttr("max_batch", "the maximum batch size."); + AddAttr("max_workspace", "the maximum batch size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index fe273d386c529be3df05a955f492e2c39d4d8812..4b089601ff76eedd87bb3a52a38c4d22d4a94bf6 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -32,9 +32,12 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { + auto input0 = ctx.Inputs("Xs").front(); framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType( - ctx.Input("pre_ids")->type()), + framework::ToDataType(ctx.scope() + .FindVar(input0) + ->GetMutable() + ->type()), platform::CPUPlace()); return kt; } @@ -50,17 +53,16 @@ class TensorRTEngineKernel : public framework::OpKernel { auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); // Try to determine a batch_size - auto* tensor0 = context.Input(input_names.front()); - PADDLE_ENFORCE_NOT_NULL(tensor0); - int batch_size = tensor0->dims()[0]; + auto& tensor0 = inference::analysis::GetFromScope( + context.scope(), input_names.front()); + int batch_size = tensor0.dims()[0]; PADDLE_ENFORCE_LE(batch_size, max_batch_); // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { // convert input and copy to TRT engine's buffer - auto* v = context.scope().FindVar(x); - PADDLE_ENFORCE_NOT_NULL(v, "no variable called %s", x); - auto& t = v->Get(); + auto& t = inference::analysis::GetFromScope( + context.scope(), x); if (platform::is_cpu_place(t.place())) { engine_->SetInputFromCPU(x, static_cast(t.data()), t.memory_size()); @@ -86,13 +88,18 @@ class TensorRTEngineKernel : public framework::OpKernel { fluid_t->Resize(framework::make_ddim(ddim)); auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); if (platform::is_cpu_place(fluid_t->place())) { + // TODO(Superjomn) change this float to dtype size. engine_->GetOutputInCPU( - y, fluid_t->mutable_data(platform::CPUPlace()), size); + y, fluid_t->mutable_data(platform::CPUPlace()), + size * sizeof(float)); } else { engine_->GetOutputInGPU( - y, fluid_t->mutable_data(platform::CUDAPlace()), size); + y, fluid_t->mutable_data(platform::CUDAPlace()), + size * sizeof(float)); } } + + cudaStreamSynchronize(stream_); } protected: @@ -100,7 +107,8 @@ class TensorRTEngineKernel : public framework::OpKernel { void Prepare(const framework::ExecutionContext& context) const; private: - mutable std::unique_ptr engine_; + mutable cudaStream_t stream_; + mutable inference::tensorrt::TensorRTEngine* engine_{nullptr}; mutable int max_batch_{0}; }; diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f383de259b270038c32296b59007f6c7d895f12 --- /dev/null +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +USE_CPU_ONLY_OP(tensorrt_engine); + +namespace paddle { +namespace operators { + +namespace { +void CreateCPUTensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { + auto* var = scope->Var(name); + auto* tensor = var->GetMutable(); + auto dims = framework::make_ddim(shape); + tensor->Resize(dims); + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + inference::tensorrt::RandomizeTensor(tensor, place, ctx); +} + +void AddTensorToBlockDesc(framework::proto::BlockDesc* block, + const std::string& name, + const std::vector& shape) { + using framework::proto::VarType; + auto* var = block->add_vars(); + framework::VarDesc desc(name); + desc.SetType(VarType::LOD_TENSOR); + desc.SetDataType(VarType::FP32); + desc.SetShape(shape); + *var = *desc.Proto(); +} + +template +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const T& data); + +template <> +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const std::string& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s(data); +} +template <> +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const int& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(data); +} +template <> +void SetAttr(framework::proto::OpDesc* op, const std::string& name, + const int64_t& data) { + auto* attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONG); + attr->set_l(data); +} + +} // namespace + +TEST(TensorRTEngineOp, manual) { + framework::ProgramDesc program; + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + LOG(INFO) << "create mul op"; + auto* mul = block_desc.AppendOp(); + mul->SetType("mul"); + mul->SetInput("X", std::vector({"x"})); // 2 x 4 + mul->SetInput("Y", std::vector({"y"})); // 4 x 6 + mul->SetOutput("Out", std::vector({"z"})); // 2 x 6 + + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({"z"})); + fc->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc->SetOutput("Out", std::vector({"z0"})); // 2 x 8 + + // Set inputs' variable shape in BlockDesc + AddTensorToBlockDesc(block_, "x", std::vector({2, 4})); + AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); + AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); + AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); + + // It is wired, need to copy manually. + *block_->add_ops() = *mul->Proto(); + *block_->add_ops() = *fc->Proto(); + + ASSERT_EQ(block_->ops_size(), 2); + + LOG(INFO) << "create tensorrt desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("tensorrt_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x", "y", "y0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + SetAttr(engine_op_desc.Proto(), "subgraph", + block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch", 30); + SetAttr(engine_op_desc.Proto(), "max_workspace", 1 << 10); + + LOG(INFO) << "create engine op"; + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + // Prepare variables. + CreateCPUTensor(&scope, "x", std::vector({2, 4})); + CreateCPUTensor(&scope, "y", std::vector({4, 6})); + CreateCPUTensor(&scope, "z", std::vector({2, 6})); + + CreateCPUTensor(&scope, "y0", std::vector({6, 8})); + CreateCPUTensor(&scope, "z0", std::vector({2, 8})); + + // Execute them. + LOG(INFO) << "engine_op run"; + engine_op->Run(scope, place); +} + +} // namespace operators +} // namespace paddle + +USE_TRT_CONVERTER(mul) +USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index a845ba2eb038fa6a8e70dfbac06c31c19dbb9e3e..5015b1005569ba70b147ebb795243e24ab81ea5c 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -20,8 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/send_recv_util.h" +#endif + USE_NO_KERNEL_OP(listen_and_serv); namespace f = paddle::framework; @@ -37,7 +40,7 @@ namespace m = paddle::operators::math; namespace detail = paddle::operators::detail; namespace string = paddle::string; -std::unique_ptr g_rpc_service; +std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; void StartServer() { @@ -58,10 +61,9 @@ void StartServer() { g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); g_rpc_service->SetCond(detail::kRequestSend); - std::cout << "before WaitFanInOfSend" << std::endl; g_rpc_service->WaitBarrier(detail::kRequestSend); LOG(INFO) << "got nccl id and stop server..."; @@ -69,9 +71,9 @@ void StartServer() { server_thread.join(); } -TEST(SendNcclId, GrpcServer) { +TEST(SendNcclId, RPCServer) { g_req_handler.reset(new detail::RequestSendHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); @@ -88,12 +90,14 @@ TEST(SendNcclId, GrpcServer) { int port = g_rpc_service->GetSelectedPort(); std::string ep = string::Sprintf("127.0.0.1:%d", port); - detail::RPCClient client; + + detail::RPCClient* client = detail::RPCClient::GetInstance(); + LOG(INFO) << "connect to server" << ep; - client.AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME); - client.Wait(); - client.AsyncSendBatchBarrier(ep); - client.Wait(); + client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); + client->Wait(); + client->AsyncSendBatchBarrier(ep); + client->Wait(); server_thread.join(); g_rpc_service.reset(nullptr); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc309c65035063348d4934ea1783b018ed..4a8ac441cfaf642fde58ee30865a22e83c065498 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); - AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( Top K operator diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 123d3598f4f4753f70889e415aff0f41b7d212f7..2ce9b31bb81de867ff4ed6ee14afddecd95317b9 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -17,7 +17,7 @@ limitations under the License. */ #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) -#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG) +#if defined(__CUDA_ARCH__) #include #define PADDLE_ASSERT(e) \ do { \ @@ -38,6 +38,9 @@ limitations under the License. */ } while (0) #else #include -#define PADDLE_ASSERT(e) assert(e) +// For cuda, the assertions can affect performance and it is therefore +// recommended to disable them in production code +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion +#define PADDLE_ASSERT(e) assert((e)) #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) #endif diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 4fc9aae8e36e9b43d65fab0b92ec3a2549057128..40dc7c9a0b6a40f2419ace3ce7e0e5e82bc95c1a 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -21,12 +21,17 @@ limitations under the License. */ #include #endif +#include #include "gflags/gflags.h" DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); +DEFINE_uint64( + initial_cpu_memory_in_mb, 500, + "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit."); + DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," @@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); + return std::min( + static_cast(FLAGS_fraction_of_cpu_memory_to_use * + CpuTotalPhysicalMemory()), + static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); } size_t CpuMinChunkSize() { diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index c0d399d078f73743836fc2a0c1d4b1e6b31ecd83..6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" +DECLARE_bool(cudnn_deterministic); + namespace paddle { namespace platform { @@ -76,8 +78,44 @@ enum class DataLayout { // Not use enum class PoolingMode { kMaximum, kAverage, + kMaximumDeterministic, }; +#if CUDNN_VERSION < 6000 +#pragma message "CUDNN version under 6.0 is supported at best effort." +#pragma message "We strongly encourage you to move to 6.0 and above." +#pragma message "This message is intended to annoy you enough to update." +#pragma message \ + "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/" + +inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return CUDNN_POOLING_MAX; + case PoolingMode::kAverage: + return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + case PoolingMode::kMaximum: + return CUDNN_POOLING_MAX; + default: + PADDLE_THROW("Unexpected pooling mode."); + } +} +#else + +inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return CUDNN_POOLING_MAX_DETERMINISTIC; + case PoolingMode::kAverage: + return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + case PoolingMode::kMaximum: + return CUDNN_POOLING_MAX; + default: + PADDLE_THROW("Unexpected pooling mode."); + } +} +#endif // CUDNN_VERSION < 6000 + template class CudnnDataType; @@ -293,9 +331,7 @@ class ScopedPoolingDescriptor { PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( - desc_, (mode == PoolingMode::kMaximum - ? CUDNN_POOLING_MAX - : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING), + desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. kernel.size(), kernel.data(), pads.data(), strides.data())); return desc_; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 1f733d71bdfb777d4a2f316a5fefc3c874879862..6c50ab2685c56bafe146c67fe2ef081ee4c55628 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -175,7 +175,6 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - std::lock_guard guard(mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a9c1984616bc731e0557f2cb89282423aa9c3bac..292ffef1aef12732812b8c5b0020cad73b1d06fc 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include // NOLINT #include #include #include @@ -100,7 +101,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { - std::lock_guard guard(mutex_); + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -110,8 +111,6 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; - - mutable std::recursive_mutex mutex_; cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; @@ -119,6 +118,8 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; + + std::mutex mtx_; }; template <> diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 1a9be044e024e4b1dda5ef7d515c65f3a7513710..d9e2afadaf8ec439d158e57c94d3e6e684bce116 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer { DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); - PADDLE_ENFORCE(dynload::cuptiFinalize()); enabled_ = false; } diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 81acaff87d3c2025cf0d6185a1590b018bfbd83c..25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -45,7 +45,7 @@ extern void *cublas_dso_handle; std::call_once(cublas_dso_flag, []() { \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ }); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 34d83e395694f55eafca74d63ebf363169ab30e8..77e46fa768b62c277d7b4027de7173e39a5672b4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -39,7 +39,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ }); \ EnforceCUDNNLoaded(#__name); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index e64de7c20fc9d145e51cfc4528e321b3c4ec86c8..e8f4a82ef132be9e4ec3fb76f11766046a2ff638 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -45,7 +45,7 @@ extern void *cupti_dso_handle; std::call_once(cupti_dso_flag, []() { \ cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \ }); \ - void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + static void *p_##__name = dlsym(cupti_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ @@ -72,7 +72,6 @@ extern void *cupti_dso_handle; __macro(cuptiGetResultString); \ __macro(cuptiActivityGetNumDroppedRecords); \ __macro(cuptiActivityFlushAll); \ - __macro(cuptiFinalize); \ __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 46ad4379d5f9572d415ef1d747077217ae29391e..5b9e0820e0b319fe7a636a57a0029caf038b4db3 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -34,7 +34,7 @@ extern void *curand_dso_handle; std::call_once(curand_dso_flag, []() { \ curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ }); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ + static void *p_##__name = dlsym(curand_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index 37902ae20c5d9d64486232bbd468375c4a50a615..575516f81870fc9f7b92919ffc20a201cb5cbce8 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -37,7 +37,7 @@ extern void* nccl_dso_handle; std::call_once(nccl_dso_flag, []() { \ nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \ }); \ - void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + static void* p_##__name = dlsym(nccl_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index f584a49da0fefe0b064b5fb55b01ec132225ce5e..5d67658b94af75680a100e13eed7b6b052162e00 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -40,7 +40,7 @@ extern void* tensorrt_dso_handle; paddle::platform::dynload::GetTensorRtDsoHandle(); \ PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \ }); \ - void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ PADDLE_ENFORCE(p_##__name, "load %s failed", #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 7c70649d21c547beb824576d4a8ecf6219a9bddf..d157c1fda789b98f06ad069d2a9c4f421ff82dcd 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -40,7 +40,7 @@ extern void* warpctc_dso_handle; std::call_once(warpctc_dso_flag, []() { \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ }); \ - void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + static void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ return reinterpret_cast(p_##_name)(args...); \ } \ }; \ diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f1187620d81ff3bc1deef2106edb54d6199fa927..de711b7d23ef01d57a62087c552ea090f01f0386 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { @@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::f32; } +inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { + auto reorder_prim = mkldnn::reorder(src, dst); + std::vector pipeline; + pipeline.push_back(reorder_prim); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { + return static_cast( + memory.get_primitive_desc().desc().data.format); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6f8e3f22db54d166cf97cfdd3d009058207a7ca5..cc46c88fd1f9a5d1bacad26beed6fd0af6405310 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { } } +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// NCCL actions when use it. class NCCLGroupGuard { public: static std::mutex &NCCLMutex() { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3af8941be69fe507bc105e26b608ec768e4b5998..bd5c613f8cf794df5dfeb7517ed4350f9b3b6099 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -413,6 +413,9 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) +#ifdef PADDLE_WITH_DISTRIBUTE + .def("complete", &Executor::Complete) +#endif .def("run", (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & Executor::Run); @@ -509,16 +512,24 @@ All parameter, weight, gradient are variables in Paddle. self.num_threads_ = num_threads; }) .def_property( - "use_event", - [](const ExecutionStrategy &self) { return self.use_event_; }, - [](ExecutionStrategy &self, bool use_event) { - self.use_event_ = use_event; + "use_cuda", + [](const ExecutionStrategy &self) { return self.use_cuda_; }, + [](ExecutionStrategy &self, bool use_cuda) { + self.use_cuda_ = use_cuda; }) .def_property( "allow_op_delay", [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, [](ExecutionStrategy &self, bool allow_op_delay) { self.allow_op_delay_ = allow_op_delay; + }) + .def_property( + "num_iteration_per_drop_scope", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_drop_scope_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { + self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; }); py::class_ build_strategy(pe, "BuildStrategy"); @@ -545,6 +556,12 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { self.gradient_scale_ = strategy; + }) + .def_property( + "debug_graphviz_path", + [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, + [](BuildStrategy &self, const std::string &path) { + self.debug_graphviz_path_ = path; }); pe.def(py::init &, diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc index 82d9aa601cf450b8f90573d6c582bb12ced7a48a..6c65d9160c059ac143ee258b2bdaed5915a1dca1 100644 --- a/paddle/fluid/recordio/chunk.cc +++ b/paddle/fluid/recordio/chunk.cc @@ -119,40 +119,56 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const { } bool Chunk::Parse(std::istream& sin) { - Header hdr; - bool ok = hdr.Parse(sin); + ChunkParser parser(sin); + if (!parser.Init()) { + return false; + } + Clear(); + while (parser.HasNext()) { + Add(parser.Next()); + } + return true; +} + +ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {} +bool ChunkParser::Init() { + pos_ = 0; + bool ok = header_.Parse(in_); if (!ok) { return ok; } - auto beg_pos = sin.tellg(); - uint32_t crc = Crc32Stream(sin, hdr.CompressSize()); - PADDLE_ENFORCE_EQ(hdr.Checksum(), crc); - Clear(); - sin.seekg(beg_pos, sin.beg); - std::unique_ptr compressed_stream; - switch (hdr.CompressType()) { + auto beg_pos = in_.tellg(); + uint32_t crc = Crc32Stream(in_, header_.CompressSize()); + PADDLE_ENFORCE_EQ(header_.Checksum(), crc); + in_.seekg(beg_pos, in_.beg); + + switch (header_.CompressType()) { case Compressor::kNoCompress: break; case Compressor::kSnappy: - compressed_stream.reset(new snappy::iSnappyStream(sin)); + compressed_stream_.reset(new snappy::iSnappyStream(in_)); break; default: PADDLE_THROW("Not implemented"); } + return true; +} - std::istream& stream = compressed_stream ? *compressed_stream : sin; +bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); } - for (uint32_t i = 0; i < hdr.NumRecords(); ++i) { - uint32_t rec_len; - stream.read(reinterpret_cast(&rec_len), sizeof(uint32_t)); - std::string buf; - buf.resize(rec_len); - stream.read(&buf[0], rec_len); - PADDLE_ENFORCE_EQ(rec_len, stream.gcount()); - Add(buf); +std::string ChunkParser::Next() { + if (!HasNext()) { + return ""; } - return true; + ++pos_; + std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_; + uint32_t rec_len; + stream.read(reinterpret_cast(&rec_len), sizeof(uint32_t)); + std::string buf; + buf.resize(rec_len); + stream.read(&buf[0], rec_len); + PADDLE_ENFORCE_EQ(rec_len, stream.gcount()); + return buf; } - } // namespace recordio } // namespace paddle diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h index 71a1556a33bfa5c937d6a799d2818cd5a5ef2094..cfb954a591679c2d2c4f42ecd99ca0c8bd1084cf 100644 --- a/paddle/fluid/recordio/chunk.h +++ b/paddle/fluid/recordio/chunk.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include @@ -53,9 +54,20 @@ class Chunk { DISABLE_COPY_AND_ASSIGN(Chunk); }; -size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out); +class ChunkParser { + public: + explicit ChunkParser(std::istream& sin); + + bool Init(); + std::string Next(); + bool HasNext() const; -void DeflateData(const char* in, size_t in_length, Compressor ct, char* out); + private: + Header header_; + uint32_t pos_{0}; + std::istream& in_; + std::unique_ptr compressed_stream_; +}; } // namespace recordio } // namespace paddle diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc index 88b4d4001bc1b6dc935a9aabc2db5edfb55a60e4..06a13e6c5b6ea76456e231e3f7b1eb33492b16ea 100644 --- a/paddle/fluid/recordio/scanner.cc +++ b/paddle/fluid/recordio/scanner.cc @@ -22,35 +22,33 @@ namespace paddle { namespace recordio { Scanner::Scanner(std::unique_ptr &&stream) - : stream_(std::move(stream)) { + : stream_(std::move(stream)), parser_(*stream_) { Reset(); } -Scanner::Scanner(const std::string &filename) { - stream_.reset(new std::ifstream(filename)); +Scanner::Scanner(const std::string &filename) + : stream_(new std::ifstream(filename)), parser_(*stream_) { Reset(); } void Scanner::Reset() { stream_->clear(); stream_->seekg(0, std::ios::beg); - ParseNextChunk(); + parser_.Init(); } std::string Scanner::Next() { - PADDLE_ENFORCE(!eof_, "StopIteration"); - auto rec = cur_chunk_.Record(offset_++); - if (offset_ == cur_chunk_.NumRecords()) { - ParseNextChunk(); + if (stream_->eof()) { + return ""; } - return rec; -} -void Scanner::ParseNextChunk() { - eof_ = !cur_chunk_.Parse(*stream_); - offset_ = 0; + auto res = parser_.Next(); + if (!parser_.HasNext() && HasNext()) { + parser_.Init(); + } + return res; } -bool Scanner::HasNext() const { return !eof_; } +bool Scanner::HasNext() const { return !stream_->eof(); } } // namespace recordio } // namespace paddle diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/recordio/scanner.h index 34f1b0c78d6b5af6072a993579e1866d38c6d009..0d885dd87a2f819ba1d9f76259196f6cfff0b2a0 100644 --- a/paddle/fluid/recordio/scanner.h +++ b/paddle/fluid/recordio/scanner.h @@ -37,11 +37,7 @@ class Scanner { private: std::unique_ptr stream_; - Chunk cur_chunk_; - size_t offset_; - bool eof_; - - void ParseNextChunk(); + ChunkParser parser_; }; } // namespace recordio } // namespace paddle diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f3d8b1a39e849d5f5a9e79cf33252b60170ced81..854e4baa3987f61353038c7b26acf43943c89636 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef MATHFUNCTIONS_H_ -#define MATHFUNCTIONS_H_ +#pragma once #ifdef PADDLE_WITH_MKLML #include @@ -21,7 +20,7 @@ limitations under the License. */ #include #endif -#if defined(PADDLE_USE_VECLIB) +#ifdef PADDLE_USE_VECLIB extern "C" { #include #include @@ -30,8 +29,10 @@ extern "C" { #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { @@ -126,5 +127,3 @@ template void vTanh(const int n, const T* a, T* r); } // namespace paddle - -#endif // MATHFUNCTIONS_H_ diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8eeea1805d8610f6f27f422337f3526688b73de3..e8b305326702cf04b752bb2eb413f848daa5ec7b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -132,7 +132,8 @@ EOF -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ + -DWITH_ANAKIN=ON } function abort(){ @@ -145,19 +146,17 @@ function check_style() { trap 'abort' 0 set -e - # install glide - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" + if [ -x "$(command -v gimme)" ]; then + eval "$(GIMME_GO_VERSION=1.8.3 gimme)" + fi # set up go environment for running gometalinter mkdir -p $GOPATH/src/github.com/PaddlePaddle/ ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle - cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd - + mkdir -p ./build/go + cp go/glide.* build/go + cd build/go; glide install; cd - - go get github.com/alecthomas/gometalinter - gometalinter --install - - cd ${PADDLE_ROOT} export PATH=/usr/bin:$PATH pre-commit install clang-format --version @@ -183,6 +182,7 @@ function build() { ============================================ EOF make clean + make -j `nproc` make install -j `nproc` } @@ -449,7 +449,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \ + apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 586ec48477f085a14d2f15b265a95d596705694f..507479c8622c8d33722e08bba018ad1ba5452e15 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -30,7 +30,7 @@ int main(int argc, char** argv) { new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader. diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index f082e33be3357fbe405ab1a1ef5e0e601108a363..527044b415533cc640e3cfc5837c08ab0f8b74b1 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -119,7 +119,8 @@ def reader_creator(data_file, yield sample, int(label) - 1 if use_xmap: - return xmap_readers(mapper, reader, cpu_count(), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 859605d005328c030980a49a349742772de1cb6d..bd985ad733aa8eece2f8374d033f452a0175a011 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -26,6 +26,7 @@ from trainer import BeginEpochEvent from trainer import EndEpochEvent from trainer import BeginStepEvent from trainer import EndStepEvent +from trainer import CheckpointConfig import inferencer from inferencer import Inferencer @@ -44,8 +45,8 @@ import transpiler from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace -from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \ - InferenceTranspiler, memory_optimize, release_memory +from transpiler import DistributeTranspiler, InferenceTranspiler, \ + memory_optimize, release_memory from concurrency import (Go, make_channel, channel_send, channel_recv, channel_close, Select) from lod_tensor import create_lod_tensor, create_random_int_lodtensor @@ -116,11 +117,11 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope' + 'eager_delete_scope', 'use_mkldnn' ] if core.is_compiled_with_cuda(): read_env_flags += [ - 'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune' + 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7940dabcfb03cc9eb46f678365685a6e99bcceec..e2013137b14f73bb0fcfb57b4bdc35fcc043bdc0 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -15,6 +15,7 @@ from __future__ import print_function import core import numpy +import os import six.moves as six import multiprocessing @@ -150,7 +151,9 @@ class DataFeeder(object): elif isinstance(self.place, core.CUDAPlace): return core.get_cuda_device_count() else: - return multiprocessing.cpu_count() + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + return cpu_num def decorate_reader(self, reader, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 93aa5f908ec929a33089a62caa2186ba9e57fffe..33d8f709412b25d29c6618272500dd7b953d6645 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list): return var.desc.name() elif isinstance(var, str): return var + elif isinstance(var, basestring): + return str(var) else: raise TypeError(str(var) + " should be Variable or str") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 33b5caa0eab0ec192eb4a3b63cf82a672c58d2cb..f6438c82ac207d0e38d8be5e9d6252b28e72826e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT64 elif dtype == np.bool: return core.VarDesc.VarType.BOOL + elif dtype == np.uint16: + return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 else: @@ -361,6 +363,13 @@ class OpProtoHolder(object): raise ValueError("Operator \"%s\" has not been registered." % type) return self.op_proto_map[type] + @staticmethod + def generated_op_attr_names(): + return { + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.kOpRoleVarAttrName() + } + class Operator(object): """ @@ -368,6 +377,13 @@ class Operator(object): Block. Users can use the build in instructions to describe their neural network. """ + OP_WITHOUT_KERNEL_SET = { + 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'ncclInit', 'channel_create', 'channel_close', 'channel_send', + 'channel_recv', 'select', 'gen_nccl_id' + } def __init__(self, block, @@ -504,17 +520,13 @@ class Operator(object): else: self.desc.set_attr(attr_name, self.attrs[attr_name]) self.desc.check_attrs() - no_kernel_op_set = { - 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', - 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'gen_nccl_id' - } - if type not in no_kernel_op_set: + if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + def has_kernel(self, op_type): + return op_type not in self.OP_WITHOUT_KERNEL_SET + def to_string(self, throw_on_error): """ To debug string. @@ -742,7 +754,9 @@ class Block(object): def var(self, name): if not isinstance(name, basestring): - raise TypeError() + raise TypeError( + "var require string as parameter, but get %s instead." % + (type(name))) v = self.vars.get(name, None) if v is None: raise ValueError("var %s not in this block" % name) diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 9f242cf29a56573349f192307a68e135a409a4be..6baac00905713594acd59bb3819038576fab0674 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -56,6 +56,8 @@ class Inferencer(object): else: self.exe = executor.Executor(self.place) + self.inference_program = self.inference_program.clone(for_test=True) + def infer(self, inputs, return_numpy=True): """ :param inputs: a map of {"input_name": input_var} that will be feed into the inference program diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8e58e5eb794e1bb507ab05394a1f7b57a1d2ed42..6323c9899e0080b436a52f852c647466b8f94bc1 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -24,7 +24,8 @@ __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', - 'clean_checkpoint' + 'clean_checkpoint', 'load_persist_vars_without_grad', + 'save_persist_vars_without_grad', 'get_latest_checkpoint_serial' ] @@ -457,95 +458,161 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +TRAINER_PREFIX = "trainer" CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, - checkpoint_dir=None, - max_num_checkpoints=3, - save_interval_secs=600, - main_program=None): + checkpoint_dir, + trainer_id, + trainer_args=None, + main_program=None, + max_num_checkpoints=3): """ Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, The interval between two saved checkpoints must greater than save_interval_secs. - :param executor - :param checkpoint_dir - :param max_num_checkpoints - :param save_interval_secs - :param main_program + :param executor executor for save the value + :param checkpoint_dir the checkpoint directory + :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief + :param main_program will save all variables in program + :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("'checkpoint_dir' should not be None") + + if trainer_args: + assert isinstance(trainer_args, dict) if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_lastest_checkpoint_dir(checkpoint_dir) - if serial >= 0 and not _interval_secs_exceed( - _get_serial_dir(serial, checkpoint_dir), save_interval_secs): - return + serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) - serial += 1 - cur_dir = _get_serial_dir(serial, checkpoint_dir) + save_trainer_args(cur_dir, trainer_id, trainer_args) - save_vars( - executor, - dirname=cur_dir, - main_program=main_program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - _write_success(cur_dir) - _lru_delete(checkpoint_dir, max_num_checkpoints) + if trainer_id == 0: + save_persist_vars_without_grad(executor, cur_dir, main_program) + + _scroll_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, checkpoint_dir=None, main_program=None): +def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. - :param executor - :param checkpoint_dir - :param main_program + :param executor executor for load the value + :param checkpoint_dir the checkpoint directory + :param serial the serial folder in checkpoint directory will be load + :param main_program will load all variables in program """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("'checkpoint_dir' should not be None") - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + if serial is None or serial < 0: + raise ValueError("'serial' should not be None or <0 ") - if serial < 0: - return + if main_program is None: + raise ValueError('main_program should not be None.') - cur_dir = _get_serial_dir(serial, checkpoint_dir) - - load_vars( - executor, - dirname=cur_dir, - main_program=main_program, - predicate=_is_checkpoint_var, - filename=None) + cur_dir = _get_serial_dir(checkpoint_dir, serial) + load_persist_vars_without_grad(executor, cur_dir, main_program, True) def clean_checkpoint(checkpoint_dir, delete_dir=False): """ clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. delete_dir only works when the directory is empty, otherwise, OSError is raised. + + :param checkpoint_dir + :param delete_dir """ + if checkpoint_dir is None: - checkpoint_dir = os.getcwd() - _lru_delete(checkpoint_dir, max_num_checkpoints=0) + raise ValueError("'checkpoint_dir' should not be None") + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): os.rmdir(checkpoint_dir) -def _get_serial_dir(serial, checkpoint_dir): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - return os.path.join(checkpoint_dir, serial_folder) +def load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): + """ + load_persist_vars_without_grad will load variables from a directory by an executor, + the variable named end with "@GRAD" will not be loaded. + + :param executor executor for load the value + :param dirname the checkpoint directory + :param program will load all variables in program + :param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__ + """ + + if has_model_dir: + dirname = _get_model_dir(dirname) + + load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def save_persist_vars_without_grad(executor, dirname, program): + """ + save_persist_vars_without_grad will save variables to a directory by an executor, + the variable named end with "@GRAD" will not be saved. + + :param executor executor for load the value + :param dirname the checkpoint directory + :param program will load all variables in program + """ + cur_dir = _get_model_dir(dirname) + save_vars( + executor, + dirname=cur_dir, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + + +def save_trainer_args(dirname, trainer_id, trainer_args): + assert isinstance(trainer_args, dict) + + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in trainer_args.iteritems(): + args_file = os.path.join(cur_dir, name) + with open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + assert isinstance(trainer_args, list) + + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values def _is_checkpoint_var(var): @@ -559,36 +626,74 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False - if var.name.endswith("@GRAD"): + # .block is named for distribute train variables, checkpoint will not save it. + if ".block" in var.name: return False return var.persistable -def _interval_secs_exceed(dirname, save_interval_secs): - dir_time = os.path.getmtime(dirname) - if save_interval_secs > (time.time() - dir_time): - return False - return True +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + serial_dir = os.path.join(dirname, serial_folder) + + if not os.path.isdir(serial_dir): + os.makedirs(serial_dir) + + return serial_dir + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) -def _lru_delete(dirname, max_num_checkpoints=3): + if not os.path.isdir(model_dir): + os.makedirs(model_dir) + + return model_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + + if not os.path.isdir(trainer_dir): + os.makedirs(trainer_dir) + + return trainer_dir + + +def _scroll_delete(dirname, max_num_checkpoints=3): dirs = os.listdir(dirname) - serials = [] + serial_map = {} for serial in dirs: - try: - serials.append(int(serial)) - except ValueError: - continue + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial - if len(serials) <= max_num_checkpoints: + if len(serial_map.keys()) <= max_num_checkpoints: return + serials = serial_map.keys() serials.sort(reverse=True) serials = serials[max_num_checkpoints:] for serial in serials: - cur_dir = os.path.join(dirname, str(serial)) + cur_dir = _get_serial_dir(dirname, serial) shutil.rmtree(cur_dir) @@ -604,33 +709,30 @@ def _write_success(dirname): f.write(now) -def _get_lastest_checkpoint_dir(checkpoint_dir): +def get_latest_checkpoint_serial(checkpoint_dir): """ get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory :param checkpoint_dir """ - if not checkpoint_dir.strip(): + if not checkpoint_dir: return -1 def has_success(checkpoint_dir, cur_dir): """ is _SUCCESS in this dir """ - _, serial = cur_dir.split(CHECKPOINT_SEPARATOR) - - try: - int(serial) - except ValueError: - return -1 - if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + serial = _get_dir_serial(cur_dir) + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): return -1 success_path = os.path.join( - _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): - return int(serial) + return serial if not os.path.isdir(checkpoint_dir): return -1 diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8758ac9f94ab91b5be5fc70917c64db38997d1c1..9de88e2c3205ace74beff43df7ae8956897d965a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name from control_flow import BlockGuard from ..layer_helper import LayerHelper from ..executor import global_scope +from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', - 'random_data_generator', 'Preprocessor' + 'random_data_generator', 'Preprocessor', 'load' ] @@ -434,7 +435,7 @@ def open_files(filenames, shapes, lod_levels, dtypes, - thread_num, + thread_num=1, buffer_size=None, pass_num=1, for_parallel=True): @@ -662,3 +663,29 @@ class Preprocessor(object): "sink_var_names": self.sink_var_names }) return monkey_patch_reader_methods(self.reader) + + +@templatedoc() +def load(out, file_path, load_as_fp16=None): + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32') + >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin") + + Args: + out(${out_type}): ${out_comment}. + + file_path(${file_path_type}): ${file_path_comment}. + + load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}. + + Returns: + None + """ + helper = LayerHelper("load", **locals()) + attrs = {"file_path": file_path} + if load_as_fp16 is not None: + attrs['load_as_fp16'] = load_as_fp16 + helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 295d1b7190ec39bcc6efdf72aebede14a99807aa..cb60a3aec9a5a69f1eed281eb017384a621c66a8 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -15,16 +15,13 @@ import re import cStringIO import functools import warnings +import string from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper -__all__ = [ - 'deprecated', - 'generate_layer_fn', - 'autodoc', -] +__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc'] def _convert_(name): @@ -43,6 +40,10 @@ def _convert_(name): return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() +def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + def _generate_doc_string_(op_proto): """ Generate docstring by OpProto @@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto): str: the document string """ - def _type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - if not isinstance(op_proto, framework_pb2.OpProto): raise TypeError("OpProto should be `framework_pb2.OpProto`") @@ -75,7 +73,11 @@ def _generate_doc_string_(op_proto): buf.write(str(each_input.dispensable)) buf.write('\n') + skip_attrs = OpProtoHolder.generated_op_attr_names() + for each_attr in op_proto.attrs: + if each_attr.name in skip_attrs: + continue buf.write(' ') buf.write(each_attr.name) buf.write(' (') @@ -220,3 +222,67 @@ def autodoc(comment=""): return func return __impl__ + + +_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$") + + +def templatedoc(op_type=None): + """ + Decorator of layer function. It will use the docstring from the layer + function as the template. The template arguments are: + + * ${comment}: The operator comment written in CPP. + * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, + and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. + * ${{name}_type}: The type of ${name}. + + Returns: + Decorated function. + """ + + def trim_ending_dot(msg): + return msg.rstrip('.') + + def escape_inline_math(msg): + return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg) + + def __impl__(func): + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) + tmpl = string.Template(func.__doc__) + + comment_lines = op_proto.comment.split("\n") + comment = "" + for line in comment_lines: + line = line.strip() + if len(line) != 0: + comment += escape_inline_math(line) + comment += " " + elif len(comment) != 0: + comment += "\n \n " + + args = {"comment": trim_ending_dot(comment)} + for each_input in op_proto.inputs: + input_name = _convert_(each_input.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) + args["{0}_type".format(input_name)] = "Variable" + for each_attr in op_proto.attrs: + input_name = _convert_(each_attr.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) + args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) + + for each_opt in op_proto.outputs: + output_name = _convert_(each_opt.name) + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) + args["{0}_type".format(output_name)] = "Variable" + func.__doc__ = tmpl.substitute(args) + return func + + return __impl__ diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index d13c54daa5a985e2e1bf9357630fe29d24a17bb4..716cc7824eff0c56cc55a055310fa8b1913ac5e6 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. +""" import control_flow import nn @@ -22,14 +30,6 @@ __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'polynomial_decay', 'piecewise_decay', 'noam_decay' ] -""" -When training a model, it's often useful to decay the -learning rate during training process, this is called -learning_rate_decay. There are many strategies to do -this, this module will provide some classical method. -User can also implement their own learning_rate_decay -strategy according to this module. -""" def _decay_step_counter(begin=0): @@ -41,18 +41,20 @@ def _decay_step_counter(begin=0): def noam_decay(d_model, warmup_steps): - """Apply decay to learning rate. - ```python - lr_value = np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps - ]) - ``` + """ + Noam decay method. The numpy implementation of noam decay as follows. + + >>> import numpy as np + >>> lr_value = np.power(d_model, -0.5) * np.min([ + >>> np.power(current_steps, -0.5), + >>> np.power(warmup_steps, -1.5) * current_steps]) + + Please reference `attention is all you need + `_. Args: d_model(Variable): The dimensionality of input and output of model. - Reference: attention is all you need - https://arxiv.org/pdf/1706.03762.pdf + warmup_steps(Variable): A super parameter. Returns: diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py index cab2eb55510542bdd4dd7eca7667601697759181..a1c64ce2771526cbd0baa944f97d01e7878b3ac1 100644 --- a/python/paddle/fluid/layers/metric.py +++ b/python/paddle/fluid/layers/metric.py @@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200): topk_indices = helper.create_tmp_variable(dtype="int64") topk_out, topk_indices = nn.topk(input, k=k) auc_out = helper.create_tmp_variable(dtype="float32") - if correct is None: - correct = helper.create_tmp_variable(dtype="int64") - if total is None: - total = helper.create_tmp_variable(dtype="int64") helper.append_op( type="accuracy", inputs={ diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 60f8cbbfa714e8500606fdf68b7a23e1ffb9d37a..98f169e8f0881fbba6aecb45b43a52c8fd51132d 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -71,7 +71,10 @@ __all__ = [ 'cumsum', 'scatter', 'sum', + 'slice', + 'polygon_box_transform', 'shape', + 'maxout', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index be34cc81a5d5ca0e781e5984b6c3eeaa4e25eb90..62b01d595a812ee8fc094e40b6dfb5c3f56cd012 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from layer_function_generator import templatedoc import numpy __all__ = [ @@ -30,6 +31,8 @@ __all__ = [ 'assign', 'fill_constant_batch_size_like', 'fill_constant', + 'argmin', + 'argmax', 'ones', 'zeros', ] @@ -266,6 +269,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): return out +@templatedoc() def fill_constant_batch_size_like(input, shape, dtype, @@ -273,30 +277,28 @@ def fill_constant_batch_size_like(input, input_dim_idx=0, output_dim_idx=0): """ - **fill_constant_batch_size_like** - - This function creates a tensor of specified *shape*, *dtype* and batch size, - and initializes this with a constant supplied in *value*. The batch size is - obtained from the `input` tensor. + ${comment} It also sets *stop_gradient* to True. + >>> data = fluid.layers.fill_constant_batch_size_like( + >>> input=like, shape=[1], value=0, dtype='int64') + Args: - input(Variable): Tensor whose dimensions will be used to get batch size - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor - value(float): Constant value to initialize the output tensor - input_dim_idx(int): Index of input's batch size dimension - output_dim_idx(int): Index of output's batch size dimension + input(${input_type}): ${input_comment}. - Returns: - Variable: The tensor variable storing the output + shape(${shape_type}): ${shape_comment}. - Examples: - .. code-block:: python + dtype(${dtype_type}): ${dtype_comment}. + + value(${value_type}): ${value_comment}. - data = fluid.layers.fill_constant_batch_size_like( - input=like, shape=[1], value=0, dtype='int64') + input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}. + + output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. + + Returns: + ${out_comment}. """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -315,6 +317,68 @@ def fill_constant_batch_size_like(input, return out +def argmin(x, axis=0): + """ + **argmin** + + This function computes the indices of the min elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the min elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmin(x=in, axis=0) + out = fluid.layers.argmin(x=in, axis=-1) + """ + helper = LayerHelper("arg_min", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_min', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + +def argmax(x, axis=0): + """ + **argmax** + + This function computes the indices of the max elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the max elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmax(x=in, axis=0) + out = fluid.layers.argmax(x=in, axis=-1) + """ + helper = LayerHelper("arg_max", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_max', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def ones(shape, dtype, force_cpu=False): """ **ones** @@ -363,6 +427,40 @@ def zeros(shape, dtype, force_cpu=False): return fill_constant(value=0.0, **locals()) +def reverse(x, axis): + """ + **reverse** + + This function reverse the input 'x' along given axises. + + Args: + x(Vairbale): the input to be reversed. + axis(int|tuple|list): Axis that along which order of elements + is reversed. If it is a tuple or a list, reversing + will be apply on each axis in the tuple or list. + + Returns: + Variable: The reversed tensor. + + Examples: + .. code-block:: python + + out = fluid.layers.reverse(x=in, axis=0) + # or: + out = fluid.layers.reverse(x=in, axis=[0,1]) + """ + if isinstance(axis, int): + axis = [axis] + helper = LayerHelper("reverse", **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='reverse', + inputs={'Input': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def save(x, file_path, overwrite=True): """ Saves a variable as a file. @@ -403,22 +501,6 @@ def save_combine(x, file_path, overwrite=True): "overwrite": overwrite}) -def load(out, file_path): - """ - Loads a variable from a given file. - - Args: - out(variable): The variable to be read from the disk file. - file_path(str): The path of the disk file. - """ - helper = LayerHelper("load", **locals()) - helper.append_op( - type="load", - inputs={}, - output={"Out": out}, - args={"file_path": file_path}) - - def load_combine(out, file_path): """ Loads a list of vairables from a single file. diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 3117dfe00c7a3df1035c439dc31b81e67781d0cc..0fdc9a035292b3390cece6c5821a60b1b281e54d 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -18,6 +18,7 @@ import framework import executor import warnings import sys +import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] @@ -101,7 +102,9 @@ class ParallelExecutor(object): p.set_place(self._act_places[-1]) self._places.append(p) else: - for i in xrange(multiprocessing.cpu_count()): + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in xrange(cpu_num): p = core.Place() self._act_places.append(core.CPUPlace()) p.set_place(self._act_places[-1]) @@ -110,19 +113,17 @@ class ParallelExecutor(object): if exec_strategy is None: exec_strategy = ExecutionStrategy() - if use_cuda: - exec_strategy.use_event = True - else: - exec_strategy.use_event = False + exec_strategy.use_cuda = use_cuda if exec_strategy.num_threads == 0: if use_cuda: # Experiments on se-resnext shows that too many threads hurt # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 2 + exec_strategy.num_threads = len(self._places) * 4 else: - exec_strategy.num_threads = min( - len(self._places) * 2, multiprocessing.cpu_count()) + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num if build_strategy is None: build_strategy = BuildStrategy() diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 5accaacd5361165d30b92c71ae4fd62e23e44e07..8d48e9abef0fb9861284c6302b30efb0e3994989 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import core import contextlib - -__all__ = ['convert_reader_to_recordio_file'] +__all__ = [ + 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' +] @contextlib.contextmanager @@ -46,3 +48,36 @@ def convert_reader_to_recordio_file( writer.complete_append_tensor() counter += 1 return counter + + +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with create_recordio_writer(filename, compressor, + max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + return counter diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index de3906fc6a005181b0ab04a846eb2e7ce14004c2..ad28c9eff560507e5b326451159be3949353f58f 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -38,7 +38,7 @@ def inference_program(): return y_predict -def linear(): +def train_program(): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = inference_program() @@ -48,13 +48,15 @@ def linear(): return avg_loss +def optimizer_func(): + return fluid.optimizer.SGD(learning_rate=0.001) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, - place=place, - optimizer=fluid.optimizer.SGD(learning_rate=0.001)) + train_func=train_program, place=place, optimizer_func=optimizer_func) def event_handler(event): if isinstance(event, fluid.EndStepEvent): @@ -102,7 +104,7 @@ def main(use_cuda): # Directory for saving the trained model params_dirname = "fit_a_line.inference.model" - train(use_cuda, linear, params_dirname) + train(use_cuda, train_program, params_dirname) infer(use_cuda, inference_program, params_dirname) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 63dc1b6ce30974ede22a3f7772b76bf207bbae39..8e222d26907e8fe697b596a67e62cc9df84afe0e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -85,6 +85,10 @@ def train_network(): return [avg_cost, accuracy] +def optimizer_func(): + return fluid.optimizer.Adam(learning_rate=0.001) + + def train(use_cuda, train_program, params_dirname): BATCH_SIZE = 128 EPOCH_NUM = 1 @@ -92,10 +96,11 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): if isinstance(event, fluid.EndStepEvent): @@ -111,9 +116,7 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, - optimizer=fluid.optimizer.Adam(learning_rate=0.001), - place=place) + train_func=train_program, optimizer_func=optimizer_func, place=place) trainer.train( reader=train_reader, diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index 0bf8f265a1c1b11364ecfa11061af183ce20d51e..dbc7bc06c93157f271c79e85b6925468e861e57f 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -64,15 +64,20 @@ def train_network(): return [avg_cost, accuracy] +def optimizer_func(): + return fluid.optimizer.Adam(learning_rate=0.001) + + def train(use_cuda, train_program, params_dirname): BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): if isinstance(event, fluid.EndStepEvent): @@ -88,9 +93,7 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, - place=place, - optimizer=fluid.optimizer.Adam(learning_rate=0.001)) + train_func=train_program, place=place, optimizer_func=optimizer_func) trainer.train( reader=train_reader, diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index 8cce398ff33695dc15ae6fb01a887194596af001..0ccb3a39e02ea0c24bdfe01c5eba73b92da88a04 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -141,12 +141,16 @@ def train_program(): return [avg_cost] +def optimize_func(): + return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True)) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.SGD(learning_rate=0.01) trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer=optimizer) + train_func=train_program, place=place, optimizer_func=optimize_func) feed_order = [ 'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', @@ -245,7 +249,7 @@ def infer(use_cuda, inference_program, params_dirname): }, return_numpy=False) - print("infer results: ", np.array(results[0])) + print("infer results: ", np.array(results[0]).shape) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index d4b723d3e6b619709ab3dc76a32ae87f1cdec274..c4b37df3a09f93fe965ae28ce783f06f5018020d 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -158,6 +158,13 @@ def train_program(is_sparse): return avg_cost +def optimizer_func(): + return fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)) + + def train(use_cuda, is_sparse, is_local=True): EPOCH_NUM = 1 @@ -182,11 +189,8 @@ def train(use_cuda, is_sparse, is_local=True): trainer = fluid.Trainer( train_func=partial(train_program, is_sparse), - optimizer=fluid.optimizer.Adagrad( - learning_rate=1e-4, - regularization=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=0.1)), - place=place) + place=place, + optimizer_func=optimizer_func) trainer.train( reader=train_reader, diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 03439cbd37671b4727879bf3d0793f016f55247a..9a09db25dc0e2c71772aa06e6d0cf993321612e4 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -57,14 +57,17 @@ def train_program(): return [avg_cost, acc] +def optimizer_func(): + return fluid.optimizer.Adam(learning_rate=0.001) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adam(learning_rate=0.001) trainer = fluid.Trainer( train_func=train_program, place=place, - optimizer=optimizer, + optimizer_func=optimizer_func, parallel=True) def event_handler(event): diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index 89bbd21bea7d64a8dd6fc32829b6addb680da62e..b2b544e791d7ea35ff7d2c9a2dce7ce7f5680f38 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -44,12 +44,15 @@ def train_program(): return [avg_cost, acc] +def optimizer_func(): + return fluid.optimizer.Adam(learning_rate=0.001) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adam(learning_rate=0.001) trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer=optimizer) + train_func=train_program, place=place, optimizer_func=optimizer_func) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index dfc7325acf23176c05fe42761b9997b98d23372a..090c11ce1e79201f0d65d3540527791ab2191d4a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -155,12 +155,15 @@ def train_program(): return [avg_cost, scale_infer] +def optimizer_func(): + return fluid.optimizer.SGD(learning_rate=0.2) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.SGD(learning_rate=0.2) trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer=optimizer) + train_func=train_program, place=place, optimizer_func=optimizer_func) feed_order = [ 'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id', diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 11e9fd1bec1450f6753dbe38c7014090d6e136b6..9b61f7a00ce5e2a08c2105fb7f50e6868ef25df3 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -64,15 +64,18 @@ def train_program(word_dict): return [avg_cost, accuracy] +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) word_dict = paddle.dataset.imdb.word_dict() trainer = fluid.Trainer( train_func=partial(train_program, word_dict), place=place, - optimizer=optimizer) + optimizer_func=optimizer_func) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index 90757d54f99715163518ce5a094e6ba3a67efed3..aa7c567b4d66ba07c26d54436fb305011cfeccf2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -79,15 +79,18 @@ def train_program(word_dict): return [avg_cost, accuracy] +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) word_dict = paddle.dataset.imdb.word_dict() trainer = fluid.Trainer( train_func=partial(train_program, word_dict), place=place, - optimizer=optimizer) + optimizer_func=optimizer_func) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 52b7d4a83779d01936afb3d9d1e4864b05d55b5a..8c74be0f08855c20f5aa3ecd75622a51e94a0304 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -71,20 +71,25 @@ def train_program(word_dict): return [avg_cost, accuracy] +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) + + def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) word_dict = paddle.dataset.imdb.word_dict() trainer = fluid.Trainer( train_func=partial(train_program, word_dict), place=place, - optimizer=optimizer) + optimizer_func=optimizer_func) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): test_reader = paddle.batch( - paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + paddle.dataset.imdb.test(word_dict), + batch_size=BATCH_SIZE, + drop_last=False) avg_cost, acc = trainer.test( reader=test_reader, feed_order=['words', 'label']) @@ -110,7 +115,8 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=25000), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) trainer.train( num_epochs=1, diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index eeb8e67087334ea96aab9cdb6272e34e2eb99939..ba44f72d9b03c3a44560a8a30cba2253256314ef 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -80,6 +80,10 @@ def train_program(is_sparse): return avg_cost +def optimizer_func(): + return fluid.optimizer.SGD(learning_rate=0.001) + + def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) @@ -104,9 +108,7 @@ def train(use_cuda, train_program, params_dirname): sys.exit("got NaN loss, training failed.") trainer = fluid.Trainer( - train_func=train_program, - optimizer=fluid.optimizer.SGD(learning_rate=0.001), - place=place) + train_func=train_program, optimizer_func=optimizer_func, place=place) trainer.train( reader=train_reader, diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index bc8a1aafc82d62501cecfa71be0cc3851c75eae2..99d51ae0076178aca50e36c2c187257a8ba1cbf2 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -76,8 +76,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') - for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +93,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + fluid.layers.fc(input=input_tmp[0], size=hidden_dim), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim) ]) lstm = fluid.layers.dynamic_lstm( diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 8818cf96fa8f08036f9e23aae786f67b5614b2b9..be347cd5315668dde0454d7959dbf9bcfa465b5f 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -56,7 +56,7 @@ BATCH_SIZE = 200 # fix the order of training data train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE) + paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False) # train_reader = paddle.batch( # paddle.reader.shuffle( diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index a1ca6d981fafb401985d03e9f2d63d1cb41b21b5..fa696acdfa9058af14f0bd34ce1a2980db5aeafc 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -80,21 +80,6 @@ def encoder_decoder(): return rnn() -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - def main(): rnn_out = encoder_decoder() label = layers.data( @@ -122,18 +107,21 @@ def main(): exe.run(framework.default_startup_program()) + feed_order = [ + 'src_word_id', 'target_language_word', 'target_language_next_word' + ] + + feed_list = [ + fluid.default_main_program().global_block().var(var_name) + for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + batch_id = 0 for pass_id in xrange(10): for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - trg_word = to_lodtensor(map(lambda x: x[1], data), place) - trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) outs = exe.run(fluid.default_main_program(), - feed={ - 'src_word_id': word_data, - 'target_language_word': trg_word, - 'target_language_next_word': trg_word_next - }, + feed=feeder.feed(data), fetch_list=[avg_cost]) avg_cost_val = np.array(outs[0]) print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py similarity index 100% rename from python/paddle/fluid/tests/test_concurrency.py rename to python/paddle/fluid/tests/no_test_concurrency.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index fead95ffdab25c7ea96b7ef223efc0abf7eea3e3..21182393bd68db4a379fc3ecf83fc85d27ca9490 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -43,10 +43,12 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +# TODO(wuyi): this test hungs on CI, will add it back later +list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) -# tests that need to be done in fixed timeout -set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) +py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) +py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..e891ee932f1440001eb25b222f1f4613e97dfcb1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import time +import itertools + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class BenchmarkSuite(OpTest): + def timeit_function(self, callback, iters, *args, **kwargs): + assert iters != 0, "Iters should >= 1" + start = time.time() + for i in range(iters): + callback(*args, **kwargs) + elapse = time.time() - start + return elapse / iters + + def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol): + for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs, + fetch_list): + # the cpu version is baseline, expect gpu version keep same with cpu version. + expect = item_cpu_out + expect_t = np.array(item_cpu_out) + actual = item_gpu_out + actual_t = np.array(item_gpu_out) + var_name = variable if isinstance(variable, + basestring) else variable.name + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + var_name + ") has diff" + str(actual_t) + "\n" + + str(expect_t)) + self.assertListEqual(actual.lod(), + expect.lod(), + "Output (" + var_name + ") has different lod") + + def _get_input_names(self): + inputs = [] + for name, value in self.inputs.iteritems(): + if isinstance(value, list): + inputs.extend([sub_name for sub_name, _ in value]) + inputs.append(name) + return inputs + + def _get_output_names(self): + outputs = [] + for var_name, var in self.outputs.iteritems(): + if isinstance(var, list): + for sub_var_name, sub_var in var: + outputs.append(sub_var_name) + else: + outputs.append(var_name) + if len(outputs) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + outputs.append(str(out_name)) + return outputs + + def check_output_stability(self, atol=1e-8): + places = self._get_places() + if len(places) < 2: + return + cpu_outs, fetch_list = self._calc_output(places[0]) + gpu_outs, _ = self._calc_output(places[1]) + self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol) + + def timeit_output_with_place(self, place, iters): + return self.timeit_function(self.calc_output, iters, place) + + def timeit_output(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_output_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) + + def timeit_grad_with_place(self, place, iters=100): + inputs_to_check = self._get_input_names() + output_names = self._get_output_names() + return self.timeit_function( + self._get_gradient, + iters, + inputs_to_check, + place, + output_names, + no_grad_set=None) + + def timeit_grad(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_grad_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_grad_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..91a5f1bca4441d80489a02eb9283928e38321826 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +from benchmark import BenchmarkSuite +from op_test import OpTest + +# This is a demo op test case for operator benchmarking and high resolution number stability alignment. + + +class TestSumOp(BenchmarkSuite): + def setUp(self): + self.op_type = "sum" + self.customize_testcase() + self.customize_fetch_list() + + def customize_fetch_list(self): + """ + customize fetch list, configure the wanted variables. + >>> self.fetch_list = ["Out"] + """ + self.fetch_list = ["Out"] + # pass + + def customize_testcase(self): + # a test case + x0 = np.random.random((300, 400)).astype('float32') + x1 = np.random.random((300, 400)).astype('float32') + x2 = np.random.random((300, 400)).astype('float32') + + # NOTE: if the output is empty, then it will autofilled by benchmarkSuite. + # only the output dtype is used, the shape, lod and data is computed from input. + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + self.outputs = {"Out": x0 + x1 + x2} + + def test_check_output(self): + """ + compare the output with customized output. In this case, + you should set the correct output by hands. + >>> self.outputs = {"Out": x0 + x1 + x2} + """ + self.check_output(atol=1e-8) + + def test_output_stability(self): + # compare the cpu gpu output in high resolution. + self.check_output_stability() + + def test_timeit_output(self): + """ + perf the op, time cost will be averged in iters. + output example + >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818 + >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596 + """ + self.timeit_output(iters=100) + + def test_timeit_grad(self): + """ + perf the op gradient, time cost will be averged in iters. + output example + >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536 + >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653 + """ + self.timeit_grad(iters=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b611470fa1ff326df960c349b71006f52d586d8e..307caae4b0cf4869c1abb755215aa97795d47e15 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,13 +15,17 @@ import unittest import numpy as np import random +import time import itertools -import paddle.fluid.core as core import collections + +import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from paddle.fluid.framework import Program, OpProtoHolder +from paddle.fluid.framework import Program, OpProtoHolder, Variable +from testsuite import create_op, set_input, append_input_output, append_loss_ops def randomize_probability(batch_size, class_num, dtype='float32'): @@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'): return prob -def create_op(scope, op_type, inputs, outputs, attrs): - kwargs = dict() - - op_maker = core.op_proto_and_checker_maker - op_role_attr_name = op_maker.kOpRoleAttrName() - - if op_role_attr_name not in attrs: - attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) - - def __create_var__(name, var_name): - scope.var(var_name).get_tensor() - kwargs[name].append(var_name) - - for in_name, in_dup in Operator.get_op_inputs(op_type): - if in_name in inputs: - kwargs[in_name] = [] - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, _ = item[0], item[1] - __create_var__(in_name, sub_in_name) - else: - __create_var__(in_name, in_name) - - for out_name, out_dup in Operator.get_op_outputs(op_type): - if out_name in outputs: - kwargs[out_name] = [] - if out_dup: - sub_out = outputs[out_name] - for item in sub_out: - sub_out_name, _ = item[0], item[1] - __create_var__(out_name, sub_out_name) - else: - __create_var__(out_name, out_name) - - for attr_name in Operator.get_op_attr_names(op_type): - if attr_name in attrs: - kwargs[attr_name] = attrs[attr_name] - - return Operator(op_type, **kwargs) - - -def set_input(scope, op, inputs, place): - def __set_input__(var_name, var): - if isinstance(var, tuple) or isinstance(var, np.ndarray): - tensor = scope.find_var(var_name).get_tensor() - if isinstance(var, tuple): - tensor.set_lod(var[1]) - var = var[0] - tensor.set_dims(var.shape) - tensor.set(var, place) - elif isinstance(var, float): - scope.find_var(var_name).set_float(var) - elif isinstance(var, int): - scope.find_var(var_name).set_int(var) - - for in_name, in_dup in Operator.get_op_inputs(op.type()): - if in_name in inputs: - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, sub_in_val = item[0], item[1] - __set_input__(sub_in_name, sub_in_val) - else: - __set_input__(in_name, inputs[in_name]) - - def get_numeric_gradient(place, scope, op, @@ -173,54 +110,15 @@ def get_numeric_gradient(place, return gradient_flat.reshape(tensor_to_check.get_dims()) -def append_input_output(block, op_proto, np_list, is_input): - '''Insert VarDesc and generate Python variable instance''' - proto_list = op_proto.inputs if is_input else op_proto.outputs - - def create_var(block, name, np_list, var_proto): - if name not in np_list: - assert var_proto.intermediate, "{} not found".format(name) - shape = None - lod_level = None - else: - np_value = np_list[name] - if isinstance(np_value, tuple): - shape = list(np_value[0].shape) - lod_level = len(np_value[1]) - else: - shape = list(np_value.shape) - lod_level = 0 - return block.create_var( - dtype="float32", shape=shape, lod_level=lod_level, name=name) - - var_dict = {} - for var_proto in proto_list: - var_name = str(var_proto.name) - if is_input: - if (var_name not in np_list) and var_proto.dispensable: - continue - assert (var_name in np_list) or (var_proto.dispensable), \ - "Missing {} as input".format(var_name) - if var_proto.duplicable: - assert isinstance(np_list[var_name], list), \ - "Duplicable {} should be set as list".format(var_name) - var_list = [] - for (name, np_value) in np_list[var_name]: - var_list.append( - create_var(block, name, {name: np_value}, var_proto)) - var_dict[var_name] = var_list - else: - var_dict[var_name] = create_var(block, var_name, np_list, var_proto) - - return var_dict - - class OpTest(unittest.TestCase): @classmethod def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() + cls.call_once = False + cls.dtype = "float32" + cls.outputs = {} np.random.seed(123) random.seed(124) @@ -231,6 +129,31 @@ class OpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + def try_call_once(self, data_type): + if not self.call_once: + self.call_once = True + self.dtype = data_type + + def infer_dtype_from_inputs_outputs(self, inputs, outputs): + def infer_dtype(numpy_dict): + assert isinstance( + numpy_dict, + dict), "self.inputs, self.outputs must be numpy_dict" + for var_name, var_value in numpy_dict.iteritems(): + if isinstance(var_value, (np.ndarray, np.generic)): + self.try_call_once(var_value.dtype) + elif isinstance(var_value, (list, tuple)): + # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + if len(var_value) > 1 and isinstance(var_value[1], ( + np.ndarray, np.generic)): + instance = var_value[1] + self.try_call_once(instance[1].dtype) + else: + self.try_call_once("float32") + + infer_dtype(inputs) + infer_dtype(outputs) + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -254,18 +177,14 @@ class OpTest(unittest.TestCase): return feed_map - def calc_output(self, place): - outs, _ = self._calc_output(place) - return outs - - def _calc_output(self, place): + def _append_ops(self, block): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) - - program = Program() - block = program.global_block() - - inputs = append_input_output(block, op_proto, self.inputs, True) - outputs = append_input_output(block, op_proto, self.outputs, False) + "infer datatype from inputs and outputs for this test case" + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + inputs = append_input_output(block, op_proto, self.inputs, True, + self.dtype) + outputs = append_input_output(block, op_proto, self.outputs, False, + self.dtype) op = block.append_op( type=self.op_type, inputs=inputs, @@ -275,22 +194,68 @@ class OpTest(unittest.TestCase): op.desc.infer_var_type(block.desc) op.desc.infer_shape(block.desc) - fetch_list = [] - for var_name, var in outputs.iteritems(): - if var_name in self.outputs: + def _get_io_vars(self, block, numpy_inputs): + inputs = {} + for name, value in numpy_inputs.iteritems(): + if isinstance(value, list): + var_list = [ + block.var(sub_name) for sub_name, sub_value in value + ] + inputs[name] = var_list + else: + inputs[name] = block.var(name) + return inputs + + def _get_inputs(self, block): + return self._get_io_vars(block, self.inputs) + + def _get_outputs(self, block): + return self._get_io_vars(block, self.outputs) + + def calc_output(self, place): + outs, _ = self._calc_output(place) + return outs + + def _calc_output(self, place, parallel=False): + + program = Program() + block = program.global_block() + self._append_ops(block) + + inputs = self._get_inputs(block) + outputs = self._get_outputs(block) + feed_map = self.feed_var(inputs, place) + + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) + + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + if len(fetch_list) == 0: + for var_name, var in outputs.iteritems(): if isinstance(var, list): for v in var: fetch_list.append(v) else: fetch_list.append(var) - - feed_map = self.feed_var(inputs, place) - - exe = Executor(place) - outs = exe.run(program, - feed=feed_map, - fetch_list=fetch_list, - return_numpy=False) + # if the fetch_list still empty, fill the fetch_list by the operator output. + if len(fetch_list) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + fetch_list.append(str(out_name)) + # fetch_list = map(block.var, fetch_list) + if not isinstance(fetch_list[0], Variable): + fetch_list = map(block.var, fetch_list) + outs = executor.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) return outs, fetch_list def check_output_with_place(self, place, atol): @@ -346,17 +311,19 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) - def check_output(self, atol=1e-5): - places = [core.CPUPlace()] + def _get_places(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) + return places + + def check_output(self, atol=1e-5): + places = self._get_places() for place in places: self.check_output_with_place(place, atol) def check_output_customized(self, checker): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: outs = self.calc_output(place) outs = [np.array(out) for out in outs] @@ -389,9 +356,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, @@ -438,35 +403,6 @@ class OpTest(unittest.TestCase): max_relative_error, "Gradient Check On %s" % str(place)) - @staticmethod - def _create_var_descs_(block, var_dict): - # FIXME: Try unify with `append_input_output` - for param_name in var_dict: - var = var_dict[param_name] - if not isinstance(var, list) and not isinstance(var, tuple): - var = [(param_name, var, None)] - if not isinstance(var[0], list) and not isinstance(var[0], tuple): - var = [(param_name, var[0], var[1])] - - for i, item in enumerate(var): - if not isinstance(item[0], basestring): - item = [[param_name] + list(item)] - if len(item) == 2: - if isinstance(item[1], tuple): - var[i] = [item[0], item[1][0], item[1][1]] - else: - # only set var name and value, set lod to None - var[i] = list(item) + [None] - var_descs = [(block.create_var( - name=name, shape=each.shape, dtype=each.dtype), each, lod) - for name, each, lod in var] - - yield param_name, var_descs - - @staticmethod - def _merge_list(iterable): - return reduce(lambda a, b: list(a) + list(b), iterable, []) - @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): tensor = core.LoDTensor() @@ -497,83 +433,31 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input - def _get_gradient(self, input_to_check, place, output_names, no_grad_set): + def _get_gradient(self, + input_to_check, + place, + output_names, + no_grad_set, + parallel=False): prog = Program() block = prog.global_block() - inputs_with_np = { - key: value - for (key, value) in OpTest._create_var_descs_( - block, getattr(self, 'inputs', {})) - } - outputs_with_np = { - key: val - for (key, val) in OpTest._create_var_descs_( - block, getattr(self, 'outputs', {})) - } - inputs = { - k: [item[0] for item in inputs_with_np[k]] - for k in inputs_with_np - } - outputs = { - k: [item[0] for item in outputs_with_np[k]] - for k in outputs_with_np - } - - op = block.append_op( - type=self.op_type, - inputs=inputs, - outputs=outputs, - attrs=getattr(self, 'attrs', {})) - - # infer variable type and infer shape in compile-time - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - - mean_inputs = map(block.var, output_names) - - if len(mean_inputs) == 1: - loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) - op = block.append_op( - inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - else: - avg_sum = [] - for cur_loss in mean_inputs: - cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) - op = block.append_op( - inputs={"X": [cur_loss]}, - outputs={"Out": [cur_avg_loss]}, - type="mean") - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - avg_sum.append(cur_avg_loss) - - loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) - op_sum = block.append_op( - inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') - op_sum.desc.infer_var_type(block.desc) - op_sum.desc.infer_shape(block.desc) - - loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) - op_loss = block.append_op( - inputs={"X": loss_sum}, - outputs={"Out": loss}, - type='scale', - attrs={'scale': 1.0 / float(len(avg_sum))}) - op_loss.desc.infer_var_type(block.desc) - op_loss.desc.infer_shape(block.desc) - + self._append_ops(block) + loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) - feed_dict = { - item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place) - for p_name in inputs_with_np for item in inputs_with_np[p_name] - } + inputs = self._get_inputs(block) + feed_dict = self.feed_var(inputs, place) fetch_list = [g for p, g in param_grad_list] - executor = Executor(place) + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) return map(np.array, executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c9c3c648717814c28c39a401487925824e885946..829c5a1a5fd099543e9e98b9587d4f316a91b587 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import multiprocessing +import os import unittest import paddle.fluid as fluid import time @@ -23,6 +25,7 @@ __all__ = ['TestParallelExecutorBase'] class TestParallelExecutorBase(unittest.TestCase): def check_network_convergence(self, method, + use_cuda=True, memory_opt=True, iter=50, batch_size=None, @@ -53,7 +56,7 @@ class TestParallelExecutorBase(unittest.TestCase): adam.minimize(loss) if memory_opt: fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() @@ -64,7 +67,7 @@ class TestParallelExecutorBase(unittest.TestCase): if use_parallel_executor: exe = fluid.ParallelExecutor( - True, + use_cuda, loss_name=loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) @@ -72,7 +75,9 @@ class TestParallelExecutorBase(unittest.TestCase): exe = fluid.Executor(place=place) if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count() + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e04412f809cdd75d07d28a60f0c2f19041a684f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output() + + +class TestCase0(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestCase1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float64' + self.axis = 1 + + +class TestCase2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'int32' + self.axis = 0 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 56f5af91d8e58086c12fde6948229675569aa271..b4c48d85f2c564d877c0a29e64dd2944d2b26ea3 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -19,7 +19,8 @@ import math from op_test import OpTest -def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): +def box_coder(target_box, prior_box, prior_box_var, output_box, code_type, + box_normalized): prior_box_x = ( (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) prior_box_y = ( @@ -30,6 +31,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], prior_box_var.shape[1]) + if not box_normalized: + prior_box_height = prior_box_height + 1 + prior_box_width = prior_box_width + 1 if (code_type == "EncodeCenterSize"): target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( @@ -40,6 +44,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): target_box.shape[0], 1) target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( target_box.shape[0], 1) + if not box_normalized: + target_box_height = target_box_height + 1 + target_box_width = target_box_width + 1 output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ prior_box_var[:,:,0] @@ -64,9 +71,13 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): output_box[:, :, 1] = target_box_y - target_box_height / 2 output_box[:, :, 2] = target_box_x + target_box_width / 2 output_box[:, :, 3] = target_box_y + target_box_height / 2 + if not box_normalized: + output_box[:, :, 2] = output_box[:, :, 2] - 1 + output_box[:, :, 3] = output_box[:, :, 3] - 1 -def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): +def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type, + box_normalized): n = target_box.shape[0] m = prior_box.shape[0] output_box = np.zeros((n, m, 4), dtype=np.float32) @@ -74,11 +85,11 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): if (code_type == "EncodeCenterSize"): box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var, output_box[lod[i]:lod[i + 1], :, :], - code_type) + code_type, box_normalized) elif (code_type == "DecodeCenterSize"): box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box, prior_box_var, output_box[lod[i]:lod[i + 1], :, :], - code_type) + code_type, box_normalized) return output_box @@ -93,15 +104,45 @@ class TestBoxCoderOp(OpTest): prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((5, 10, 4)).astype('float32') code_type = "DecodeCenterSize" + box_normalized = False output_box = batch_box_coder(prior_box, prior_box_var, target_box, - lod[0], code_type) + lod[0], code_type, box_normalized) self.inputs = { 'PriorBox': prior_box, 'PriorBoxVar': prior_box_var, 'TargetBox': target_box, } - self.attrs = {'code_type': 'decode_center_size'} + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False + } + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithoutBoxVar(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 1, 2, 3, 4, 5]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.ones((10, 4)).astype('float32') + target_box = np.random.random((5, 10, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized) + + self.inputs = { + 'PriorBox': prior_box, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False + } self.outputs = {'OutputBox': output_box} @@ -116,15 +157,16 @@ class TestBoxCoderOpWithLoD(OpTest): prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((20, 4)).astype('float32') code_type = "EncodeCenterSize" + box_normalized = True output_box = batch_box_coder(prior_box, prior_box_var, target_box, - lod[0], code_type) + lod[0], code_type, box_normalized) self.inputs = { 'PriorBox': prior_box, 'PriorBoxVar': prior_box_var, 'TargetBox': (target_box, lod), } - self.attrs = {'code_type': 'encode_center_size'} + self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True} self.outputs = {'OutputBox': output_box} diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e22400a045ced16c46b0bf005155f621f249d263 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import unittest +import os +import tempfile + + +class TestCheckpoint(unittest.TestCase): + def setUp(self): + self.dirname = tempfile.mktemp() + self.max_num_checkpoints = 3 + self.epoch_interval = 1 + self.step_interval = 1 + self.trainer_id = 0 + self.chief = self.trainer_id == 0 + self.place = fluid.CPUPlace() + self.epoch_id = 100 + self.step_id = 20 + + def test_checkpoint(self): + self.save_checkpoint() + serial = fluid.io.get_latest_checkpoint_serial(self.dirname) + self.assertTrue(serial >= 0) + trainer_args = ["epoch_id", "step_id"] + epoch_id, step_id = fluid.io.load_trainer_args( + self.dirname, serial, self.trainer_id, trainer_args) + self.assertEqual(self.step_id, int(step_id)) + self.assertEqual(self.epoch_id, int(epoch_id)) + + program = fluid.Program() + with fluid.program_guard(program): + exe = fluid.Executor(self.place) + fluid.io.load_checkpoint(exe, self.dirname, serial, program) + + fluid.io.clean_checkpoint(self.dirname, delete_dir=True) + self.assertFalse(os.path.isdir(self.dirname)) + + def save_checkpoint(self): + config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints, + self.epoch_interval, self.step_interval) + + trainer_args = {} + trainer_args["epoch_id"] = self.epoch_id + trainer_args["step_id"] = self.step_id + + program = fluid.Program() + with fluid.program_guard(program): + program.global_block().create_var( + name="scale_0", + psersistable=True, + dtype="float32", + shape=[32, 32]) + + exe = fluid.Executor(self.place) + for i in xrange(10): + fluid.io.save_checkpoint(exe, config.checkpoint_dir, + self.trainer_id, trainer_args, program, + config.max_num_checkpoints) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 20cc3a643f1adfc04faad15e1b7baad3e22d9d29..4016089c01644f0389855ab114360f90c50a1bbe 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -42,9 +42,9 @@ class TestCropOp(OpTest): def setUp(self): self.op_type = "crop" self.crop_by_input = False + self.offset_by_input = False self.attrs = {} self.initTestCase() - self.attrs['offsets'] = self.offsets if self.crop_by_input: self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), @@ -55,6 +55,10 @@ class TestCropOp(OpTest): self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), } + if self.offset_by_input: + self.inputs['Offsets'] = np.array(self.offsets).astype('int32') + else: + self.attrs['offsets'] = self.offsets self.outputs = { 'Out': crop(self.inputs['X'], self.offsets, self.crop_shape) } @@ -101,5 +105,22 @@ class TestCase4(TestCropOp): self.crop_by_input = True +class TestCase5(TestCropOp): + def initTestCase(self): + self.x_shape = (3, 4, 5) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 0, 2] + self.offset_by_input = True + + +class TestCase6(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.crop_by_input = True + self.offset_by_input = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index fa49bd41a5876847d046682dce5c3d3868a18500..b4379ad447e01683325dfcbb6a5b322f0b8eac3d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -13,39 +13,16 @@ # limitations under the License. import unittest - import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import numpy + +from transpiler_test import TranspilerTest -class TestDistTranspiler(unittest.TestCase): +class TestDistTranspiler(TranspilerTest): def setUp(self): - self.trainer_id = 0 - self.trainers = 2 - self.pservers = 2 - self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" self.current_pserver_ep = "127.0.0.1:6174" - def net_conf(self): - x = fluid.layers.data(name='x', shape=[1000], dtype='float32') - - y_predict = fluid.layers.fc(input=x, - size=1000, - act=None, - param_attr=fluid.ParamAttr(name='fc_w')) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) - - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - return optimize_ops, params_grads - def test_transpiler(self): trainer = self.get_trainer() pserver, startup = self.get_pserver(self.current_pserver_ep) @@ -70,14 +47,6 @@ class TestDistTranspiler(unittest.TestCase): fc_w_var = startup.global_block().var("fc_w.block1") self.assertEqual(fc_w_var.shape, (500, 1000)) - def get_main_program(self): - main = fluid.Program() - - with fluid.program_guard(main): - self.net_conf() - - return main - def get_expect_trainer_ops(self): trainer = fluid.Program() @@ -86,31 +55,12 @@ class TestDistTranspiler(unittest.TestCase): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send_vars", "send_barrier", "recv", "recv", + "split_byref", "send", "send_barrier", "recv", "recv", "fetch_barrier", "concat" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops - def get_trainer(self): - return self._transpiler_instance().get_trainer_program() - - def get_pserver(self, ep): - t = self._transpiler_instance() - pserver = t.get_pserver_program(ep) - startup = t.get_startup_program(ep, pserver) - return pserver, startup - - def _transpiler_instance(self): - main = self.get_main_program() - t = fluid.DistributeTranspiler() - t.transpile( - self.trainer_id, - program=main, - pservers=self.pserver_eps, - trainers=self.trainers) - return t - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 22329390754d8d010dced0d1aca35617140cd097..95af51f1b2f8cd9492baa9cb14fe31ffa586f2fc 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -30,9 +30,6 @@ class Memory(object): assert val.dtype == self.ex.dtype self.cur = val - def ex(self): - return self.ex - def next(self): self.ex = self.cur self.cur = None diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 1f52bd90d0d49bda6c180019e90ebd923c91439c..96d47906a0606bba4b1d2207f7da85b058e42a2b 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp): self.axis = 1 +class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(3, 10, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ca08fd7fc8e452b36a984ee5db3832e65e97ef78..f8cf6f4e2d25c0c03a3a73dca8e6bc1990b3b78b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -379,6 +379,28 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_polygon_box_transform(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[8, 4, 4], dtype="float32") + output = layers.polygon_box_transform(input=x) + self.assertIsNotNone(output) + print(str(program)) + + def test_l2_normalize(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[8, 7, 10], dtype="float32") + output = layers.l2_normalize(x, axis=1) + + def test_maxout(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[8, 6, 6], dtype="float32") + output = layers.maxout(x=data, groups=2) + self.assertIsNotNone(output) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index cf89f9d0ebf6200933e539ef7fa8cbdc8f6db058..d1d709551c77908db88be6fda7ac74d4e922138e 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -23,7 +23,7 @@ from multiprocessing import Process from op_test import OpTest -def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id): +def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): x = fluid.layers.data(name='x', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') @@ -39,15 +39,8 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - port = os.getenv("PADDLE_INIT_PORT", port) - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS", ip) # ip,ip... - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, port])) - pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS", trainer_count)) - current_endpoint = os.getenv("POD_IP", ip) + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", trainer_id)) + pserver_endpoints = ip + ":" + port + current_endpoint = ip + ":" + port t = fluid.DistributeTranspiler() t.transpile( trainer_id, @@ -62,47 +55,52 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id): class TestListenAndServOp(OpTest): def setUp(self): - self.sleep_time = 5 + self.ps_timeout = 5 self.ip = "127.0.0.1" self.port = "6173" - self.trainer_count = 1 + self.trainers = 1 self.trainer_id = 1 - def _raise_signal(self, parent_pid, raised_signal): - time.sleep(self.sleep_time) - ps_command = subprocess.Popen( - "ps -o pid --ppid %d --noheaders" % parent_pid, - shell=True, - stdout=subprocess.PIPE) - ps_output = ps_command.stdout.read() - retcode = ps_command.wait() - assert retcode == 0, "ps command returned %d" % retcode - - for pid_str in ps_output.split("\n")[:-1]: - try: - os.kill(int(pid_str), raised_signal) - except Exception: - continue - def _start_pserver(self, use_cuda, sync_mode): p = Process( target=run_pserver, - args=(use_cuda, sync_mode, self.ip, self.port, self.trainer_count, + args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, self.trainer_id)) p.start() + return p.pid + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def test_rpc_interfaces(self): + # TODO(Yancey1989): need to make sure the rpc interface correctly. + pass def test_handle_signal_in_serv_op(self): # run pserver on CPU in sync mode - self._start_pserver(False, True) + pid = self._start_pserver(False, True) + self._wait_ps_ready(pid) - # raise SIGINT to pserver - self._raise_signal(os.getpid(), signal.SIGINT) + # raise SIGTERM to pserver + os.kill(pid, signal.SIGTERM) # run pserver on CPU in async mode - self._start_pserver(False, False) + pid = self._start_pserver(False, False) + self._wait_ps_ready(pid) # raise SIGTERM to pserver - self._raise_signal(os.getpid(), signal.SIGTERM) + os.kill(pid, signal.SIGTERM) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index f8ff5a3361af66612f08b2aa4eaffa363f04c594..e726f99d49877a1bc464090092ec80b97ab15d0c 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -194,107 +194,104 @@ class TestLstmOp(OpTest): ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) -class TestLstmOpHasInitial(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = True - self.is_reverse = True - self.use_peepholes = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], - max_relative_error=5e-4) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Bias')) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Weight')) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Weight', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Input')) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('H0')) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('C0')) - - -class TestLstmOpRerverse(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = True - - -class TestLstmOpNotUsePeepholes(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = False - +# class TestLstmOpHasInitial(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = True +# self.is_reverse = True +# self.use_peepholes = True + +# def test_check_grad(self): +# # TODO(qingqing) remove folowing lines after the check_grad is refined. +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], +# max_relative_error=5e-4) + +# def test_check_grad_ingore_bias(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Bias')) + +# def test_check_grad_ingore_weight(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Weight')) + +# def test_check_grad_ingore_input(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Weight', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Input')) + +# def test_check_grad_ingore_h0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('H0')) + +# def test_check_grad_ingore_c0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('C0')) + +# class TestLstmOpRerverse(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = True + +# class TestLstmOpNotUsePeepholes(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = False if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..64d42b693bf11f3cb0153243909db4c0612bf4e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +import unittest +import numpy as np +from op_test import OpTest + + +def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects, + in_mean_ious): + assert predictions.shape == labels.shape + predictions = predictions.flatten() + labels = labels.flatten() + + out_wrong = np.zeros([num_classes]).astype("int32") + for _, wrong in in_wrongs: + out_wrong += wrong + out_correct = np.zeros([num_classes]).astype("int32") + for _, correct in in_corrects: + out_correct += correct + + for pred, label in zip(predictions, labels): + if pred == label: + out_correct[pred] += 1 + else: + out_wrong[pred] += 1 + out_wrong[label] += 1 + + denominator = out_wrong + out_correct + valid_count = (denominator != 0).sum() + denominator = np.where(denominator > 0, denominator, + np.ones(denominator.shape)) + mean_iou = (out_correct / denominator).sum() / valid_count + + for _, in_mean_iou in in_mean_ious: + mean_iou += in_mean_iou + return mean_iou, out_wrong, out_correct + + +class TestMeanIOUOp(OpTest): + def setUp(self): + self.config() + self.op_type = "mean_iou" + predictions = np.random.randint(0, self.num_classes, + self.image_size).astype("int32") + labels = np.random.randint(0, self.num_classes, + self.image_size).astype("int32") + + in_wrongs = [] + for i in range(self.in_wrong_num): + in_wrongs.append(("in_wrong_%d" % i, np.random.randint( + 0, 10, [self.num_classes]).astype("int32"))) + + in_corrects = [] + for i in range(self.in_correct_num): + in_corrects.append(("in_correct_%d" % i, np.random.randint( + 0, 10, [self.num_classes]).astype("int32"))) + + in_mean_ious = [] + for i in range(self.in_mean_iou_num): + in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform( + 0, 1, [1]).astype("float32"))) + + self.inputs = { + 'Predictions': predictions, + 'Labels': labels, + 'InWrongs': in_wrongs, + 'InCorrects': in_corrects, + 'InMeanIou': in_mean_ious + } + self.attrs = {'num_classes': long(self.num_classes)} + mean_iou, out_wrong, out_correct = compute_mean_iou( + predictions, labels, self.num_classes, in_wrongs, in_corrects, + in_mean_ious) + self.outputs = { + 'OutMeanIou': mean_iou, + 'OutWrong': out_wrong, + 'OutCorrect': out_correct + } + + def config(self): + self.num_classes = 10 + self.image_size = [128, 128] + self.in_wrong_num = 0 + self.in_correct_num = 0 + self.in_mean_iou_num = 0 + + def test_check_output(self): + self.check_output() + + +class TestCase1(TestMeanIOUOp): + def config(self): + self.num_classes = 5 + self.image_size = [100, 128] + self.in_wrong_num = 2 + self.in_correct_num = 2 + self.in_mean_iou_num = 2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f209bdf30faffc0b2c7932b7b10f384d6d61a831 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py @@ -0,0 +1,38 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestMergeIdsOp(OpTest): + def setUp(self): + self.op_type = "merge_ids" + ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64') + x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32') + x1 = np.array([]).astype('float32') + x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6], + [0.5, 0.6]]).astype('float32') + out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3], + [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32') + self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 862b7f8cb93620da4dd4673028776cfe565eeb0b..bbc782c1bce302df68ab30013f3a7667e51ed479 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -22,8 +22,8 @@ class TestMulOp(OpTest): def setUp(self): self.op_type = "mul" self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") + 'X': np.random.random((2, 5)).astype("float32"), + 'Y': np.random.random((5, 3)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} @@ -46,13 +46,16 @@ class TestMulOp2(OpTest): def setUp(self): self.op_type = "mul" self.inputs = { - 'X': np.random.random((15, 4, 12, 10)).astype("float32"), - 'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32") + 'X': np.random.random((3, 4, 4, 3)).astype("float32"), + 'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32") } - self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2} - result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10), - self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9)) - result = result.reshape(15, 4, 8, 2, 9) + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3), + self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) self.outputs = {'Out': result} def test_check_output(self): @@ -73,9 +76,9 @@ class TestMulOp2(OpTest): class TestFP16MulOp1(OpTest): def setUp(self): self.op_type = "mul" - x = np.random.random((32, 84)).astype("float16") - y = np.random.random((84, 100)).astype("float16") - self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} + x = np.random.random((3, 5)).astype("float16") + y = np.random.random((5, 4)).astype("float16") + self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)} self.outputs = {'Out': np.dot(x, y)} def test_check_output(self): @@ -88,13 +91,15 @@ class TestFP16MulOp1(OpTest): class TestFP16MulOp2(OpTest): def setUp(self): self.op_type = "mul" - x = np.random.random((15, 4, 12, 10)).astype("float16") - y = np.random.random((4, 30, 8, 2, 9)).astype("float16") - self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} - self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2} - result = np.dot( - x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9)) - result = result.reshape(15, 4, 8, 2, 9) + x = np.random.random((3, 4, 4, 3)).astype("float16") + y = np.random.random((2, 6, 1, 2, 3)).astype("float16") + self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)} + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) self.outputs = {'Out': result} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 6feda175fb537db894ac7f19e22297f6062a4d61..108a665f37f5cd652ec83f784a56ca52e6b49fe8 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -17,44 +17,23 @@ import numpy as np from op_test import OpTest -def norm(input, scale, epsilon): - s0, s1, s2, s3 = input.shape - x_square = input * input - for i in xrange(s0): - input_batch = input[i:i + 1, :, :, :] - input_batch = input_batch.reshape(s1, s2 * s3) - x_square_batch = x_square[i:i + 1, :, :, :] - x_square_batch = x_square_batch.reshape(s1, s2 * s3) - square_colsum = x_square_batch.sum(axis=0) + epsilon - tmp = pow(square_colsum, 0.5) - tmp = np.reciprocal(tmp) - tmp_tile = np.tile(tmp, s1) - tmp_tile = tmp_tile.reshape(s1, s2 * s3) - scale_tile = np.tile(scale, (1, s2 * s3)) - scale_tile = scale_tile.reshape(s1, s2 * s3) - out_batch = input_batch * tmp_tile * scale_tile - out_batch = out_batch.reshape(1, s1, s2, s3) - if i == 0: - out = out_batch - else: - out = np.concatenate((out, out_batch), 0) - out.reshape(s0, s1, s2, s3) - return out +def l2_norm(x, axis, epsilon): + x2 = x**2 + s = np.sum(x2, axis=axis, keepdims=True) + r = np.sqrt(s + epsilon) + y = x / np.broadcast_to(r, x.shape) + return y, r class TestNormOp(OpTest): def setUp(self): self.op_type = "norm" self.init_test_case() - input = np.random.random(self.shape).astype("float32") - scale = np.array([10, 10, 10]) - self.inputs = { - 'X': input.astype('float32'), - 'Scale': scale.astype('float32') - } - self.attrs = {'epsilon': self.epsilon} - output = norm(input, scale, self.epsilon) - self.outputs = {'Out': output.astype('float32')} + x = np.random.random(self.shape).astype("float64") + y, norm = l2_norm(x, self.axis, self.epsilon) + self.inputs = {'X': x} + self.attrs = {'epsilon': self.epsilon, 'axis': self.axis} + self.outputs = {'Out': y, 'Norm': norm} def test_check_output(self): self.check_output() @@ -63,8 +42,23 @@ class TestNormOp(OpTest): self.check_grad(['X'], 'Out') def init_test_case(self): - self.shape = [2, 3, 2, 2] - self.epsilon = 1e-6 + self.shape = [2, 3, 4, 4] + self.axis = 1 + self.epsilon = 1e-8 + + +class TestNormOp2(TestNormOp): + def init_test_case(self): + self.shape = [5, 3, 9, 7] + self.axis = 0 + self.epsilon = 1e-8 + + +class TestNormOp3(TestNormOp): + def init_test_case(self): + self.shape = [5, 3, 2, 7] + self.axis = -1 + self.epsilon = 1e-8 if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py index ef34893943d8f6bf91b1eb14378e463c178de84d..198c68866d399023c51c2a43b588aa8ec49c3c9a 100644 --- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py +++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py @@ -70,8 +70,9 @@ class TestNormalization(unittest.TestCase): def l2_normalize(self, data, axis, epsilon): """ Compute the groundtruth. """ - output = data * np.reciprocal( - np.sum(np.square(data), axis=axis, keepdims=True)) + output = data / np.broadcast_to( + np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)), + data.shape) return output def test_l2_normalize(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 66e138b03f3b170aca4fb2207438eb9af1783c33..163975555ec2cea5c169cc1da3c4324d91ba3616 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -17,6 +17,7 @@ import paddle.fluid as fluid import unittest import paddle import numpy as np +import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) @@ -101,7 +102,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, class TestCRFModel(unittest.TestCase): - def check_network_convergence(self, is_sparse, build_strategy=None): + def check_network_convergence(self, + is_sparse, + build_strategy=None, + use_cuda=True): + os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -145,12 +150,12 @@ class TestCRFModel(unittest.TestCase): paddle.dataset.conll05.test(), buf_size=8192), batch_size=16) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) pe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, loss_name=avg_cost.name, build_strategy=build_strategy) @@ -172,25 +177,33 @@ class TestCRFModel(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=False) def test_update_dense_parameter_all_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) + is_sparse=False, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy, use_cuda=False) def test_update_sparse_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=False) def test_update_dense_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) + is_sparse=False, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 24f8d28c0304a77a99213374b25d0db728eca265..79702475cca86ca22107d4b1824fda277dd83157 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import unittest import numpy as np import paddle +import os def Lenet(data, class_dim): @@ -35,7 +36,7 @@ def Lenet(data, class_dim): class TestFetchOp(unittest.TestCase): - def parallel_exe(self, train_inputs, seed): + def parallel_exe(self, train_inputs, seed, use_cuda): main = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -59,13 +60,13 @@ class TestFetchOp(unittest.TestCase): # conv2d_1.b_0@GRAD. Those variables should not be pruned. # fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=use_cuda, loss_name=loss.name, main_program=main) fetch_list = [] all_vars = main.global_block().vars @@ -88,14 +89,16 @@ class TestFetchOp(unittest.TestCase): for i in range(iters): train_inputs.append(tst_reader_iter.next()) - self.parallel_exe(train_inputs, seed=1) + os.environ['CPU_NUM'] = str(4) + self.parallel_exe(train_inputs, seed=1, use_cuda=True) + self.parallel_exe(train_inputs, seed=1, use_cuda=False) class TestFeedParallel(unittest.TestCase): - def test_main(self): + def parallel_exe(self, use_cuda, seed): main = fluid.Program() startup = fluid.Program() - startup.random_seed = 1 + startup.random_seed = seed with fluid.scope_guard(fluid.core.Scope()): with fluid.program_guard(main, startup): data = fluid.layers.data( @@ -111,15 +114,18 @@ class TestFeedParallel(unittest.TestCase): regularization=fluid.regularizer.L2Decay(1e-4)) opt.minimize(loss) - place = fluid.CUDAPlace(0) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) reader = feeder.decorate_reader( paddle.batch( flowers.train(), batch_size=16), multi_devices=True) + exe = fluid.Executor(place) exe.run(startup) + pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=use_cuda, loss_name=loss.name, main_program=main) for batch_id, data in enumerate(reader()): loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) @@ -127,6 +133,11 @@ class TestFeedParallel(unittest.TestCase): if batch_id == 2: break + def test_feed_op(self): + os.environ['CPU_NUM'] = str(4) + self.parallel_exe(use_cuda=True, seed=1) + self.parallel_exe(use_cuda=False, seed=1) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 015703c3e25f4e11e64ab6a7de99da12bee608f6..a801d99aa1ced35eb7f081fde63ad541f0eb2589 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -18,6 +18,7 @@ import numpy as np import paddle import paddle.dataset.mnist as mnist import unittest +import os MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" @@ -85,6 +86,7 @@ def fc_with_batchnorm(use_feed): class TestMNIST(TestParallelExecutorBase): @classmethod def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) # Convert mnist to recordio file with fluid.program_guard(fluid.Program(), fluid.Program()): reader = paddle.batch(mnist.train(), batch_size=4) @@ -99,9 +101,12 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): - self.check_network_convergence(simple_fc_net) - self.check_network_convergence(simple_fc_net, allow_op_delay=True) + def check_simple_fc_convergence(self, + balance_parameter_opt_between_cards, + use_cuda=True): + self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') @@ -109,17 +114,21 @@ class TestMNIST(TestParallelExecutorBase): simple_fc_net, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_simple_fc(self): - self.check_simple_fc_convergence(False) + self.check_simple_fc_convergence(False, use_cuda=True) + self.check_simple_fc_convergence(False, use_cuda=False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(True, use_cuda=True) + self.check_simple_fc_convergence(True, use_cuda=False) def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards): + balance_parameter_opt_between_cards, + use_cuda=True): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -127,12 +136,14 @@ class TestMNIST(TestParallelExecutorBase): seed=1000, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1000, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, use_parallel_executor=True, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) @@ -143,28 +154,33 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(False, use_cuda=True) + self.check_simple_fc_parallel_accuracy(False, use_cuda=False) def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(True, use_cuda=True) + self.check_simple_fc_parallel_accuracy(True, use_cuda=False) - def check_batchnorm_fc_convergence(self, - balance_parameter_opt_between_cards): - self.check_network_convergence(fc_with_batchnorm) + def check_batchnorm_fc_convergence( + self, balance_parameter_opt_between_cards, use_cuda): + self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') self.check_network_convergence( fc_with_batchnorm, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False) + self.check_batchnorm_fc_convergence(False, use_cuda=True) + self.check_batchnorm_fc_convergence(False, use_cuda=False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(True, use_cuda=True) + self.check_batchnorm_fc_convergence(True, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index a3fa140cbb7994a36d2cbee26d598165f1f771d2..066299e6c6f7f6c159cb0886e86d3404b027b698 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -15,6 +15,7 @@ import paddle.fluid as fluid from parallel_executor_test_base import TestParallelExecutorBase import unittest +import os def squeeze_excitation(input, num_channels, reduction_ratio): @@ -130,22 +131,30 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, balance_parameter_opt_between_cards): + def check_resnet_convergence(self, + balance_parameter_opt_between_cards, + use_cuda=True, + iter=20): + os.environ['CPU_NUM'] = str(4) + import functools batch_size = 2 self.check_network_convergence( functools.partial( SE_ResNeXt50Small, batch_size=batch_size), - iter=20, + iter=iter, batch_size=batch_size, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_resnet(self): - self.check_resnet_convergence(False) + self.check_resnet_convergence(False, use_cuda=True) + self.check_resnet_convergence(False, use_cuda=False, iter=5) def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True) + self.check_resnet_convergence(True, use_cuda=True) + self.check_resnet_convergence(True, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 93a5f767867d68110cf7b8f441cc740ecd843cf9..31ba8c1d6096c9c89e0695c8eca8e16a5e303a61 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -15,6 +15,7 @@ import paddle.fluid as fluid import numpy as np import unittest +import os def simple_fc_net(): @@ -35,7 +36,8 @@ def simple_fc_net(): class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def check_network_convergence(self, build_strategy=None): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -49,19 +51,19 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): image = np.random.normal(size=(batch_size, 784)).astype('float32') label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) feed_dict = {'image': image, 'label': label} train_exe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, loss_name=loss.name, main_program=main, build_strategy=build_strategy) test_exe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, main_program=test_program, share_vars_from=train_exe, build_strategy=build_strategy) @@ -81,12 +83,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): def test_parallel_testing(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence(build_strategy) + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) + self.check_network_convergence( + use_cuda=False, build_strategy=build_strategy) def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence(build_strategy) + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) + self.check_network_convergence( + use_cuda=False, build_strategy=build_strategy) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index c81df66d987f3d3856af0e19fc935df7de2edacc..b6215fddb11bb6b3a76b5a6395e7254d21971c13 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -19,6 +19,7 @@ from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle import paddle.dataset.wmt16 as wmt16 +import os WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" @@ -149,6 +150,7 @@ def transformer(use_feed): class TestTransformer(TestParallelExecutorBase): @classmethod def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) reader = paddle.batch( wmt16.train(ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size), @@ -167,7 +169,8 @@ class TestTransformer(TestParallelExecutorBase): @unittest.skip("transformer is buggy in multi gpu") def test_main(self): - self.check_network_convergence(transformer) + self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence(transformer, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f845575a02869f08299d76b5600074598ca27f6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestReverseOp(OpTest): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [0] + + def setUp(self): + self.initTestCase() + self.op_type = "reverse" + self.inputs = {"X": self.x} + self.attrs = {'axis': self.axis} + out = self.x + for a in self.axis: + out = np.flip(out, axis=a) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestCase0(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [1] + + +class TestCase1(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [0, 1] + + +class TestCase2(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4, 5)).astype('float32') + self.axis = [0, 2] + + +class TestCase3(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4, 5)).astype('float32') + self.axis = [1, 2] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f4aa7426bc315be501348a64e2f15caed6dc8810 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid.transpiler.distribute_transpiler import delete_ops + +from transpiler_test import TranspilerTest + + +class TestSimpleDistTranspiler(TranspilerTest): + def setUp(self): + self.current_pserver_ep = "127.0.0.1:6175" + + def test_simple_transpiler(self): + np.random.seed(1) + + trainer = self.get_trainer() + pserver, startup = self.get_pserver(self.current_pserver_ep) + self.assertEqual([op.type for op in trainer.global_block().ops], + self.get_expect_trainer_ops()) + + self.assertEqual(len(pserver.blocks), 2) + # block0: listen_and_serv + self.assertEqual([op.type for op in pserver.blocks[0].ops], + ["listen_and_serv"]) + # block1: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "sgd"]) + + # confirm startup program + self.assertEqual([op.type for op in startup.global_block().ops], + ["fill_constant", "uniform_random", "uniform_random"]) + + # the variable #fc_w will NOT be splited + fc_w_var = startup.global_block().var("fc_w@GRAD") + self.assertEqual(fc_w_var.shape, (1000, 1000)) + + fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0") + self.assertEqual(fc_w_var.shape, (1000, 1000)) + + def get_expect_trainer_ops(self): + trainer = fluid.Program() + + with fluid.program_guard(trainer): + optimize_ops, params_grads = self.net_conf() + + delete_ops(trainer.global_block(), optimize_ops) + ops = [op.type for op in trainer.global_block().ops] + [ + "send", "send_barrier", "recv", "recv", "fetch_barrier" + ] + ops.insert(ops.index("elementwise_add_grad") + 1, "send") + return ops + + def _transpiler_instance(self): + main = self.get_main_program() + t = fluid.DistributeTranspiler() + t.transpile( + self.trainer_id, + program=main, + pservers=self.pserver_eps, + trainers=self.trainers, + slice_var_up=False) + return t + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1a48bce3bb7c74551a365fd471f6869b128babac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -0,0 +1,62 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSliceOp(OpTest): + def setUp(self): + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output() + + +class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + +class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py similarity index 85% rename from python/paddle/fluid/tests/unittests/test_split_var.py rename to python/paddle/fluid/tests/unittests/test_slice_var.py index 157def9b56e44092a86023035d1ab444c938aa07..82305b23a1a1e2cee8cef6b291d848581fe5b509 100644 --- a/python/paddle/fluid/tests/unittests/test_split_var.py +++ b/python/paddle/fluid/tests/unittests/test_slice_var.py @@ -14,14 +14,14 @@ import math import unittest -from paddle.fluid.transpiler.distribute_transpiler import split_variable +from paddle.fluid.transpiler.distribute_transpiler import slice_variable import paddle.fluid as fluid import paddle.fluid.core as core import random -class TestSplitVar(unittest.TestCase): - def check_split_output(self, shapes, expected_sizes, min_size): +class TestSliceVar(unittest.TestCase): + def check_slice_output(self, shapes, expected_sizes, min_size): var_list = [] program = fluid.Program() for shape in shapes: @@ -31,7 +31,7 @@ class TestSplitVar(unittest.TestCase): # dtype=core.VarDesc.VarType.LOD_TENSOR, shape=shape) var_list.append(var) - blocks = split_variable(var_list, 10, min_size) + blocks = slice_variable(var_list, 10, min_size) all_sizes = [] for s in expected_sizes: for s2 in s: @@ -49,7 +49,7 @@ class TestSplitVar(unittest.TestCase): [1150, 1150, 1150, 1150, 1150, 1150, 1100] ] - self.check_split_output(shapes, expected_sizes, 1024) + self.check_slice_output(shapes, expected_sizes, 1024) def test_check_output_8k(self): shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10], @@ -57,7 +57,7 @@ class TestSplitVar(unittest.TestCase): expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000], [35937, 35937, 35937, 35937, 35937, 35937]] - self.check_split_output(shapes, expected_sizes, 8192) + self.check_slice_output(shapes, expected_sizes, 8192) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc94a80c9d3999d34fdf0edbf82ffe297bd95d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -0,0 +1,182 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def as_lodtensor(np_array, lod, place): + tensor = core.LoDTensor() + tensor.set(np_value, place) + if lod is not None: + tensor.set_lod(lod) + return tensor + + +def create_op(scope, op_type, inputs, outputs, attrs): + kwargs = dict() + + op_maker = core.op_proto_and_checker_maker + op_role_attr_name = op_maker.kOpRoleAttrName() + + if op_role_attr_name not in attrs: + attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) + + def __create_var__(name, var_name): + scope.var(var_name).get_tensor() + kwargs[name].append(var_name) + + for in_name, in_dup in Operator.get_op_inputs(op_type): + if in_name in inputs: + kwargs[in_name] = [] + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, _ = item[0], item[1] + __create_var__(in_name, sub_in_name) + else: + __create_var__(in_name, in_name) + + for out_name, out_dup in Operator.get_op_outputs(op_type): + if out_name in outputs: + kwargs[out_name] = [] + if out_dup: + sub_out = outputs[out_name] + for item in sub_out: + sub_out_name, _ = item[0], item[1] + __create_var__(out_name, sub_out_name) + else: + __create_var__(out_name, out_name) + + for attr_name in Operator.get_op_attr_names(op_type): + if attr_name in attrs: + kwargs[attr_name] = attrs[attr_name] + + return Operator(op_type, **kwargs) + + +def set_input(scope, op, inputs, place): + def __set_input__(var_name, var): + if isinstance(var, tuple) or isinstance(var, np.ndarray): + tensor = scope.find_var(var_name).get_tensor() + if isinstance(var, tuple): + tensor.set_lod(var[1]) + var = var[0] + tensor.set_dims(var.shape) + tensor.set(var, place) + elif isinstance(var, float): + scope.find_var(var_name).set_float(var) + elif isinstance(var, int): + scope.find_var(var_name).set_int(var) + + for in_name, in_dup in Operator.get_op_inputs(op.type()): + if in_name in inputs: + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] + __set_input__(sub_in_name, sub_in_val) + else: + __set_input__(in_name, inputs[in_name]) + + +def append_input_output(block, op_proto, np_list, is_input, dtype): + '''Insert VarDesc and generate Python variable instance''' + proto_list = op_proto.inputs if is_input else op_proto.outputs + + def create_var(block, name, np_list, var_proto): + dtype = None + shape = None + lod_level = None + if name not in np_list: + assert var_proto.intermediate, "{} not found".format(name) + else: + np_value = np_list[name] + if isinstance(np_value, tuple): + dtype = np_value[0].dtype + # output shape, lod should be infered from input. + if is_input: + shape = list(np_value[0].shape) + lod_level = len(np_value[1]) + else: + dtype = np_value.dtype + if is_input: + shape = list(np_value.shape) + lod_level = 0 + return block.create_var( + dtype=dtype, shape=shape, lod_level=lod_level, name=name) + + var_dict = {} + for var_proto in proto_list: + var_name = str(var_proto.name) + if is_input: + if (var_name not in np_list) and var_proto.dispensable: + continue + assert (var_name in np_list) or (var_proto.dispensable), \ + "Missing {} as input".format(var_name) + if var_proto.duplicable: + assert isinstance(np_list[var_name], list), \ + "Duplicable {} should be set as list".format(var_name) + var_list = [] + for (name, np_value) in np_list[var_name]: + var_list.append( + create_var(block, name, {name: np_value}, var_proto)) + var_dict[var_name] = var_list + else: + var_dict[var_name] = create_var(block, var_name, np_list, var_proto) + + return var_dict + + +def append_loss_ops(block, output_names): + mean_inputs = map(block.var, output_names) + # for item in mean_inputs: + # print(item) + # print("Item", item.dtype) + + if len(mean_inputs) == 1: + loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) + op = block.append_op( + inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + else: + avg_sum = [] + for cur_loss in mean_inputs: + cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) + op = block.append_op( + inputs={"X": [cur_loss]}, + outputs={"Out": [cur_avg_loss]}, + type="mean") + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + avg_sum.append(cur_avg_loss) + + loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) + op_sum = block.append_op( + inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') + op_sum.desc.infer_var_type(block.desc) + op_sum.desc.infer_shape(block.desc) + + loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) + op_loss = block.append_op( + inputs={"X": loss_sum}, + outputs={"Out": loss}, + type='scale', + attrs={'scale': 1.0 / float(len(avg_sum))}) + op_loss.desc.infer_var_type(block.desc) + op_loss.desc.infer_shape(block.desc) + return loss diff --git a/python/paddle/fluid/tests/unittests/transpiler_test.py b/python/paddle/fluid/tests/unittests/transpiler_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d84c5d9c41c705cf6d14cc0b5a8c692b0d646337 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/transpiler_test.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.layers as layers + + +class TranspilerTest(unittest.TestCase): + @classmethod + def setUpClass(self): + self.trainer_id = 0 + self.trainers = 2 + self.pservers = 2 + self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" + + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w')) + + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + return optimize_ops, params_grads + + def get_main_program(self): + main = fluid.Program() + + with fluid.program_guard(main): + self.net_conf() + + return main + + def get_trainer(self): + return self._transpiler_instance().get_trainer_program() + + def get_pserver(self, ep): + t = self._transpiler_instance() + pserver = t.get_pserver_program(ep) + startup = t.get_startup_program(ep, pserver) + return pserver, startup + + def _transpiler_instance(self): + main = self.get_main_program() + t = fluid.DistributeTranspiler() + t.transpile( + self.trainer_id, + program=main, + pservers=self.pserver_eps, + trainers=self.trainers) + return t diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 7da123dd92ed9d111d68cd70efb8ce1493452609..efc28d899304b01a3085891f3ae9396d57c589a1 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -27,11 +27,8 @@ import parallel_executor from transpiler import distribute_transpiler __all__ = [ - 'Trainer', - 'BeginEpochEvent', - 'EndEpochEvent', - 'BeginStepEvent', - 'EndStepEvent', + 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', + 'EndStepEvent', 'CheckpointConfig' ] @@ -59,6 +56,35 @@ class EndStepEvent(object): self.metrics = metrics +class CheckpointConfig(object): + def __init__(self, + checkpoint_dir=None, + max_num_checkpoints=3, + epoch_interval=1, + step_interval=10): + if checkpoint_dir is None: + self.checkpoint_dir = os.getcwd() + else: + self.checkpoint_dir = checkpoint_dir + + self.max_num_checkpoints = max_num_checkpoints + + if epoch_interval < 1: + self.epoch_interval = 1 + else: + self.epoch_interval = epoch_interval + + if step_interval < 1: + self.step_interval = 10 + else: + self.step_interval = step_interval + + self.epoch_id = 0 + self.step_id = 0 + self.load_serial = None + self.is_pserver = False + + def check_and_get_place(place): """ Check the type of place or get the default place @@ -90,23 +116,32 @@ class Trainer(object): Args: train_func(callable): A function which will return loss. The loss must be a scalar. - optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer + optimizer_func(callable): A function that returns an Optimizer object. place: The device place of this trainer. """ def __init__(self, train_func, - optimizer, + optimizer_func, param_path=None, place=None, - parallel=False): + parallel=False, + checkpoint_config=None): self.__stop = False self.parallel = parallel # 1. we need to generate a framework.Program by calling # program_func. Reference: fluid.program_guard in # test_word2vec.py - if not isinstance(optimizer, opt_module.Optimizer): - raise TypeError("The optimizer should be an instance of Optimizer") + + # config for checkpoint + # only chief worker will save variables + self.trainer_id = 0 + self.checkpoint_cfg = checkpoint_config + if self.checkpoint_cfg: + assert isinstance(self.checkpoint_cfg, CheckpointConfig) + serial = io.get_latest_checkpoint_serial( + self.checkpoint_cfg.checkpoint_dir) + self.checkpoint_cfg.load_serial = serial if serial >= 0 else None self.scope = core.Scope() @@ -117,12 +152,15 @@ class Trainer(object): program_func_outs = train_func() self.train_func_outputs = program_func_outs if isinstance( program_func_outs, list) else [program_func_outs] - self.test_program = self.train_program.clone() + self.test_program = self.train_program.clone(for_test=True) + + # The first element of program_func_outs is loss. + loss = self.train_func_outputs[0] + + optimizer = optimizer_func() if not isinstance(optimizer, opt_module.Optimizer): raise TypeError( "The optimizer should be an instance of Optimizer") - # The fisrt element of program_func_outs is loss. - loss = self.train_func_outputs[0] optimize_ops, params_grads = optimizer.minimize(loss) self.place = check_and_get_place(place) @@ -136,9 +174,25 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if param_path: + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, + self.startup_program) + + if not self.checkpoint_cfg.is_pserver: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.trainer_id, + self._get_checkpoint_load_args()) + self.checkpoint_cfg.epoch_id = int(epoch_id) + self.checkpoint_cfg.step_id = int(step_id) + + if param_path and os.path.isdir(param_path): # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) + io.load_persist_vars_without_grad( + exe, dirname=param_path, program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -193,14 +247,18 @@ class Trainer(object): current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port # the unique trainer id, starting from 0, needed by trainer # only - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): t = distribute_transpiler.DistributeTranspiler() t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) + self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": + if self.checkpoint_cfg: + self.is_pserver = True + self.train_program = t.get_pserver_program(current_endpoint) self.startup_program = t.get_startup_program(current_endpoint, self.train_program) @@ -293,11 +351,26 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - for epoch_id in range(num_epochs): + if self.checkpoint_cfg: + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint_cfg.epoch_id + ] + else: + epochs = [epoch_id for epoch_id in range(num_epochs)] + + for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): if self.__stop: + if self.checkpoint_cfg: + self._clean_checkpoint() return + + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ + and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: + continue + begin_event = BeginStepEvent(epoch_id, step_id) event_handler(begin_event) if begin_event.fetch_metrics: @@ -308,8 +381,13 @@ class Trainer(object): ]) else: metrics = exe.run(feed=data, fetch_list=[]) + + if self.checkpoint_cfg: + self._save_checkpoint(epoch_id, step_id) event_handler(EndStepEvent(epoch_id, step_id, metrics)) event_handler(EndEpochEvent(epoch_id)) + if self.checkpoint_cfg: + self._clean_checkpoint() def _test_by_executor(self, reader, feed_order, fetch_list): with executor.scope_guard(self.scope): @@ -348,6 +426,38 @@ class Trainer(object): loss_name=self.train_func_outputs[0].name) return self._get_parallel_executor() + def _clean_checkpoint(self): + assert self.checkpoint_cfg + io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) + + def _get_checkpoint_load_args(self): + """ + epoch_id and step_id are runtime arguments, they are not variables, will load them independently. + """ + return ["epoch_id", "step_id"] + + def _get_checkpoint_save_args(self, epoch_id, step_id): + """ + epoch_id and step_id are runtime arguments, they are not variables, will save them independently. + """ + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + return trainer_args + + def _save_checkpoint(self, epoch_id, step_id): + assert self.checkpoint_cfg + + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0: + exe = executor.Executor(self.place) + io.save_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + trainer_id=self.trainer_id, + trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), + main_program=self.train_program, + max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) + def build_feed_var_list(program, feed_order): if not isinstance(program, framework.Program): diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 045ca537b2e84c02298d6375a7ef5bdbb5517380..cf18090f71f34be5105498f5846dbcdf15ab2e3f 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -15,10 +15,9 @@ from distribute_transpiler import DistributeTranspiler from inference_transpiler import InferenceTranspiler from memory_optimization_transpiler import memory_optimize, release_memory -from distribute_transpiler_simple import SimpleDistributeTranspiler from ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler", - "memory_optimize", "release_memory", "HashName", "RoundRobin" + "DistributeTranspiler", "InferenceTranspiler", "memory_optimize", + "release_memory", "HashName", "RoundRobin" ] diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 06b0a1375ce6568cca864cd8a2dd69ee46b223a7..9c604170b8b53c9cbcf39b4978ae60ccad84648c 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -24,9 +24,9 @@ Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. -4. append send_op to send splited variables to server and fetch - params(splited blocks or origin param) from server. -5. append concat_op to merge splited blocks to update local weights. +4. append send_op to send splited variables to server and +5. add recv_op to fetch params(splited blocks or origin param) from server. +6. append concat_op to merge splited blocks to update local weights. Steps to transpile pserver: 1. create new program for parameter server. @@ -39,6 +39,7 @@ Steps to transpile pserver: from __future__ import print_function import math +import numpy as np from ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework @@ -70,7 +71,7 @@ def same_or_split_var(p_name, var_name): return p_name == var_name or p_name.startswith(var_name + ".block") -def split_variable(var_list, service_count, min_block_size=8192): +def slice_variable(var_list, slice_count, min_block_size=8192): """ We may need to split dense tensor to one or more blocks and put them equally onto parameter server. One block is a sub-tensor @@ -78,25 +79,25 @@ def split_variable(var_list, service_count, min_block_size=8192): We need to have a minimal block size so that the calculations in the parameter server side can gain better performance. By default - minimum block size 8K elements (maybe 16bit or 32bit or 64bit). + minimum block size 8K elements (maybe 16bit or 32bit or 64bit). Args: var_list (list): List of variables. - service_count (int): Numel of pserver services. A pserver may have two - or more listening ports. + slice_count (int): Numel of count that variables will be sliced, which + could be the pserver services' count. min_block_size (int): Minimum splitted block size. Returns: - blocks (list[(varname, block_id, current_block_size)]): A list + blocks (list[(varname, block_id, current_block_size)]): A list of VarBlocks. Each VarBlock specifies a shard of the var. """ blocks = [] for var in var_list: - split_count = service_count + split_count = slice_count var_numel = reduce(lambda x, y: x * y, var.shape) max_pserver_count = int(math.floor(var_numel / float(min_block_size))) if max_pserver_count == 0: max_pserver_count = 1 - if max_pserver_count < service_count: + if max_pserver_count < slice_count: split_count = max_pserver_count block_size = int(math.ceil(var_numel / float(split_count))) @@ -176,8 +177,9 @@ class DistributeTranspiler: dtype=table_grad_var.dtype) for index in range(len(self.pserver_endpoints)) ] + return param_list, grad_list - def _init_splited_vars(self, split_method): + def _init_splited_vars(self, slice_var_up): # update these mappings for further transpile: # 1. param_var_mapping: param var name -> [splited params vars] # 2. grad_var_mapping: grad var name -> [splited grads vars] @@ -186,19 +188,34 @@ class DistributeTranspiler: param_list = [] grad_list = [] + param_grad_set = set() for p, g in self.params_grads: # skip parameter marked not trainable if type(p) == Parameter and p.trainable == False: continue - param_list.append(p) - grad_list.append(g) - - self._update_dist_lookup_table_vars(param_list, grad_list, - self.params_grads) - - grad_blocks = split_variable(grad_list, len(self.pserver_endpoints)) - param_blocks = split_variable(param_list, len(self.pserver_endpoints)) + if p.name not in param_grad_set: + param_list.append(p) + param_grad_set.add(p.name) + if g.name not in param_grad_set: + grad_list.append(g) + param_grad_set.add(g.name) + + param_list, grad_list = self._update_dist_lookup_table_vars( + param_list, grad_list, self.params_grads) + + if slice_var_up: + # when we slice var up into blocks, we will slice the var according to + # pserver services' count. A pserver may have two or more listening ports. + grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + param_blocks = slice_variable(param_list, + len(self.pserver_endpoints)) + else: + # when we do NOT slice var up into blocks, we will always slice params + # grads into one block. + grad_blocks = slice_variable(grad_list, 1) + param_blocks = slice_variable(param_list, 1) assert (len(grad_blocks) == len(param_blocks)) + # origin_varname -> [splited_var] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) @@ -229,6 +246,7 @@ class DistributeTranspiler: program=None, pservers="127.0.0.1:6174", trainers=1, + slice_var_up=True, split_method=RoundRobin, sync_mode=True): """ @@ -262,13 +280,27 @@ class DistributeTranspiler: self.has_distributed_lookup_table = self._has_distributed_lookup_table() # split and create vars, then put splited vars in dicts for later use. - self._init_splited_vars(split_method) + self._init_splited_vars(slice_var_up) # step 3.1: insert send op to send gradient vars to parameter servers ps_dispatcher.reset() send_vars = [] - for orig_varname, splited_vars in self.grad_var_mapping.items(): + + # in general cases, the number of pservers is times of 2, and this + # will lead to uneven distribution among weights and bias: + # fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1 + # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 + # shuffle the map will avoid the uneven distribution above + grad_var_mapping_items = self.grad_var_mapping.items() + if not slice_var_up: + np.random.shuffle(grad_var_mapping_items) + + for orig_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) + + if not slice_var_up: + assert (len(splited_vars) == 1) + if len(splited_vars) == 1: orig_varname = splited_vars[0].name index = find_op_by_output_arg(program.global_block(), @@ -285,7 +317,7 @@ class DistributeTranspiler: program.global_block().insert_op( index=index + 1, - type="send_vars", + type="send", inputs={"X": splited_vars}, outputs={}, attrs={ @@ -316,6 +348,7 @@ class DistributeTranspiler: for i, ep in enumerate(eplist): self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + # step4: Concat the parameters splits together after recv. for varname, splited_var in self.param_var_mapping.iteritems(): eps = [] @@ -482,35 +515,38 @@ class DistributeTranspiler: grad_to_block_id, None) # process distributed lookup_table - prefetch_block = None + prefetch_var_name_to_block_id = [] if self.has_distributed_lookup_table: pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) - prefetch_block = self._create_prefetch_block( + prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place if self.has_distributed_lookup_table: - assert prefetch_block is not None + assert len(prefetch_var_name_to_block_id) > 0 else: - assert prefetch_block is None - prefetch_block = pserver_program.global_block() + assert len(prefetch_var_name_to_block_id) == 0 + + attrs = { + "OptimizeBlock": pserver_program.block(1), + "endpoint": endpoint, + "Fanin": self.trainer_num, + "sync_mode": self.sync_mode, + "grad_to_block_id": grad_to_block_id + } + if len(prefetch_var_name_to_block_id) > 0: + attrs['prefetch_var_name_to_block_id'] \ + = prefetch_var_name_to_block_id # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", inputs={'X': recv_inputs}, outputs={}, - attrs={ - "OptimizeBlock": pserver_program.block(1), - "endpoint": endpoint, - "Fanin": self.trainer_num, - "PrefetchBlock": prefetch_block, - "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id - }) + attrs=attrs) pserver_program.sync_with_cpp() return pserver_program @@ -575,8 +611,15 @@ class DistributeTranspiler: def _replace_lookup_table_op_with_prefetch(self, program, pserver_endpoints): # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op - self.prefetch_input_vars = None - self.prefetch_output_vars = None + # self.all_prefetch_input_vars = + # [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1] + # [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]] + self.all_prefetch_input_vars = [] + + # self.all_prefetch_input_vars = + # [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1] + # [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]] + self.all_prefetch_output_vars = [] continue_search_lookup_table_op = True while continue_search_lookup_table_op: @@ -586,26 +629,27 @@ class DistributeTranspiler: if op.type == LOOKUP_TABLE_TYPE: continue_search_lookup_table_op = True - op_index = list(all_ops).index(op) + lookup_table_op_index = list(all_ops).index(op) ids_name = op.input("Ids") out_name = op.output("Out") - if self.prefetch_input_vars is None: - ids_var = program.global_block().vars[ids_name[0]] - self.prefetch_input_vars = self.create_splited_vars( - source_var=ids_var, - block=program.global_block(), - tag="_prefetch_in_") - if self.prefetch_output_vars is None: - out_var = program.global_block().vars[out_name[0]] - self.prefetch_output_vars = self.create_splited_vars( - source_var=out_var, - block=program.global_block(), - tag="_prefetch_out_") + ids_var = program.global_block().vars[ids_name[0]] + prefetch_input_vars = self.create_splited_vars( + source_var=ids_var, + block=program.global_block(), + tag="_prefetch_in_") + self.all_prefetch_input_vars.append(prefetch_input_vars) + + out_var = program.global_block().vars[out_name[0]] + prefetch_output_vars = self.create_splited_vars( + source_var=out_var, + block=program.global_block(), + tag="_prefetch_out_") + self.all_prefetch_output_vars.append(prefetch_output_vars) # insert split_ids_op program.global_block().insert_op( - index=op_index, + index=lookup_table_op_index, type="split_ids", inputs={ 'Ids': [ @@ -613,14 +657,14 @@ class DistributeTranspiler: for varname in ids_name ] }, - outputs={"Out": self.prefetch_input_vars}) + outputs={"Out": prefetch_input_vars}) # insert prefetch_op program.global_block().insert_op( - index=op_index + 1, + index=lookup_table_op_index + 1, type="prefetch", - inputs={'X': self.prefetch_input_vars}, - outputs={"Out": self.prefetch_output_vars}, + inputs={'X': prefetch_input_vars}, + outputs={"Out": prefetch_output_vars}, attrs={ "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE @@ -628,16 +672,21 @@ class DistributeTranspiler: # insert concat_op program.global_block().insert_op( - index=op_index + 2, - type="concat", - inputs={'X': self.prefetch_output_vars}, + index=lookup_table_op_index + 2, + type="merge_ids", + inputs={ + 'Ids': [ + program.global_block().vars[varname] + for varname in ids_name + ], + 'X': prefetch_output_vars + }, outputs={ "Out": [ program.global_block().vars[varname] for varname in out_name ] - }, - attrs={"axis": 0}) + }) # delete lookup_table_op delete_ops(program.global_block(), [op]) @@ -645,7 +694,7 @@ class DistributeTranspiler: break def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): - # 2. add split_ids_op and send_vars_op to send gradient to pservers + # 2. add split_ids_op and send_op to send gradient to pservers # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -662,11 +711,11 @@ class DistributeTranspiler: outputs={"Out": self.trainer_side_table_grad_list}) program.global_block().insert_op( index=op_index + 2, - type="send_vars", + type="send", inputs={'X': self.trainer_side_table_grad_list}, outputs={}, attrs={ - "sync_send": True, + "sync_mode": True, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) @@ -676,30 +725,34 @@ class DistributeTranspiler: optimize_block): # STEP: create prefetch block table_var = pserver_program.global_block().vars[self.table_name] - prefetch_block = pserver_program.create_block(optimize_block.idx) - trainer_ids = self.prefetch_input_vars[pserver_index] - pserver_ids = pserver_program.global_block().create_var( - name=trainer_ids.name, - type=trainer_ids.type, - shape=trainer_ids.shape, - dtype=trainer_ids.dtype) - trainer_out = self.prefetch_output_vars[pserver_index] - pserver_out = pserver_program.global_block().create_var( - name=trainer_out.name, - type=trainer_out.type, - shape=trainer_out.shape, - dtype=trainer_out.dtype) - prefetch_block.append_op( - type="lookup_sparse_table", - inputs={'Ids': pserver_ids, - "W": table_var}, - outputs={"Out": pserver_out}, - attrs={ - "is_sparse": True, # has no effect on lookup_table op - "is_distributed": True, - "padding_idx": -1 - }) - return prefetch_block + prefetch_var_name_to_block_id = [] + for index in range(len(self.all_prefetch_input_vars)): + prefetch_block = pserver_program.create_block(optimize_block.idx) + trainer_ids = self.all_prefetch_input_vars[index][pserver_index] + pserver_ids = pserver_program.global_block().create_var( + name=trainer_ids.name, + type=trainer_ids.type, + shape=trainer_ids.shape, + dtype=trainer_ids.dtype) + trainer_out = self.all_prefetch_output_vars[index][pserver_index] + pserver_out = pserver_program.global_block().create_var( + name=trainer_out.name, + type=trainer_out.type, + shape=trainer_out.shape, + dtype=trainer_out.dtype) + prefetch_block.append_op( + type="lookup_sparse_table", + inputs={'Ids': pserver_ids, + "W": table_var}, + outputs={"Out": pserver_out}, + attrs={ + "is_sparse": True, # has no effect on lookup_table op + "is_distributed": True, + "padding_idx": -1 + }) + prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str( + prefetch_block.idx)) + return prefetch_var_name_to_block_id def _create_table_optimize_block(self, pserver_index, pserver_program, pre_block_idx, grad_to_block_id): @@ -788,8 +841,8 @@ class DistributeTranspiler: program (ProgramDesc): ProgramDesc which gradients blong. block_list (list[(varname, block_id, block_size)]): List of gradient blocks. add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True. - Returns: - var_mapping (dict(varname->[new_varname_variable])):A dict mapping + Returns: + var_mapping (dict(varname->[new_varname_variable])):A dict mapping from original var name to each var split. """ @@ -802,6 +855,9 @@ class DistributeTranspiler: if not block_map.has_key(varname): block_map[varname] = [] block_map[varname].append((long(offset), long(size))) + # Do not remove this important debug message: + print("block map: %s" % block_map) + for varname, splited in block_map.iteritems(): orig_var = program.global_block().var(varname) if len(splited) == 1: diff --git a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py deleted file mode 100644 index ea8c27cdca885dbbf90349b35df9691951264061..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ..framework import Program, default_main_program, Parameter, Variable -from ..layer_helper import LayerHelper - - -def hash_name_to_server(params_grads, pserver_endpoints): - """ - :param param_grads: - :return: a map of pserver endpoint -> - params -> [param list] - grads -> [grad list] - """ - - def _hash_param(param_name, total): - return hash(param_name) % total - - param_grad_map = dict() - for param, grad in params_grads: - if param.trainable is True and grad is not None: - server_id = _hash_param(param.name, len(pserver_endpoints)) - server_for_param = pserver_endpoints[server_id] - if not param_grad_map.has_key(server_for_param): - param_grad_map[server_for_param] = {"params": [], "grads": []} - param_grad_map[server_for_param]["params"].append(param) - param_grad_map[server_for_param]["grads"].append(grad) - - return param_grad_map - - -def round_robin(params_grads, pserver_endpoints): - assert (len(params_grads) > len(pserver_endpoints)) - - param_grad_map = dict() - pserver_idx = 0 - for param, grad in params_grads: - if param.trainable is True: - server_for_param = pserver_endpoints[pserver_idx] - if not param_grad_map.has_key(server_for_param): - param_grad_map[server_for_param] = {"params": [], "grads": []} - - param_grad_map[server_for_param]["params"].append(param) - param_grad_map[server_for_param]["grads"].append(grad) - - pserver_idx += 1 - if pserver_idx >= len(pserver_endpoints): - pserver_idx = 0 - return param_grad_map - - -class SimpleDistributeTranspiler: - def transpile(self, - optimize_ops, - params_grads, - program=None, - pservers="127.0.0.1:6174", - trainers=1, - split_method=round_robin): - """ - Transpile the program to a distributed data-parallelism programs. - - The main_program will be transform to use a remote parameter server - to do parameter optimization. And the optimization graph will be put - in to a parameter server program. - - Use different methods to split trainable varialbles to different - parameter servers. - - Example to run: - - exe = fluid.Executor(place) - t = fluid.DistributeTranspiler() - t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1) - - pserver_endpoint = os.getenv("PSERVER") - if pserver_endpoint: - pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops) - exe.run(fluid.default_startup_program()) - exe.run(pserver_prog) - else: - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - ... - - :param optimize_ops: op list of optimization, should be the - return value of Optimizer.minimize - :type optimize_ops: list - :param program: program to optimize, default default_main_program - :param pservers: parameter server endpoints like "m1:6174,m2:6174" - :type pservers: string - - :return: return a list of programs - """ - if program is None: - program = default_main_program() - self.program = program - self.trainers = trainers - self.optimize_ops = optimize_ops - self._optimize_distributed( - optimize_ops, - program, - params_grads, - pservers=pservers, - trainers=trainers, - split_method=split_method) - - def _clone_param(self, block, v): - assert isinstance(v, Parameter) - new_p = Parameter( - block=block, - shape=v.shape, - dtype=v.dtype, - type=v.type, - lod_level=v.lod_level, - stop_gradient=v.stop_gradient, - trainable=v.trainable, - optimize_attr=v.optimize_attr, - regularizer=v.regularizer, - name=v.name) - block.vars[new_p.name] = new_p - - def _clone_var(self, block, var): - assert isinstance(var, Variable) - return block.create_var( - name=var.name, - shape=var.shape, - dtype=var.dtype, - type=var.type, - lod_level=var.lod_level, - persistable=var.persistable) - - def _optimize_distributed(self, optimize_ops, program, params_and_grads, - **kwargs): - if kwargs.has_key("split_method"): - split_method = kwargs["split_method"] - else: - split_method = round_robin - - assert (callable(split_method)) - pserver_endpoints = kwargs["pservers"].split(",") - self.param_grad_map = split_method(params_and_grads, pserver_endpoints) - - send_op_ordered_inputs = [] - send_op_ordered_outputs = [] - epmap = [] - for ep, v in self.param_grad_map.iteritems(): - send_op_ordered_inputs.extend(v["grads"]) - send_op_ordered_outputs.extend(v["params"]) - for i in v["grads"]: - epmap.append(ep) - send_op = program.global_block().append_op( - type="send", - inputs={"X": send_op_ordered_inputs - }, # inputs is a list of tensors to be send - outputs={"Out": send_op_ordered_outputs}, - attrs={"endpoints": pserver_endpoints, - "epmap": epmap}) - - def get_trainer_program(self): - # remove optimize ops and add a send op to main_program - self.program.global_block().delete_ops(self.optimize_ops) - return self.program - - def _create_var_for_trainers(self, block, var, trainers): - var_list = [] - for i in xrange(trainers): - var_each = block.create_var( - name="%s.trainer_%d" % (var.name, i), - psersistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - var_list.append(var_each) - return var_list - - def get_pserver_program(self, endpoint, optimize_ops): - pserver_program = Program() - for v in self.param_grad_map[endpoint]["params"]: - self._clone_param(pserver_program.global_block(), v) - - optimize_sub_program = Program() - grad_var_names = [ - var.name for var in self.param_grad_map[endpoint]["grads"] - ] - for opt_op in optimize_ops: - for _, var in opt_op.inputs.iteritems(): - # NOTE: append operators to merge gradients from multiple - # trainers. If trainers == 1, this is not needed. - if self.trainers > 1 and var.name in grad_var_names: - vars2merge = self._create_var_for_trainers( - optimize_sub_program.global_block(), var, self.trainers) - merged_var = optimize_sub_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - optimize_sub_program.global_block().append_op( - type="sum", - inputs={"X": vars2merge}, - outputs={"Out": merged_var}) - optimize_sub_program.global_block().append_op( - type="scale", - inputs={"X": merged_var}, - outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainers)}) - else: - optimize_sub_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - - if opt_op.inputs.has_key("Grad"): - if opt_op.inputs["Grad"].name in grad_var_names: - optimize_sub_program.global_block().append_op( - type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, - attrs=opt_op.attrs) - else: - optimize_sub_program.global_block().append_op( - type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, - attrs=opt_op.attrs) - pserver_program.global_block().append_op( - type="recv", - inputs={"RX": - self.param_grad_map[endpoint]["grads"]}, # grads to recv - outputs={}, - attrs={ - "OptimizeBlock": optimize_sub_program.global_block(), - "endpoint": endpoint, - "ParamList": - [p.name for p in self.param_grad_map[endpoint]["params"]], - "GradList": - [p.name for p in self.param_grad_map[endpoint]["grads"]], - "Trainers": self.trainers - }) - pserver_program.sync_with_cpp() - return pserver_program diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index e6f87ce61b1d16d4f98f111626776aa52c2ec35b..4e3beaf639bad9fed2862a5477095b66ef4b9aee 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -240,14 +240,15 @@ class ExtraLayerAttribute(object): :type error_clipping_threshold: float :param drop_rate: Dropout rate. Dropout will create a mask on layer output. The dropout rate is the zero rate of this mask. The - details of what dropout is please refer to `here - `_. + details of what dropout is please refer to `JMLRdropout + `_. :type drop_rate: float :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU. - The details allocation in parallel_nn please refer to `here - `_. + The details allocation in parallel_nn please refer to `use_case + `_. :type device: int """ diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index ebc31b23e0f5504b4bebccabe996b054c7fbce3b..e6a03759ef431086390e217eabcdff47e610346c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2556,7 +2556,7 @@ def img_conv_layer(input, the output will be obtained by concatenating the two results. The details of grouped convolution, please refer to: - `ImageNet Classification with Deep Convolutional Neural Networks + `ImageNet Classification With Deep Convolutional Neural Networks `_ The example usage is: @@ -5678,8 +5678,8 @@ def warp_ctc_layer(input, `_ library, which is used in `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin `_, to compute Connectionist Temporal - Classification (CTC) loss. Besides, another `warp-ctc - `_ repository, which is forked from + Classification (CTC) loss. Besides, another `warp-ctc repository + `_ , which is forked from the official one, is maintained to enable more compiling options. During the building process, PaddlePaddle will clone the source codes, build and install it to :code:`third_party/install/warpctc` directory. diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 7bdddeaabec733ef26b3f766c6437f5c53d65044..357a4e9b000ea81afe291ff39dde2bed5c67e619 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -119,7 +119,8 @@ def reader_creator(data_file, yield sample, int(label) - 1 if use_xmap: - return xmap_readers(mapper, reader, cpu_count(), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader) diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644 --- a/python/paddle/v2/minibatch.py +++ b/python/paddle/v2/minibatch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader. diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0d20b6487c61e7d1bde93acf4a14b7a89083a16d --- /dev/null +++ b/tools/codestyle/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py index 48100e5bf989520043b5ca372b02883faea8a9fd..54a690462699651d3e14f9b24383df01a9740336 100644 --- a/tools/codestyle/docstring_checker.py +++ b/tools/codestyle/docstring_checker.py @@ -126,9 +126,10 @@ class DocstringChecker(BaseChecker): 'W9002': ('Doc string does not end with "." period', symbol + "-end-with", 'Used when a doc string does not end with a period'), - 'W9003': ('All args with their types must be mentioned in doc string', - symbol + "-with-all-args", - 'Used when not all arguments are in the doc string '), + 'W9003': + ('All args with their types must be mentioned in doc string %s', + symbol + "-with-all-args", + 'Used when not all arguments are in the doc string '), 'W9005': ('Missing docstring or docstring is too short', symbol + "-missing", 'Add docstring longer >=10'), 'W9006': ('Docstring indent error, use 4 space for indent', @@ -178,6 +179,8 @@ class DocstringChecker(BaseChecker): self.indent_style(node) def missing_doc_string(self, node): + if node.name.startswith("__") or node.name.startswith("_"): + return True if node.tolineno - node.fromlineno <= 10: return True @@ -199,12 +202,16 @@ class DocstringChecker(BaseChecker): doc = node.doc lines = doc.splitlines() + line_num = 0 for l in lines: + if line_num == 0: + continue cur_indent = len(l) - len(l.lstrip()) if cur_indent % indent != 0: self.add_message('W9006', node=node, line=node.fromlineno) return False + line_num += 1 return True @@ -320,15 +327,19 @@ class DocstringChecker(BaseChecker): return True parsed_args = doc.args + args_not_documented = set(args) - set(parsed_args) if len(args) > 0 and len(parsed_args) <= 0: - print "debug:parsed args: ", parsed_args - self.add_message('W9003', node=node, line=node.fromlineno) + self.add_message( + 'W9003', + node=node, + line=node.fromlineno, + args=list(args_not_documented)) return False for t in args: if t not in parsed_args: - print t, " with (type) not in ", parsed_args - self.add_message('W9003', node=node, line=node.fromlineno) + self.add_message( + 'W9003', node=node, line=node.fromlineno, args=[t, ]) return False return True diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook index e7c92ba671e0eb778b2ab5447bea7c4b14fe761b..150a3f5666bd39d30b7e6518e58a14fb5fe2f14b 100755 --- a/tools/codestyle/pylint_pre_commit.hook +++ b/tools/codestyle/pylint_pre_commit.hook @@ -7,13 +7,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export PYTHONPATH=$DIR:$PYTHONPATH # The trick to remove deleted files: https://stackoverflow.com/a/2413151 -for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do +for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do pylint --disable=all --load-plugins=docstring_checker \ --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done -#exit $TOTAL_ERRORS +exit $TOTAL_ERRORS #For now, just warning: -exit 0 +#exit 0