diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index 8298fcf95a5074bce9533e04d54dab79a1460286..b9eaca5ee6b487bb37bb954b3c606c3096d37aeb 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -19,4 +19,4 @@ ADD *.whl / RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py dataset.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 33d2228ca5f65d104360e22bc281fad2d3dd9d0e..f40f3c129741f9b6e3654399a9110b065fec7d6c 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -44,6 +44,16 @@ Currently supported `--model` argument include: PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 ``` +## Prepare the RecordIO file to Achieve Better Performance + +Run the following command will generate RecordIO files like "mnist.recordio" under the path +and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size +at any time using `fluid.batch`. 
+ +```bash +python -c 'from recordio_converter import *; prepare_mnist("data", 1)' +``` + ## Run Distributed Benchmark on Kubernetes Cluster You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 49f26255f315c3c368f42b367dfc6487ffa0deb5..62a05234c45ee4fe1dc21f5a74efc269227154db 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -38,10 +38,12 @@ def parse_args(): default='resnet', help='The model to run benchmark with.') parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') + '--batch_size', + type=int, + default=32, + help='The batch size on each gpu.') parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') - # TODO(wuyi): add "--use_fake_data" option back. parser.add_argument( '--skip_batch_num', type=int, @@ -49,7 +51,10 @@ def parse_args(): help='The first num of minibatch num to skip, for better performance test' ) parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') + '--iterations', + type=int, + default=80, + help='The number of minibatches, set to -1 to run all batches.') parser.add_argument( '--pass_num', type=int, default=100, help='The number of passes.') parser.add_argument( @@ -69,6 +74,7 @@ def parse_args(): type=int, default=1, help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. 
parser.add_argument( '--cpus', type=int, @@ -78,7 +84,7 @@ def parse_args(): '--data_set', type=str, default='flowers', - choices=['cifar10', 'flowers'], + choices=['cifar10', 'flowers', 'imagenet'], help='Optional dataset for benchmark.') parser.add_argument( '--infer_only', action='store_true', help='If set, run forward only.') @@ -108,6 +114,16 @@ def parse_args(): default='local', choices=['local', 'pserver', 'nccl2'], help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') args = parser.parse_args() return args @@ -210,26 +226,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(startup_prog) - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) + + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - for batch_id, data in enumerate(train_reader()): + if not args.use_reader_op: + reader_generator = train_reader() + batch_id = 0 + data = None + while True: + if not args.use_reader_op: + data = next(reader_generator, None) + if data == None: + break + if iters == args.iterations: + break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if iters == args.iterations: - break - loss = exe.run(train_prog, - feed=feeder.feed(data), - fetch_list=[avg_loss]) + + if 
args.use_reader_op: + try: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + break + else: + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) iters += 1 - num_samples += len(data) + batch_id += 1 + # FIXME(wuyi): For use_reader_op, if the current + # pass is not the last, the last batch of this pass + # is also equal to args.batch_size. + if args.use_reader_op: + num_samples += args.batch_size * args.gpus + else: + num_samples += len(data) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % (pass_id, iters, np.mean(train_losses))) @@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + # generate fake: if args.use_fake_data: for var in feed_var_list: @@ -270,7 +314,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, "value": 1.0, "dtype": var.dtype}) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) @@ -287,12 +330,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_trainers=num_trainers, trainer_id=trainer_id) - feeder = fluid.DataFeeder(feed_var_list, place) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 start_time = time.time() - for batch_id, data in enumerate(train_reader()): + if not args.use_reader_op: + 
reader_generator = train_reader() + batch_id = 0 + data = None + while True: + if not args.use_reader_op: + data = next(reader_generator, None) + if data == None: + break + if iters == args.iterations: + break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") elif args.profile and pass_id == 0 and batch_id == 10: @@ -301,19 +353,25 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if iters == args.iterations: - break - if args.use_fake_data: - loss, = exe.run([avg_loss.name]) + if args.use_fake_data or args.use_reader_op: + try: + loss, = exe.run([avg_loss.name]) + except fluid.core.EnforceNotMet as ex: + break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": exe.bcast_params() - num_samples += len(data) + if args.use_reader_op: + num_samples += args.batch_size * args.gpus + else: + num_samples += len(data) iters += 1 if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) + batch_id += 1 + print_train_time(start_time, time.time(), num_samples) if not args.no_test and batch_acc: test_acc = test(startup_exe, infer_prog, test_reader, feeder, diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd27b21f83afae10b1d24833b81d57eb..69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): + if args.use_reader_op: + raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 decoder_size = 512 @@ -221,7 +223,7 @@ def get_model(args): train_batch_generator = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=1000), - 
batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 28a38a931cf6cfcd5dd858b363b3d29b70368315..8e740dc6896b7eeeb82170aa13d32987c4df5c48 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -20,6 +20,7 @@ import numpy as np import argparse import time import cProfile +import os import paddle import paddle.fluid as fluid @@ -65,9 +66,24 @@ def cnn_model(data): def get_model(args): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1, 1, 28, 28], (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') if args.device == 'CPU' and args.cpus > 1: places = fluid.layers.get_places(args.cpus) @@ -103,7 +119,7 @@ def get_model(args): # Reader train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) + paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=args.batch_size) return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index f951f73a35dc4dc6f796178ebbc3e2886b2d7d8c..2ee2b5be09bfcc2e7fcec7eb2f80e28e4e75ab3d 100644 --- 
a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -19,6 +19,7 @@ from __future__ import print_function import functools import numpy as np import time +import os import cProfile, pstats, StringIO @@ -26,6 +27,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler +from recordio_converter import imagenet_train, imagenet_test def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): @@ -122,16 +124,48 @@ def get_model(args): else: dshape = [32, 32, 3] model = resnet_cifar10 - else: + train_reader = paddle.dataset.cifar.train10() + test_reader = paddle.dataset.cifar.test10() + elif args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] model = resnet_imagenet - - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + train_reader = paddle.dataset.flowers.train() + test_reader = paddle.dataset.flowers.test() + elif args.data_set == "imagenet": + class_dim = 1000 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + if not args.data_path: + raise Exception( + "Must specify --data_path when training with imagenet") + train_reader = imagenet_train(args.data_path) + test_reader = imagenet_test(args.data_path) + + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + dshape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + input, label = fluid.layers.read_file(data_file) + else: + input = fluid.layers.data(name='data', shape=dshape, 
dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') if args.device == 'CPU' and args.cpus > 1: places = fluid.layers.get_places(args.cpus) @@ -162,15 +196,10 @@ def get_model(args): optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - train_reader = paddle.batch( + batched_train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) - - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc + train_reader, buf_size=5120), + batch_size=args.batch_size * args.gpus) + batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 1b680d76a8ba1ead7c8c50065e1817c45b951b27..e1c4857f1a365f6480929ea57296a9801f5ea945 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size): def get_model(args): + if args.use_reader_op: + raise Exception( + "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 emb_dim = 512 crop_size = 1500 @@ -114,7 +117,7 @@ def get_model(args): train_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 
53856c5f7acd3a4e1476ec57154a880bb6f984c9..6092cdeb884b3a9b60a3bcf20b022f2b0685e6aa 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import argparse import functools +import os def vgg16_bn_drop(input): @@ -65,9 +66,24 @@ def get_model(args): else: data_shape = [224, 224, 3] - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + data_shape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus, + pass_num=args.pass_num) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program net = vgg16_bn_drop(images) @@ -95,7 +111,7 @@ def get_model(args): paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6 --- /dev/null +++ b/benchmark/fluid/recordio_converter.py @@ -0,0 +1,164 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.dataset import mnist, cifar, flowers, image + + +def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data, + shape_label): + num_batches = 0 + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(py_reader(), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=shape_data), + fluid.layers.data( + name='label', shape=shape_label, dtype='int64'), + ], + place=fluid.CPUPlace()) + num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + outfilepath, reader, feeder) + return num_batches + + +def prepare_mnist(outpath, batch_size): + outfilepath = os.path.join(outpath, "mnist.recordio") + convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1]) + + +def prepare_cifar10(outpath, batch_size): + outfilepath = os.path.join(outpath, "cifar.recordio") + convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1]) + + +def prepare_flowers(outpath, batch_size): + outfilepath = os.path.join(outpath, "flowers.recordio") + convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224], + [1]) + + +def default_mapper(sample): + img, label = sample + img = image.simple_transform( + img, 256, 224, True, mean=[103.94, 116.78, 123.68]) + return 
img.flatten().astype('float32'), label + + +def imagenet_train(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "train.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + # shuffle all, this is slow + random.shuffle(imgfilelist) + + def train_reader(): + for idx, imgfile in enumerate(imgfilelist): + data = image.load_image( + os.path.join(data_dir, "train", imgfile.lower())) + label = [img2label[imgfile], ] + yield [data, label] + + return paddle.reader.map_readers(default_mapper, train_reader) + + +def imagenet_test(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "val.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + + def test_reader(): + for idx, imgfile in enumerate(imgfilelist): + base_path = os.path.join(data_dir, "val", imgfile.split(".")[0]) + image_path = ".".join([base_path, "jpeg"]) + data = image.load_image(image_path) + label = [img2label[imgfile], ] + yield [data, label] + + return paddle.reader.map_readers(default_mapper, test_reader) + + +# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + 
assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with fluid.recordio_writer.create_recordio_writer( + filename, compressor, max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + print("written file: ", filename) + return counter + + +def prepare_imagenet(inpath, outpath, batch_size): + r = paddle.batch(imagenet_train(inpath), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name="image", shape=[3, 224, 224]), fluid.layers.data( + name="label", shape=[1], dtype='int64') + ], + place=fluid.CPUPlace()) + outpath = os.path.join(outpath, "imagenet.recordio") + convert_reader_to_recordio_files(outpath, 10000, r, feeder) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 682614742cf1bd3130c638020a2545e16226d4d6..4158d0528a1aea52c2a3f0880fe1000183a9df53 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -92,6 +92,9 @@ if(WITH_GPU) if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") endif() + if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) + message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") + endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() elseif(WITH_AMD_GPU) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 1e3bb7bf16f969255dba6f6ec7a6a70bbb1e07ee..f279020e9334323ebdf3125a8833044cd9eccae5 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library") set(inference_deps paddle_inference_api paddle_fluid_api) # if 
anakin is set enable anakin api implementation -if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) +if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY) set(ANAKIN_FOUND ON) else() set(ANAKIN_FOUND OFF) endif() +function(fetch_include_recursively root_dir) + if (IS_DIRECTORY ${root_dir}) + include_directories(${root_dir}) + endif() + + file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) + foreach(sub ${ALL_SUB}) + if (IS_DIRECTORY ${root_dir}/${sub}) + fetch_include_recursively(${root_dir}/${sub}) + endif() + endforeach() +endfunction() + if (ANAKIN_FOUND) # Anakin's code style doesn't follow google c style. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment - -Wno-error=reorder - -Wno-error=format - -Wno-error=switch - -Wno-error=return-type - -Wno-error=non-virtual-dtor - -Wno-error=cpp") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp") message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - include_directories("${ANAKIN_INCLUDE}") - # Anakin's source path is a mass, need to set sub-directories trivially. 
- include_directories("${ANAKIN_INCLUDE}/saber") - link_directories("${ANAKIN_LIBRARY}") + fetch_include_recursively(${ANAKIN_INCLUDE}) + + link_directories(${ANAKIN_LIBRARY}) - nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc) - target_link_libraries(inference_anakin_api anakin) + nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + target_link_libraries(inference_anakin_api anakin anakin_saber_common) list(APPEND inference_deps inference_anakin_api) endif() @@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api - SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) cc_test(test_paddle_inference_api @@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) if (ANAKIN_FOUND) - nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc - DEPS ${inference_deps} protobuf) + cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + DEPS ${inference_deps}) endif() if(WITH_TESTING) diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 9b4843f714f11484860056711fd223edc8a5d037..192a6414260ce06048b8c765402d89882cabc51b 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include #include #include +#include #include "paddle/contrib/inference/paddle_inference_api.h" - namespace paddle { namespace demo { @@ -61,13 +61,67 @@ void Main(bool use_gpu) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << static_cast(outputs.front().data.data)[i]; } + // TODO(Superjomn): this is should be free automatically + free(outputs[0].data.data); + } +} + +void MainThreads(int num_threads, bool use_gpu) { + // Multi-threads only support on CPU + // 0. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto main_predictor = + CreatePaddlePredictor(config); + + std::vector threads; + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // 1. clone a predictor which shares the same parameters + auto predictor = main_predictor->Clone(); + constexpr int num_batches = 3; + for (int batch_id = 0; batch_id < num_batches; ++batch_id) { + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = buf, + .dtype = PaddleDType::INT64}; + std::vector inputs(4, tensor); + std::vector outputs; + // 3. Run + CHECK(predictor->Run(inputs, &outputs)); + + // 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "TID: " << tid << ", " + << "output buffer size: " << outputs.front().data.length; + const size_t num_elements = outputs.front().data.length / sizeof(float); + // The outputs' buffers are in CPU memory. 
+ for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data)[i]; + } + free(outputs[0].data.data); + } + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); } } TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } #ifdef PADDLE_WITH_CUDA TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } #endif } // namespace demo diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index c4588cf04030b9627dbe9b40c1bb04d1e782ebba..77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config { // Similarly, each engine kind should map to a unique predictor implementation. template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc index 865d7ac10db55ce9565f4b1a35defa2a3d1d40ef..ea7781f691da81befd5d11c226c35e1da79baaaa 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc @@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( } bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { - // TODO(Superjomn) Tell anakin to support return code. 
- engine_.Build(config.model_file, config.max_batch_size); + if (!(graph_.load(config.model_file))) { + return false; + } + graph_.ResetBatchSize("input_0", config.max_batch_size); + // optimization for graph + if (!(graph_.Optimize())) { + return false; + } + // construct executer + executor_.init(graph_); return true; } @@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run( << "'s type is not float"; return false; } - engine_.SetInputFromCPU( - input.name, static_cast(input.data.data), input.data.length); + auto d_tensor_in_p = executor_.get_in(input.name); + float *d_data_p = d_tensor_in_p->mutable_data(); + if (cudaMemcpy(d_data_p, + static_cast(input.data.data), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + LOG(ERROR) << "copy data from CPU to GPU error"; + return false; + } } - // TODO(Superjomn) Tell anakin to support return code. - engine_.Execute(); + executor_.prediction(); if (output_data->empty()) { LOG(ERROR) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { - auto *tensor = engine_.GetOutputInGPU(output.name); + auto *tensor = executor_.get_out(output.name); output.shape = tensor->shape(); // Copy data from GPU -> CPU if (cudaMemcpy(output.data.data, - tensor->data(), - tensor->size(), + tensor->mutable_data(), + tensor->valid_size() * sizeof(float), cudaMemcpyDeviceToHost) != 0) { LOG(ERROR) << "copy data from GPU to CPU error"; return false; @@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run( return true; } -// TODO(Superjomn) To implement latter. 
+anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return executor_; +} + +// the cloned new Predictor of anakin share the same net weights from original +// Predictor std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { - return nullptr; + VLOG(3) << "Anakin Predictor::clone"; + std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + // construct executer from other graph + auto anakin_predictor_p = + dynamic_cast(cls.get()); + if (!anakin_predictor_p) { + LOG(ERROR) << "fail to call Init"; + return nullptr; + } + anakin_predictor_p->get_executer().init(graph_); + + return std::move(cls); } // A factory to help create difference predictor. @@ -74,6 +105,7 @@ template <> std::unique_ptr CreatePaddlePredictor( const AnakinConfig &config) { + VLOG(3) << "Anakin Predictor create."; std::unique_ptr x( new PaddleInferenceAnakinPredictor(config)); return x; diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h index fe9f562e9d1d40c30585bcb68fa51e445bedb4aa..181784cbdf91fe2f50e20f4d447448a42a18d301 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h @@ -20,32 +20,42 @@ limitations under the License. */ #pragma once // NOTE This header file do not have namespace. -// TODO(Superjomn) Tell Anakin to provide better APIs. -#include +//#include #include "paddle/contrib/inference/paddle_inference_api.h" +#include "framework/core/net/net.h" +#include "saber/saber_types.h" + namespace paddle { class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: + PaddleInferenceAnakinPredictor() {} + PaddleInferenceAnakinPredictor(const AnakinConfig& config); // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. - // TODO(Superjomn) should unify all the behaviors of output_data accross all - // the engines. 
bool Run(const std::vector& inputs, std::vector* output_data) override; std::unique_ptr Clone() override; + anakin::Net& + get_executer(); + + ~PaddleInferenceAnakinPredictor() override{}; + private: bool Init(const AnakinConfig& config); - anakin::AnakinEngine - engine_; + graph_; + anakin::Net + executor_; + AnakinConfig config_; }; } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc index 43324bc67cba16c36d9dbcb58ccde1c57293085e..47b9c6fa285b623d2b08f45917cb3474dbc2ab83 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc @@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/contrib/inference/paddle_inference_api.h" +#include #include +#include "gflags/gflags.h" +#include "paddle/contrib/inference/paddle_inference_api.h" + namespace paddle { -TEST(inference, anakin) { +AnakinConfig GetConfig() { AnakinConfig config; + config.model_file = "./mobilenet_v2.anakin.bin"; + config.device = 0; + config.max_batch_size = 1; + return config; +} - auto engine = +TEST(inference, anakin) { + AnakinConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor(config); + + float data[1 * 3 * 224 * 224] = {1.0f}; + + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "input_0", + .shape = std::vector({1, 3, 224, 224}), + .data = buf, + .dtype = PaddleDType::FLOAT32}; + + // For simplicity, we set all the slots with the same data. 
+ std::vector paddle_tensor_feeds(1, tensor); + + float data_out[1000]; + + PaddleBuf buf_out{.data = data_out, .length = sizeof(data)}; + PaddleTensor tensor_out{.name = "prob_out", + .shape = std::vector({1000, 1}), + .data = buf_out, + .dtype = PaddleDType::FLOAT32}; + + std::vector outputs(1, tensor_out); + + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); + + float* data_o = static_cast(outputs[0].data.data); + for (size_t j = 0; j < 1000; ++j) { + LOG(INFO) << "output[" << j << "]: " << data_o[j]; + } } } // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 1f960677163988be6f4c502738861bf86588f406..4b6cb7b051d1ad2c63e895017c7faf1245c22612 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include + #include "gflags/gflags.h" #include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -45,14 +47,19 @@ NativeConfig GetConfig() { config.model_dir = FLAGS_dirname + "word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; +#ifdef PADDLE_WITH_CUDA config.use_gpu = true; +#else + config.use_gpu = false; +#endif config.device = 0; return config; } -TEST(paddle_inference_api_impl, word2vec) { +void MainWord2Vec(bool use_gpu) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor(config); + config.use_gpu = use_gpu; framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -100,11 +107,12 @@ TEST(paddle_inference_api_impl, word2vec) { free(outputs[0].data.data); } -TEST(paddle_inference_api_impl, image_classification) { +void MainImageClassification(bool use_gpu) { int batch_size = 2; bool use_mkldnn = false; bool repeat 
= false; NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -149,4 +157,143 @@ TEST(paddle_inference_api_impl, image_classification) { free(data); } +void MainThreadsWord2Vec(bool use_gpu) { + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + auto main_predictor = CreatePaddlePredictor(config); + + // prepare inputs data and reference results + constexpr int num_jobs = 3; + std::vector> jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // each job has 4 words + jobs[i].resize(4); + for (size_t j = 0; j < 4; ++j) { + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + SetupLoDTensor(&jobs[i][j], lod, static_cast(0), dict_size - 1); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j])); + } + + // get reference result of each job + std::vector ref_feeds; + std::vector ref_fetches(1, &refs[i]); + for (auto& word : jobs[i]) { + ref_feeds.push_back(&word); + } + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs range + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast(local_outputs[0].data.data); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + // check outputs correctness + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); + for (int i = 0; i < refs[tid].numel(); ++i) { + 
EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +void MainThreadsImageClassification(bool use_gpu) { + constexpr int num_jobs = 4; // each job run 1 batch + constexpr int batch_size = 1; + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + config.model_dir = + FLAGS_dirname + "image_classification_resnet.inference.model"; + + auto main_predictor = CreatePaddlePredictor(config); + std::vector jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // prepare inputs + std::vector> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor(&jobs[i], input_dims, 0.f, 1.f); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); + + // get reference result of each job + std::vector ref_feeds(1, &jobs[i]); + std::vector ref_fetches(1, &refs[i]); + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs correctness + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast(local_outputs[0].data.data); + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +TEST(inference_api_native, 
word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu_threads) { + MainThreadsWord2Vec(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu) { + MainThreadsImageClassification(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu_threads) { + MainThreadsImageClassification(false /*use_gpu*/); +} + +#ifdef PADDLE_WITH_CUDA +TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +TEST(inference_api_native, word2vec_gpu_threads) { + MainThreadsWord2Vec(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu) { + MainThreadsImageClassification(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu_threads) { + MainThreadsImageClassification(true /*use_gpu*/); +} + +#endif + } // namespace paddle diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -27,6 +27,7 @@ enum class DataLayout { kNHWC = 0, kNCHW = 1, kAnyLayout = 2, + kMKLDNN = 3, // all layouts supported by MKLDNN internally }; inline DataLayout StringToDataLayout(const std::string& str) { @@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { return DataLayout::kNCHW; } else if (s == "ANYLAYOUT") { return DataLayout::kAnyLayout; + } else if (s == "MKLDNNLAYOUT") { + return DataLayout::kMKLDNN; } else { PADDLE_THROW("Unknown storage order string: %s", s); } @@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) { return "NCHW"; case DataLayout::kAnyLayout: return "ANY_LAYOUT"; + case DataLayout::kMKLDNN: + return "MKLDNNLAYOUT"; default: - PADDLE_THROW("unknown DataLayou %d", data_layout); + PADDLE_THROW("unknown DataLayout %d", data_layout); } } diff --git 
a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 60ec60a427ba9046ce690eb75c27cd322fdd726d..5b8dfc57ba020cea259041f55a66472ea26b4eec 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -16,6 +16,9 @@ #include #include "paddle/fluid/operators/math/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { @@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->set_layout(expected_kernel_type.data_layout_); } +#ifdef PADDLE_WITH_MKLDNN +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; + +void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { + switch (type) { + case mkldnn::memory::data_type::f32: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::u8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s16: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s32: + return platform::to_void_cast(tensor.data()); + default: + PADDLE_THROW("wrong mkldnn type provided"); + } +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out) { + auto in_layout = kernel_type_for_var.data_layout_; + auto out_layout = expected_kernel_type.data_layout_; + + PADDLE_ENFORCE( + in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, + "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " + "non-MKLDNN"); + +#ifdef PADDLE_WITH_MKLDNN + PADDLE_ENFORCE(in.format() != memory::format::format_undef && + in.format() != memory::format::any, + "Input tensor should have specified memory format"); + + // Set default 
as NCHW in case not specified + out_layout = + out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); + std::vector out_tz = in_tz; + + memory::data_type in_type = ToMKLDNNDataType(in.type()); + PADDLE_ENFORCE(in_type != memory::data_type::data_undef, + "Input tensor type is not supported: ", in.type().name()); + memory::data_type out_type = in_type; + + memory::format in_format = + in_tz.size() == 2 ? memory::format::nc : in.format(); + memory::format out_format = + out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout); + + void* in_data = GetDataFromTensor(in, in_type); + + // output tensor has the same dims as input. Reorder don't change dims + out->Resize(in.dims()); + + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); + + auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + + platform::Reorder(in_memory, out_memory); + + out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 06b638663dd334837a3bcb7737e507fcbc871c7a..2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" @@ -22,6 +23,50 @@ namespace paddle { namespace framework { +#ifdef 
PADDLE_WITH_MKLDNN +using MKLDNNFormat = mkldnn::memory::format; +using MKLDNNDataType = mkldnn::memory::data_type; + +inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { + switch (layout) { + case DataLayout::kNHWC: + return MKLDNNFormat::nhwc; + case DataLayout::kNCHW: + return MKLDNNFormat::nchw; + default: + PADDLE_THROW("Fail to convert layout %s to MKLDNN format", + DataLayoutToString(layout)); + } +} + +inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { + switch (format) { + case MKLDNNFormat::nhwc: + return DataLayout::kNHWC; + case MKLDNNFormat::nchw: + return DataLayout::kNCHW; + default: + PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); + } +} + +inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { + static const std::map dict{ + {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT + {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT + {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, + {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, + {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; + auto iter = dict.find(type); + if (iter != dict.end()) return iter->second; + return MKLDNNDataType::data_undef; +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out); + std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 9c277a27da5af34fc9fb18ca073e369c05ecdf22..b8fcc92697ca1bf1d971f8fef020f31d405605a9 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type, Tensor in; in.ShareDataWith(input_tensor); Tensor out; + DataLayout lin = 
kernel_type_for_var.data_layout_; + DataLayout lout = expected_kernel_type.data_layout_; // do layout transform - if (NeedTransformLayout(expected_kernel_type.data_layout_, - kernel_type_for_var.data_layout_)) { - TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + if (NeedTransformLayout(lout, lin)) { + if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { + PADDLE_ENFORCE( + !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), + "No layout transform needed between two MKLDNN OPKernels"); + + if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { +#ifdef PADDLE_WITH_MKLDNN + // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel + // Just set layout/format. No real transform occur + out.ShareDataWith(input_tensor); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(ToMKLDNNFormat(lin)); +#endif + } else { + // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel + // Do transform via MKLDNN lib + TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in, + &out); + } + } else { + // Case3 - transfrom between Non-MKLDNN OPKernels + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + } transformed = true; PassTensorData(&out, &in); } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c106761f72e689ff53867ecad8e36b6038173d0e..c43826b64cc5140c539df17fdd13d9bee7fefdcd 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -13,7 +13,7 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro if(WITH_GPU) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda) + dynload_cuda variable_visitor) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base 
variable_visitor scope ddim dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) @@ -25,6 +25,7 @@ else() endif() cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) +cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..32415c192f0be51bf0850fe533c212c635779a30 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseVarsOpHandle::RunImpl() { + WaitInputVarGenerated(place_); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); + PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); + + auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); + + auto out_var_handle = out_var_handles[0]; + auto out_var = scope->Var(out_var_handle->name_); + + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); + + int64_t s = 0; + for (size_t i = 1; i < out_var_handles.size(); ++i) { + auto out_name = out_var_handles[i]->name_; + auto out_t = scope->Var(out_name)->GetMutable(); + auto numel = this->inputs_numel_.at(out_name); + out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); + s += numel; + } + this->RunAndRecordEvent([this] {}); +} + +std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FuseVarsOpHandle : public OpHandleBase { + public: + FuseVarsOpHandle(Scope *local_scope, const platform::Place &place, + const std::unordered_map &inputs_numel, + const std::type_index &var_type) + : local_scope_(local_scope), + place_(place), + inputs_numel_(inputs_numel), + type_(var_type) { + total_numel_ = 0; + for (auto in_numel : inputs_numel) { + PADDLE_ENFORCE_GT(in_numel.second, 0); + total_numel_ += in_numel.second; + } + } + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + private: + Scope *local_scope_; + const platform::Place place_; + const std::unordered_map inputs_numel_; + const std::type_index type_; + int64_t total_numel_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 868de0f9a60a3dcad9a2c295f38707cdadcd1d21..41a56caa229a81caa3dbfdb413691cd3477cc374 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -230,7 +230,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (op->Type() == "concat") { auto got = remote_vars_devices_.find(op->InputArgumentNames()[0]); PADDLE_ENFORCE(got != remote_vars_devices_.end(), - "can not find right place to concat received var."); + "can not find right place to concatenate received var."); CreateDistTrainOp(&result, *op, got->second); } else { CreateDistTrainOp(&result, *op, 0); @@ -503,10 +503,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, } void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op, - int place_id) const { - auto &p = places_[place_id]; - auto *s = local_scopes_[place_id]; - result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type())); + int device_id) const { + result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[device_id], + op.Type(), places_[device_id])); if (op.Type() == "send_barrier") { ConnectOp(result, result->ops_.back().get(), "send_vars"); @@ -524,7 +523,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op, // TODO(Yancey1989): schedule rpc op on different place may // increate throughput - CreateOpHandleIOs(result, op, place_id); + CreateOpHandleIOs(result, op, device_id); } bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index fd4245461bce7115a581fed8f8aa226a7a9911fa..b4f93dbda68012ea8bb846842abe13395f42d345 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -58,7 +58,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - size_t place_id) const; + size_t device_id) const; private: std::string loss_var_name_; diff --git 
a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 95aa599cd3e403e9cc66b2b5ad35d0d214d1ab5b..5bba089ade801a06e0364835efe5249105dcfcac 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -11,10 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#include #include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" namespace paddle { namespace framework { @@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( } void NCCLAllReduceOpHandle::RunImpl() { - if (inputs_.size() == 1) { + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { // Wait input done WaitInputVarGenerated(); - - auto &var_name = static_cast(this->inputs_[0])->name_; - int dtype = -1; - size_t numel = 0; + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *s = local_scopes_[i]; auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - - auto &lod_tensor = local_scope.FindVar(var_name)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); lod_tensors.emplace_back(&lod_tensor); + 
PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); } if (platform::is_gpu_place(lod_tensors[0]->place())) { + int dtype = -1; + size_t numel = 0; std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = places_[i]; @@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() { auto &scope = *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; - auto *var = scope.FindVar(var_name); + auto *var = scope.FindVar(in_var_handles[i]->name_); auto *dev_ctx = dev_ctxes_[p]; RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 6b064650b4f09737836bda4a43fa421720077929..3849cca59a3347137b769f97261cfbf97da8d6ff 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { } } +size_t OpHandleBase::NoDummyInputSize() const { + size_t cnt = 0; + for (auto *in : inputs_) { + if (dynamic_cast(in) == nullptr) { + ++cnt; + } + } + return cnt; +} + bool OpHandleBase::NeedWait(VarHandleBase *in_var) { return in_var && in_var->generated_op_; } diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 8f94206a87dbae8a81727ca48718886bbabbe25c..dc92b0fe9f760d95d4869fdd56c0400b6710437f 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -80,6 +80,8 @@ class OpHandleBase { const std::vector &Outputs() const { return outputs_; } + size_t NoDummyInputSize() const; + protected: void RunAndRecordEvent(const std::function &callback); diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 
7f4da4c01de1010467d839ee5490c5e0d02d8c24..586465f99fd94117c821be2952bffda385fbcf75 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -19,12 +19,12 @@ namespace framework { namespace details { RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, - const Scope *local_scope, const platform::Place &place, - const std::string &name) + const Scope *local_scope, const std::string &name, + const platform::Place &place) : op_(framework::OpRegistry::CreateOp(op_desc)), local_scope_(local_scope), - place_(place), - name_(name) {} + name_(name), + place_(place) {} void RPCOpHandle::RunImpl() { // TODO(wuyi): need further analysis whether wait VarDummyHandle. diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index d28b7721720d808a8d81701c3811eae16121fb41..ae38c7fe19e102a330455d89a1068414a7835fab 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -29,7 +29,7 @@ namespace details { struct RPCOpHandle : public OpHandleBase { RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const platform::Place& place, const std::string& name); + const std::string& name, const platform::Place& place); std::string Name() const override; @@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase { private: std::unique_ptr op_; const Scope* local_scope_; - const platform::Place& place_; const std::string name_; + platform::Place place_; }; } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 0da9a2128fcecb3201ed7952f1b2185fac158f98..3c2c5273689267eb94b9727822cd12b8bc4fd124 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -30,7 +30,9 @@ class SSAGraphBuilder { SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} virtual 
std::unique_ptr Build(const ProgramDesc &program) const = 0; - virtual int GetRemoteVarDeviceId(const std::string &var_name) const = 0; + virtual int GetRemoteVarDeviceId(const std::string &var_name) const { + return -1; + } DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index 5287be3b6a05ec7067ca433ba976b0314d05fe02..b4c90013789759d17646d95efdc81fc6a0a4f3e7 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -20,7 +20,7 @@ namespace paddle { namespace framework { namespace details { -class SSAGraph; +struct SSAGraph; class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index fab20d75f5a45257f243333c1998d7b2549a25f9..f51a184e7bae2283f335fe9462a77b9c5fb831a5 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) { } inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { - return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; + bool ret = + (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); +#ifdef PADDLE_WITH_MKLDNN + // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa + ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN); + ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN); +#endif + return ret; } inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 748317438b44bc4af84f13b25f8e4f88386388fb..43ab227a9478707445892c14723801992d0041aa 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -83,8 
+83,14 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), - DataLayout::kAnyLayout, StringToLibraryType(library_type)); + StringToDataLayout(data_layout), + StringToLibraryType(library_type)); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); constexpr auto size = std::tuple_size>::value; @@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const {} }; -// User can register many kernel in one place. The data type could be different. +// User can register many kernel in one place. The data type could be +// different. template class OpKernelRegistrar : public Registrar { public: @@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register OperatorKernel. */ -#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) 
\ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + __reg_op_kernel_##op_type##_##library_type##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ - #LIBRARY_TYPE); \ - int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ return 0; \ } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f87d5521492418d2daf5b7fba1500c4bb31e10f5..c633a2f847683debce08c40b0c2ed6e58c0a7ad1 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { } } +bool ExecutionContext::HasInput(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; + } + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto arg = ins[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + +bool ExecutionContext::HasOutput(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; + } + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto arg = outs[0]; + auto* var = arg == kEmptyVarName ? 
nullptr : scope_.FindVar(arg); + return var != nullptr; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); @@ -444,10 +476,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); - // TODO(dzhwinter) : reuse ShareLoD in most operators. - // Need to call ShareLayout explicitly in sequence related ops. - // Shall we have a better method to shared info between in/out Tensor? - out_tensor->set_layout(in_tensor.layout()); +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); } void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, @@ -646,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); - PADDLE_ENFORCE(tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same.", Type()); + PADDLE_ENFORCE( + tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same. 
Get %d != %d", Type(), + data_type, tmp); data_type = tmp; } } @@ -665,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( OpKernelType OperatorWithKernel::GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place()); + return OpKernelType(expected_kernel_type.data_type_, tensor.place(), + tensor.layout()); } } // namespace framework diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2f480e00c100d579e100de17d3feb957f5ef6167..b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,9 +191,9 @@ class ExecutionContext { return op_.Attr(name); } - bool HasInput(const std::string& name) const { return op_.HasInputs(name); } + bool HasInput(const std::string& name) const; - bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); } + bool HasOutput(const std::string& name) const; size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 85dad0a46a7a9fd2aa4dcd83c7316beed35db9d7..9844daf875bcb81cd00ad6d732c09fbda7c6e491 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -111,11 +111,7 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); #endif - builder_.reset(builder_factory.Create().get()); - if (builder_.get() == nullptr) { - VLOG(3) << "builder is null."; - } - + builder_ = std::move(builder_factory.Create()); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, builder_->Build(main_program))); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h 
index 29566aaa53370b1fffc9ff9a90ae9b740b24f69e..ef224d68f1fc561f45e9d7a81425e62655457648 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -34,6 +34,28 @@ namespace framework { class LoDTensor; class Tensor { +#ifdef PADDLE_WITH_MKLDNN + + public: + inline mkldnn::memory::format format() const { return format_; } + + inline void set_format(const mkldnn::memory::format format) { + format_ = format; + } + + protected: + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; +#endif + public: template friend struct EigenTensor; @@ -195,8 +217,10 @@ class Tensor { * N,C,H,W for respectively the batch size, the number of * feature maps, the height. */ - - DataLayout layout_ = DataLayout::kNHWC; + // Fix me: here just change the default layout to kNCHW + // it doesn't fix the real issue, i.e. feeder should set up tensor layout + // according to actual input data + DataLayout layout_ = DataLayout::kNCHW; /** * @brief A PlaceHolder may be shared by more than one tensor. 
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) { TEST(Tensor, Layout) { framework::Tensor src; - ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW); src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0dd0e5c9a2b08e406bf500f40e2fc8926012ac0e..748f5a084e8c880df215a60fe51c835ba5cd3110 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,9 +1,4 @@ # Add TRT tests -# This test is not stable -# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 -#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc -# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine -# SERIAL) nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc DEPS tensorrt_engine mul_op) @@ -16,3 +11,5 @@ nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 7facf30d781a26c2c6eb0a8966ef1b87e5dfdf0b..e1cace9cc1b06f036f52e82b7b86c99a02d50f50 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -22,7 +22,8 @@ namespace tensorrt { class ReluOpConverter : public OpConverter { public: ReluOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); @@ -33,7 +34,12 @@ class ReluOpConverter : public OpConverter { nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); - engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0)); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 86ca2ca08eb14265e1bfe7abd5eb6af5c83b8a5c..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -1,106 +1,47 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" - -USE_OP(relu); +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" namespace paddle { namespace inference { namespace tensorrt { -void Compare(const std::string op_type, float input, float expect) { +TEST(ReluOpConverter, main) { framework::Scope scope; - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - - // init fluid op and variable - auto x_var = scope.Var("X"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({1, 1}); - x_tensor->mutable_data(place); - std::vector init; - init.push_back(input); - framework::TensorFromVector(init, ctx, x_tensor); - - auto out_var = scope.Var("Out"); - auto out_tensor = out_var->GetMutable(); - out_tensor->Resize({1, 1}); - out_tensor->mutable_data(place); - - framework::OpDesc op_desc; - op_desc.SetType(op_type); - op_desc.SetInput("X", {"X"}); - 
op_desc.SetOutput("Out", {"Out"}); - - auto op = framework::OpRegistry::CreateOp(*op_desc.Proto()); - - // run fluid op - op->Run(scope, place); - // get fluid output - std::vector out1; - framework::TensorToVector(*out_tensor, ctx, &out1); - - // init tensorrt op - cudaStream_t stream; - ASSERT_EQ(0, cudaStreamCreate(&stream)); - TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream); - engine->InitNetwork(); - engine->DeclareInput("X", nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 1, 1}); - // convert op - OpConverter op_converter; - op_converter.ConvertOp(*op_desc.Proto(), engine); - - engine->DeclareOutput("Out"); - engine->FreezeNetwork(); - - // convert LoDTensor to ITensor - size_t size = x_tensor->memory_size(); - EngineIOConverter::ConvertInput(op_type, *x_tensor, - engine->buffer("X").buffer, size, &stream); - // run tensorrt Outp - engine->Execute(1); - // convert ITensor to LoDTensor - EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer, - out_tensor, size, &stream); - // get tensorrt output - std::vector out2; - framework::TensorToVector(*out_tensor, ctx, &out2); - - // compare - ASSERT_EQ(out1[0], out2[0]); - ASSERT_EQ(out1[0], expect); - - delete engine; - cudaStreamDestroy(stream); -} - -TEST(OpConverter, ConvertRelu) { - Compare("relu", 1, 1); // relu(1) = 1 - Compare("relu", -5, 0); // relu(-5) = 0 + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); + validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("relu"); + desc.SetInput("X", {"relu-X"}); + desc.SetOutput("Out", {"relu-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(10); } } // namespace tensorrt } // namespace inference } // namespace paddle -USE_OP(activation); +USE_OP(relu); diff --git 
a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index f75b7c70d60e77eb07927261d3c60bd526986f98..5e86b16ba1ff69c798372a144fb3bf699768f2e6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -166,8 +166,6 @@ function(op_library TARGET) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "reduce") - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") else() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index dd71c66a75a039429f6e4b1771bb31508bb6b56d..8478ae20a59250f45daf9e8e4e18fddfe61b945e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; + + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( framework::ToDataType(ctx.Input(name)->type()), ctx.GetPlace(), layout, library); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 6ec8c9d18b466142acdb46b0f46826a2aca7a47e..d7e0af28c1bfa6a9073b25b0a301234cc5d194f5 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ 
b/paddle/fluid/operators/batch_norm_op.cc @@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel { "Variance input should be of float type"); framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library_); } @@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout, library_); + layout_, library_); } }; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 697d91484257984b104a13b0572cf19b16f8d37e..850297a2327f33a4a765f64f201e217fce5db89b 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) 
const { framework::LibraryType library{framework::LibraryType::kPlain}; + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; @@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif @@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library); } @@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout 
layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 8843a1c44b7004ba5d7935f75d3c99d9c30fc6c0..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), @@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b9cd7fb7019b738098a8649f23277afd40e938..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -124,16 +124,17 @@ namespace { framework::OpKernelType GetExpectedLRNKernel( const framework::ExecutionContext& ctx) { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = 
framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 60e936298defe7c6ce8a33bdc7de05b52eb950e7..a045f9e98dd7348973c3c4506f44d3e261599a14 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -24,10 +24,13 @@ using mkldnn::pooling_backward; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial -static std::string gethash(memory::dims& input_dims, std::string& pooling_type, - std::vector& ksize, std::vector& strides, - std::vector& paddings, std::string suffix) { - auto dims2str = [](memory::dims& operand_dims) { +static std::string gethash(const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string& suffix) { + auto dims2str = [](const memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { dstr += std::to_string(operand_dims[i]) + "-"; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index f4fb2b132fe8d59cb50f5a1f7359240ac50445fe..18aa2bd352c5d184b5748e57b4af17c1ae0d7a82 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { 
framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); @@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return 
framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..95d3768e1fdf6947659c7b3a1c9d57fad741472a --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_max); +REGISTER_OP_CPU_KERNEL( + reduce_max, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d86b3127e42f7ee14ba57b1c762e8128a0f2d54 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_max, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc258c2496340b47d24dc89f16f7419dbb4b0d95 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_REDUCE_OP(reduce_mean); +REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1359679c4767d2032bf3e3a90849ad2a2ef3e829 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec59f3e71c1c702655a3feed10935b2f5a29d8a8 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_max_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..330a86d2e4237a10d8cf6fd40025540edf08d897 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_min); +REGISTER_OP_CPU_KERNEL( + reduce_min, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..da466f805eff4709dc23471baef03e94052ee6c1 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_min, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc deleted file mode 100644 index e293fd5e410b2a34b3c71ea674607ba9d7654535..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_op.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; - -class ReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReduceOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReduceOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim( - "Out", framework::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 
kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - auto out_dims = framework::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } -}; - -class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. 
" - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. - -)DOC", - GetOpType(), GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \ - paddle::framework::DefaultGradOpDescMaker); \ - REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp) - -REGISTER_REDUCE_OP(sum); -REGISTER_REDUCE_OP(mean); -REGISTER_REDUCE_OP(max); -REGISTER_REDUCE_OP(min); -REGISTER_REDUCE_OP(prod); - -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL(reduce_type, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu deleted file mode 100644 index 
ae29587f55847315b1d84f1344677e753fe01a9b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_op.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/fluid/operators/reduce_op.h" - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type, ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index 7df47f316c30b9eb2644677681b91023e1838548..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -14,105 +14,20 @@ limitations under the License. 
*/ #pragma once +#include +#include #include -#include "glog/logging.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/reduce_op_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; -template -using EigenTensor = framework::EigenTensor; -template -using EigenScalar = framework::EigenScalar; -template -using EigenVector = framework::EigenVector; - -struct SumFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); - } -}; - -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -struct MaxFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->maximum(dim); - } -}; - -struct MinFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->minimum(dim); - } -}; - -struct MaxOrMinGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - // If there are multiple minimum or maximum elements, the subgradient of - // each is the set [0, 1], and we pass gradient to all of them here. 
- dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); - } -}; - -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - -struct ProdGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); - } -}; - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - ReduceCompute(context); \ +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + dims, keep_dim); \ } template @@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + if (reduce_all) { // Flatten and reduce 1-D tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input); auto out = EigenScalar::From(*output); auto& place = @@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel { Functor functor; functor(place, &x, &out, reduce_dim); } else { - int ndim = context.Input("X")->dims().size(); - int rdim = context.Attr>("dim").size(); + int ndim = input->dims().size(); + int rdim = dims.size(); // comments for accelerating compiling temporarily. 
// HANDLE_DIM(6, 5); // HANDLE_DIM(6, 4); @@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel { HANDLE_DIM(1, 1); } } - - private: - template - void ReduceCompute(const framework::ExecutionContext& context) const { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - auto x = EigenTensor::From(*input); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto reduce_dim = Eigen::array(); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduce_dim[i] = dims[i]; - } - // construct the squeezed output tensor - bool keep_dim = context.Attr("keep_dim"); - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = vectorize(out_dims); - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = framework::make_ddim(dims_vector); - } - auto& place = - *context.template device_context().eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } - } }; template @@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + if (reduce_all) { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* 
input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); auto x_reduce_grad = EigenVector::From(*input2); @@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel { functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { - int rank = context.Input("X")->dims().size(); + int rank = input0->dims().size(); switch (rank) { case 1: - ReduceGradCompute<1>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 2: - ReduceGradCompute<2>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 3: - ReduceGradCompute<3>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 4: - ReduceGradCompute<4>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 5: - ReduceGradCompute<5>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 6: - ReduceGradCompute<6>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; } } } +}; - private: - template - void ReduceGradCompute(const framework::ExecutionContext& context) const { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - output->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(*input0); - auto 
x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto x_dims = input0->dims(); - auto reduced_dims_v = vectorize(x_dims); - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. 
+ ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - int broad_cats_times = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduced_dims_v[dims[i]] = 1; - broadcast_dim[dims[i]] = x_dims[dims[i]]; - broad_cats_times *= x_dims[dims[i]]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } - auto reduced_dims = framework::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(*input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(*input2, reduced_dims); + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. 
" + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +%s Operator. - auto& place = - *context.template device_context().eigen_device(); +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. - Functor functor; - functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); +)DOC", + GetOpType(), GetName())); } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; }; } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - __macro(reduce_sum, SumFunctor, SumGradFunctor); \ - __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ - __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_prod, ProdFunctor, ProdGradFunctor); +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_OP(op_name) \ + class __##op_name##Maker__ : public ops::ReduceOpMaker { \ + protected: \ + virtual std::string GetName() const { return #op_name; } \ + virtual std::string GetOpType() const { return "Reduce " #op_name; } \ + }; \ + REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ + paddle::framework::DefaultGradOpDescMaker); \ + REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h new file mode 100644 index 
0000000000000000000000000000000000000000..3da27bc8ac8d448471b9ff3779ac6aca59fac523 --- /dev/null +++ b/paddle/fluid/operators/reduce_op_function.h @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + auto x_rank = static_cast(x.dimensions().size()); + auto reduce_dim = Eigen::array(); + std::vector dims_ref = dims; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; + reduce_dim[i] = dims_ref[i]; + } + // construct the squeezed output tensor + DDim out_dims = output->dims(); + if (keep_dim && x_rank > 1) { + const int kDelFlag = -2; + auto dims_vector = framework::vectorize(out_dims); + for (size_t i = 0; i < dims_ref.size(); ++i) { + dims_vector[dims_ref[i]] = kDelFlag; + } + dims_vector.erase(remove(dims_vector.begin(), 
dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = framework::make_ddim(dims_vector); + } + auto& place = *context.eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, &x, &out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, out_dims); + functor(place, &x, &out, reduce_dim); + } +} + +template +void ReduceGradFunctor(const DeviceContext& context, + const framework::Tensor& input0, + const framework::Tensor& input1, + const framework::Tensor& input2, + framework::Tensor* output, + const std::vector& dims) { + auto x = EigenTensor::From(input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + auto x_dims = input0.dims(); + auto reduced_dims_v = framework::vectorize(x_dims); + std::vector dims_ref = dims; + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + + int broad_cats_times = 1; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) { + dims_ref[i] = x_rank + dims_ref[i]; + } + reduced_dims_v[dims_ref[i]] = 1; + broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; + broad_cats_times *= x_dims[dims_ref[i]]; + } + auto reduced_dims = framework::make_ddim(reduced_dims_v); + auto x_reduce = EigenTensor::From(input1, reduced_dims); + auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); + + auto& place = *context.eigen_device(); + + Functor functor; + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, + broad_cats_times); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..713728b99757a6f3bb128f665d5576ac64eef8ec --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_REDUCE_OP(reduce_prod); +REGISTER_OP_CPU_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_prod_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d62e677d92cffecf629d1684026b0c7bcfec29e3 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97748113e092719aceed9d806ca6242077111532 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5b5398787b44e658b0f8390162df0e6c3006651 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_REDUCE_OP(reduce_sum); +REGISTER_OP_CPU_KERNEL( + reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e67d7e1da5f0244d2dee346873692a80cbad2fc4 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a20f7d231fa9ea313581ac0629a87fa5f4a88ce5 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reverse_op.h" +#include + +namespace paddle { +namespace operators { + +class ReverseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + const auto& axis = ctx->Attrs().Get>("axis"); + PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty."); + for (int a : axis) { + PADDLE_ENFORCE_LT(a, x_dims.size(), + "The axis must be less than input tensor's rank."); + } + ctx->SetOutputDim("Out", x_dims); + } +}; + +class ReverseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The LoDTensor to be flipped."); + AddOutput("Out", "The LoDTensor after flipping."); + AddAttr>( + "axis", "The axises that along which order of elements is reversed."); + AddComment(R"DOC( + Reverse Operator. + + Reverse the order of elements in the input LoDTensor along given axises. + + Case 1: + Given + X = [[1, 2, 3, 4, 5] + [6, 7, 8, 9, 10] + [11, 12, 13, 14, 15]], + and + axis = [0], + we get: + Out = [[11, 12, 13, 14, 15] + [6, 7, 8, 9, 10] + [1, 2, 3, 4, 5]]. 
+ + Case 2: + Given + X = [[[1, 2, 3, 4] + [5, 6, 7, 8]] + [[9, 10, 11, 12] + [13, 14, 15, 16]]], + and + axis = [0, 2], + we get: + Out = [[[12, 11, 10, 9] + [16, 15, 14, 13]] + [[4, 3, 2, 1] + [8, 7, 6, 5]]], + )DOC"); + } +}; + +class ReverseGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("reverse"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", GetAttr("axis")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker, + ops::ReverseGradMaker); +REGISTER_OPERATOR(reverse_grad, ops::ReverseOp); +REGISTER_OP_CPU_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..635c41529b38f2dd287b00ed2e5659e11f619e78 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reverse_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9063cd59bba5c6307b55a500455908a5fd278390 --- /dev/null +++ b/paddle/fluid/operators/reverse_op.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +struct ReverseFunctor { + void operator()(const DeviceContext& context, const framework::LoDTensor& in, + framework::LoDTensor* out, const std::vector& axis) { + Eigen::array reverse_axis; + for (int i = 0; i < Rank; ++i) { + reverse_axis[i] = false; + } + for (int a : axis) { + reverse_axis[a < 0 ? a + Rank : a] = true; // negative axis counts from the end + } + + auto in_eigen = framework::EigenTensor::From(in); + auto out_eigen = framework::EigenTensor::From(*out); + auto* dev = context.eigen_device(); + + out_eigen.device(*dev) = in_eigen.reverse(reverse_axis); + } +}; + +template +class ReverseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + const auto& axis = context.Attr>("axis"); + int rank = x->dims().size(); + auto& dev_ctx = context.template device_context(); + + switch (rank) { + case 1: + ReverseFunctor functor1; + functor1(dev_ctx, *x, out, axis); + break; + case 2: + ReverseFunctor functor2; + functor2(dev_ctx, *x, out, axis); + break; + case 3: + ReverseFunctor functor3; + functor3(dev_ctx, *x, out, axis); + break; + case 4: + ReverseFunctor functor4; + functor4(dev_ctx, *x, out, axis); + break; + case 5: + ReverseFunctor functor5; + functor5(dev_ctx, *x, out, axis); + break; + case 6: + ReverseFunctor functor6; + functor6(dev_ctx, *x, out, axis); + break; + default: + PADDLE_THROW( + "Reverse operator doesn't support tensors whose ranks are greater " + "than 6."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cc256aa627bdda0609f496cab93a2dec7d95f348..c90a3be964a3a309a182d3620abec619c366dd84 100644 ---
a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { "float16 can only be used on GPU place"); } - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::StringToDataLayout(data_format), + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } }; @@ -142,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f1187620d81ff3bc1deef2106edb54d6199fa927..de711b7d23ef01d57a62087c552ea090f01f0386 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { @@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::f32; } +inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { + auto reorder_prim = mkldnn::reorder(src, dst); + std::vector pipeline; + pipeline.push_back(reorder_prim); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { + return static_cast( + memory.get_primitive_desc().desc().data.format); +} + } // namespace platform } // namespace paddle diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8d99e660e7e34e40b905563a0cf87176b0cf619d..55959197e7cd82253fb0c604604b4302ca0a3dc7 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -447,7 +447,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip python-opencv dmidecode python-tk && easy_install -U pip && \ + apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 93aa5f908ec929a33089a62caa2186ba9e57fffe..33d8f709412b25d29c6618272500dd7b953d6645 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list): return var.desc.name() elif isinstance(var, str): return var + elif isinstance(var, basestring): + return str(var) else: raise TypeError(str(var) + " should be Variable or str") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 
33b5caa0eab0ec192eb4a3b63cf82a672c58d2cb..9dc9038f4465e22c2e1bac60e18c36214f6414d5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT64 elif dtype == np.bool: return core.VarDesc.VarType.BOOL + elif dtype == np.uint16: + return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 else: @@ -368,6 +370,13 @@ class Operator(object): Block. Users can use the build in instructions to describe their neural network. """ + OP_WITHOUT_KERNEL_SET = { + 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'ncclInit', 'channel_create', 'channel_close', 'channel_send', + 'channel_recv', 'select' + } def __init__(self, block, @@ -504,17 +513,13 @@ class Operator(object): else: self.desc.set_attr(attr_name, self.attrs[attr_name]) self.desc.check_attrs() - no_kernel_op_set = { - 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', - 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'gen_nccl_id' - } - if type not in no_kernel_op_set: + if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + def has_kernel(self, op_type): + return op_type not in self.OP_WITHOUT_KERNEL_SET + def to_string(self, throw_on_error): """ To debug string. @@ -742,7 +747,9 @@ class Block(object): def var(self, name): if not isinstance(name, basestring): - raise TypeError() + raise TypeError( + "var require string as parameter, but get %s instead." 
% + (type(name))) v = self.vars.get(name, None) if v is None: raise ValueError("var %s not in this block" % name) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8758ac9f94ab91b5be5fc70917c64db38997d1c1..a56f3ea9db6b9fabf9d78f102d394a0817a44a98 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -434,7 +434,7 @@ def open_files(filenames, shapes, lod_levels, dtypes, - thread_num, + thread_num=1, buffer_size=None, pass_num=1, for_parallel=True): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index be34cc81a5d5ca0e781e5984b6c3eeaa4e25eb90..75d3bf879703a1db1108eae45d879164e0024156 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -363,6 +363,40 @@ def zeros(shape, dtype, force_cpu=False): return fill_constant(value=0.0, **locals()) +def reverse(x, axis): + """ + **reverse** + + This function reverses the input 'x' along the given axes. + + Args: + x(Variable): the input to be reversed. + axis(int|tuple|list): Axis along which the order of elements + is reversed. If it is a tuple or a list, reversing + will be applied on each axis in the tuple or list. + + Returns: + Variable: The reversed tensor. + + Examples: + .. code-block:: python + + out = fluid.layers.reverse(x=in, axis=0) + # or: + out = fluid.layers.reverse(x=in, axis=[0,1]) + """ + if isinstance(axis, int): + axis = [axis] + helper = LayerHelper("reverse", **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='reverse', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def save(x, file_path, overwrite=True): """ Saves a variable as a file.
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..e891ee932f1440001eb25b222f1f4613e97dfcb1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import time +import itertools + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class BenchmarkSuite(OpTest): + def timeit_function(self, callback, iters, *args, **kwargs): + assert iters != 0, "Iters should >= 1" + start = time.time() + for i in range(iters): + callback(*args, **kwargs) + elapse = time.time() - start + return elapse / iters + + def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol): + for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs, + fetch_list): + # the cpu version is baseline, expect gpu version keep same with cpu version. 
+ expect = item_cpu_out + expect_t = np.array(item_cpu_out) + actual = item_gpu_out + actual_t = np.array(item_gpu_out) + var_name = variable if isinstance(variable, + basestring) else variable.name + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + var_name + ") has diff" + str(actual_t) + "\n" + + str(expect_t)) + self.assertListEqual(actual.lod(), + expect.lod(), + "Output (" + var_name + ") has different lod") + + def _get_input_names(self): + inputs = [] + for name, value in self.inputs.iteritems(): + if isinstance(value, list): + inputs.extend([sub_name for sub_name, _ in value]) + inputs.append(name) + return inputs + + def _get_output_names(self): + outputs = [] + for var_name, var in self.outputs.iteritems(): + if isinstance(var, list): + for sub_var_name, sub_var in var: + outputs.append(sub_var_name) + else: + outputs.append(var_name) + if len(outputs) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + outputs.append(str(out_name)) + return outputs + + def check_output_stability(self, atol=1e-8): + places = self._get_places() + if len(places) < 2: + return + cpu_outs, fetch_list = self._calc_output(places[0]) + gpu_outs, _ = self._calc_output(places[1]) + self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol) + + def timeit_output_with_place(self, place, iters): + return self.timeit_function(self.calc_output, iters, place) + + def timeit_output(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_output_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) + + def timeit_grad_with_place(self, place, iters=100): + inputs_to_check = self._get_input_names() + output_names = self._get_output_names() + return self.timeit_function( + self._get_gradient, + iters, + inputs_to_check, + place, + output_names, + no_grad_set=None) + + 
def timeit_grad(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_grad_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_grad_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..91a5f1bca4441d80489a02eb9283928e38321826 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +from benchmark import BenchmarkSuite +from op_test import OpTest + +# This is a demo op test case for operator benchmarking and high resolution number stability alignment. + + +class TestSumOp(BenchmarkSuite): + def setUp(self): + self.op_type = "sum" + self.customize_testcase() + self.customize_fetch_list() + + def customize_fetch_list(self): + """ + customize fetch list, configure the wanted variables. 
+ >>> self.fetch_list = ["Out"] + """ + self.fetch_list = ["Out"] + # pass + + def customize_testcase(self): + # a test case + x0 = np.random.random((300, 400)).astype('float32') + x1 = np.random.random((300, 400)).astype('float32') + x2 = np.random.random((300, 400)).astype('float32') + + # NOTE: if the output is empty, then it will autofilled by benchmarkSuite. + # only the output dtype is used, the shape, lod and data is computed from input. + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + self.outputs = {"Out": x0 + x1 + x2} + + def test_check_output(self): + """ + compare the output with customized output. In this case, + you should set the correct output by hands. + >>> self.outputs = {"Out": x0 + x1 + x2} + """ + self.check_output(atol=1e-8) + + def test_output_stability(self): + # compare the cpu gpu output in high resolution. + self.check_output_stability() + + def test_timeit_output(self): + """ + perf the op, time cost will be averged in iters. + output example + >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818 + >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596 + """ + self.timeit_output(iters=100) + + def test_timeit_grad(self): + """ + perf the op gradient, time cost will be averged in iters. 
+ output example + >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536 + >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653 + """ + self.timeit_grad(iters=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b611470fa1ff326df960c349b71006f52d586d8e..307caae4b0cf4869c1abb755215aa97795d47e15 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,13 +15,17 @@ import unittest import numpy as np import random +import time import itertools -import paddle.fluid.core as core import collections + +import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from paddle.fluid.framework import Program, OpProtoHolder +from paddle.fluid.framework import Program, OpProtoHolder, Variable +from testsuite import create_op, set_input, append_input_output, append_loss_ops def randomize_probability(batch_size, class_num, dtype='float32'): @@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'): return prob -def create_op(scope, op_type, inputs, outputs, attrs): - kwargs = dict() - - op_maker = core.op_proto_and_checker_maker - op_role_attr_name = op_maker.kOpRoleAttrName() - - if op_role_attr_name not in attrs: - attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) - - def __create_var__(name, var_name): - scope.var(var_name).get_tensor() - kwargs[name].append(var_name) - - for in_name, in_dup in Operator.get_op_inputs(op_type): - if in_name in inputs: - kwargs[in_name] = [] - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, _ = item[0], item[1] - __create_var__(in_name, sub_in_name) - else: - __create_var__(in_name, in_name) - - for out_name, out_dup in Operator.get_op_outputs(op_type): - 
if out_name in outputs: - kwargs[out_name] = [] - if out_dup: - sub_out = outputs[out_name] - for item in sub_out: - sub_out_name, _ = item[0], item[1] - __create_var__(out_name, sub_out_name) - else: - __create_var__(out_name, out_name) - - for attr_name in Operator.get_op_attr_names(op_type): - if attr_name in attrs: - kwargs[attr_name] = attrs[attr_name] - - return Operator(op_type, **kwargs) - - -def set_input(scope, op, inputs, place): - def __set_input__(var_name, var): - if isinstance(var, tuple) or isinstance(var, np.ndarray): - tensor = scope.find_var(var_name).get_tensor() - if isinstance(var, tuple): - tensor.set_lod(var[1]) - var = var[0] - tensor.set_dims(var.shape) - tensor.set(var, place) - elif isinstance(var, float): - scope.find_var(var_name).set_float(var) - elif isinstance(var, int): - scope.find_var(var_name).set_int(var) - - for in_name, in_dup in Operator.get_op_inputs(op.type()): - if in_name in inputs: - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, sub_in_val = item[0], item[1] - __set_input__(sub_in_name, sub_in_val) - else: - __set_input__(in_name, inputs[in_name]) - - def get_numeric_gradient(place, scope, op, @@ -173,54 +110,15 @@ def get_numeric_gradient(place, return gradient_flat.reshape(tensor_to_check.get_dims()) -def append_input_output(block, op_proto, np_list, is_input): - '''Insert VarDesc and generate Python variable instance''' - proto_list = op_proto.inputs if is_input else op_proto.outputs - - def create_var(block, name, np_list, var_proto): - if name not in np_list: - assert var_proto.intermediate, "{} not found".format(name) - shape = None - lod_level = None - else: - np_value = np_list[name] - if isinstance(np_value, tuple): - shape = list(np_value[0].shape) - lod_level = len(np_value[1]) - else: - shape = list(np_value.shape) - lod_level = 0 - return block.create_var( - dtype="float32", shape=shape, lod_level=lod_level, name=name) - - var_dict = {} - for var_proto in proto_list: - var_name 
= str(var_proto.name) - if is_input: - if (var_name not in np_list) and var_proto.dispensable: - continue - assert (var_name in np_list) or (var_proto.dispensable), \ - "Missing {} as input".format(var_name) - if var_proto.duplicable: - assert isinstance(np_list[var_name], list), \ - "Duplicable {} should be set as list".format(var_name) - var_list = [] - for (name, np_value) in np_list[var_name]: - var_list.append( - create_var(block, name, {name: np_value}, var_proto)) - var_dict[var_name] = var_list - else: - var_dict[var_name] = create_var(block, var_name, np_list, var_proto) - - return var_dict - - class OpTest(unittest.TestCase): @classmethod def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() + cls.call_once = False + cls.dtype = "float32" + cls.outputs = {} np.random.seed(123) random.seed(124) @@ -231,6 +129,31 @@ class OpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + def try_call_once(self, data_type): + if not self.call_once: + self.call_once = True + self.dtype = data_type + + def infer_dtype_from_inputs_outputs(self, inputs, outputs): + def infer_dtype(numpy_dict): + assert isinstance( + numpy_dict, + dict), "self.inputs, self.outputs must be numpy_dict" + for var_name, var_value in numpy_dict.iteritems(): + if isinstance(var_value, (np.ndarray, np.generic)): + self.try_call_once(var_value.dtype) + elif isinstance(var_value, (list, tuple)): + # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + if len(var_value) > 1 and isinstance(var_value[1], ( + np.ndarray, np.generic)): + instance = var_value[1] + self.try_call_once(instance[1].dtype) + else: + self.try_call_once("float32") + + infer_dtype(inputs) + infer_dtype(outputs) + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -254,18 +177,14 @@ class OpTest(unittest.TestCase): return 
feed_map - def calc_output(self, place): - outs, _ = self._calc_output(place) - return outs - - def _calc_output(self, place): + def _append_ops(self, block): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) - - program = Program() - block = program.global_block() - - inputs = append_input_output(block, op_proto, self.inputs, True) - outputs = append_input_output(block, op_proto, self.outputs, False) + "infer datatype from inputs and outputs for this test case" + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + inputs = append_input_output(block, op_proto, self.inputs, True, + self.dtype) + outputs = append_input_output(block, op_proto, self.outputs, False, + self.dtype) op = block.append_op( type=self.op_type, inputs=inputs, @@ -275,22 +194,68 @@ class OpTest(unittest.TestCase): op.desc.infer_var_type(block.desc) op.desc.infer_shape(block.desc) - fetch_list = [] - for var_name, var in outputs.iteritems(): - if var_name in self.outputs: + def _get_io_vars(self, block, numpy_inputs): + inputs = {} + for name, value in numpy_inputs.iteritems(): + if isinstance(value, list): + var_list = [ + block.var(sub_name) for sub_name, sub_value in value + ] + inputs[name] = var_list + else: + inputs[name] = block.var(name) + return inputs + + def _get_inputs(self, block): + return self._get_io_vars(block, self.inputs) + + def _get_outputs(self, block): + return self._get_io_vars(block, self.outputs) + + def calc_output(self, place): + outs, _ = self._calc_output(place) + return outs + + def _calc_output(self, place, parallel=False): + + program = Program() + block = program.global_block() + self._append_ops(block) + + inputs = self._get_inputs(block) + outputs = self._get_outputs(block) + feed_map = self.feed_var(inputs, place) + + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = 
Executor(place) + + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + if len(fetch_list) == 0: + for var_name, var in outputs.iteritems(): if isinstance(var, list): for v in var: fetch_list.append(v) else: fetch_list.append(var) - - feed_map = self.feed_var(inputs, place) - - exe = Executor(place) - outs = exe.run(program, - feed=feed_map, - fetch_list=fetch_list, - return_numpy=False) + # if the fetch_list still empty, fill the fetch_list by the operator output. + if len(fetch_list) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + fetch_list.append(str(out_name)) + # fetch_list = map(block.var, fetch_list) + if not isinstance(fetch_list[0], Variable): + fetch_list = map(block.var, fetch_list) + outs = executor.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) return outs, fetch_list def check_output_with_place(self, place, atol): @@ -346,17 +311,19 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) - def check_output(self, atol=1e-5): - places = [core.CPUPlace()] + def _get_places(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) + return places + + def check_output(self, atol=1e-5): + places = self._get_places() for place in places: self.check_output_with_place(place, atol) def check_output_customized(self, checker): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: outs = self.calc_output(place) outs = [np.array(out) for out in outs] @@ -389,9 +356,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - places = [core.CPUPlace()] - if 
core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, @@ -438,35 +403,6 @@ class OpTest(unittest.TestCase): max_relative_error, "Gradient Check On %s" % str(place)) - @staticmethod - def _create_var_descs_(block, var_dict): - # FIXME: Try unify with `append_input_output` - for param_name in var_dict: - var = var_dict[param_name] - if not isinstance(var, list) and not isinstance(var, tuple): - var = [(param_name, var, None)] - if not isinstance(var[0], list) and not isinstance(var[0], tuple): - var = [(param_name, var[0], var[1])] - - for i, item in enumerate(var): - if not isinstance(item[0], basestring): - item = [[param_name] + list(item)] - if len(item) == 2: - if isinstance(item[1], tuple): - var[i] = [item[0], item[1][0], item[1][1]] - else: - # only set var name and value, set lod to None - var[i] = list(item) + [None] - var_descs = [(block.create_var( - name=name, shape=each.shape, dtype=each.dtype), each, lod) - for name, each, lod in var] - - yield param_name, var_descs - - @staticmethod - def _merge_list(iterable): - return reduce(lambda a, b: list(a) + list(b), iterable, []) - @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): tensor = core.LoDTensor() @@ -497,83 +433,31 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input - def _get_gradient(self, input_to_check, place, output_names, no_grad_set): + def _get_gradient(self, + input_to_check, + place, + output_names, + no_grad_set, + parallel=False): prog = Program() block = prog.global_block() - inputs_with_np = { - key: value - for (key, value) in OpTest._create_var_descs_( - block, getattr(self, 'inputs', {})) - } - outputs_with_np = { - key: val - for (key, val) in OpTest._create_var_descs_( - block, getattr(self, 'outputs', {})) - } - inputs = { - k: [item[0] for item in 
inputs_with_np[k]] - for k in inputs_with_np - } - outputs = { - k: [item[0] for item in outputs_with_np[k]] - for k in outputs_with_np - } - - op = block.append_op( - type=self.op_type, - inputs=inputs, - outputs=outputs, - attrs=getattr(self, 'attrs', {})) - - # infer variable type and infer shape in compile-time - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - - mean_inputs = map(block.var, output_names) - - if len(mean_inputs) == 1: - loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) - op = block.append_op( - inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - else: - avg_sum = [] - for cur_loss in mean_inputs: - cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) - op = block.append_op( - inputs={"X": [cur_loss]}, - outputs={"Out": [cur_avg_loss]}, - type="mean") - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - avg_sum.append(cur_avg_loss) - - loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) - op_sum = block.append_op( - inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') - op_sum.desc.infer_var_type(block.desc) - op_sum.desc.infer_shape(block.desc) - - loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) - op_loss = block.append_op( - inputs={"X": loss_sum}, - outputs={"Out": loss}, - type='scale', - attrs={'scale': 1.0 / float(len(avg_sum))}) - op_loss.desc.infer_var_type(block.desc) - op_loss.desc.infer_shape(block.desc) - + self._append_ops(block) + loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) - feed_dict = { - item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place) - for p_name in inputs_with_np for item in inputs_with_np[p_name] - } + inputs = self._get_inputs(block) + feed_dict = self.feed_var(inputs, place) fetch_list = [g for p, g in param_grad_list] - 
executor = Executor(place) + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) return map(np.array, executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 1f52bd90d0d49bda6c180019e90ebd923c91439c..96d47906a0606bba4b1d2207f7da85b058e42a2b 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp): self.axis = 1 +class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(3, 10, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index f8ff5a3361af66612f08b2aa4eaffa363f04c594..e726f99d49877a1bc464090092ec80b97ab15d0c 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -194,107 +194,104 @@ class TestLstmOp(OpTest): ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) -class TestLstmOpHasInitial(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 
- - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = True - self.is_reverse = True - self.use_peepholes = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], - max_relative_error=5e-4) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Bias')) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Weight')) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Weight', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Input')) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('H0')) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = 
np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('C0')) - - -class TestLstmOpRerverse(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = True - - -class TestLstmOpNotUsePeepholes(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = False - +# class TestLstmOpHasInitial(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = True +# self.is_reverse = True +# self.use_peepholes = True + +# def test_check_grad(self): +# # TODO(qingqing) remove folowing lines after the check_grad is refined. 
+# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], +# max_relative_error=5e-4) + +# def test_check_grad_ingore_bias(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Bias')) + +# def test_check_grad_ingore_weight(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Weight')) + +# def test_check_grad_ingore_input(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Weight', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Input')) + +# def test_check_grad_ingore_h0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('H0')) + +# def test_check_grad_ingore_c0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], +# max_relative_error=5e-4, +# 
no_grad_set=set('C0')) + +# class TestLstmOpRerverse(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = True + +# class TestLstmOpNotUsePeepholes(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = False if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f845575a02869f08299d76b5600074598ca27f6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestReverseOp(OpTest): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [0] + + def setUp(self): + self.initTestCase() + self.op_type = "reverse" + self.inputs = {"X": self.x} + self.attrs = {'axis': self.axis} + out = self.x + for a in self.axis: + out = np.flip(out, axis=a) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestCase0(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [1] + + +class TestCase1(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4)).astype('float32') + self.axis = [0, 1] + + +class TestCase2(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4, 5)).astype('float32') + self.axis = [0, 2] + + +class TestCase3(TestReverseOp): + def initTestCase(self): + self.x = np.random.random((3, 4, 5)).astype('float32') + self.axis = [1, 2] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc94a80c9d3999d34fdf0edbf82ffe297bd95d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -0,0 +1,182 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def as_lodtensor(np_array, lod, place): + tensor = core.LoDTensor() + tensor.set(np_value, place) + if lod is not None: + tensor.set_lod(lod) + return tensor + + +def create_op(scope, op_type, inputs, outputs, attrs): + kwargs = dict() + + op_maker = core.op_proto_and_checker_maker + op_role_attr_name = op_maker.kOpRoleAttrName() + + if op_role_attr_name not in attrs: + attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) + + def __create_var__(name, var_name): + scope.var(var_name).get_tensor() + kwargs[name].append(var_name) + + for in_name, in_dup in Operator.get_op_inputs(op_type): + if in_name in inputs: + kwargs[in_name] = [] + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, _ = item[0], item[1] + __create_var__(in_name, sub_in_name) + else: + __create_var__(in_name, in_name) + + for out_name, out_dup in Operator.get_op_outputs(op_type): + if out_name in outputs: + kwargs[out_name] = [] + if out_dup: + sub_out = outputs[out_name] + for item in sub_out: + sub_out_name, _ = item[0], item[1] + __create_var__(out_name, sub_out_name) + else: + __create_var__(out_name, out_name) + + for attr_name in Operator.get_op_attr_names(op_type): + if attr_name in attrs: + kwargs[attr_name] = attrs[attr_name] + + return Operator(op_type, **kwargs) + + +def set_input(scope, op, inputs, place): + def __set_input__(var_name, var): + if isinstance(var, tuple) or isinstance(var, np.ndarray): + tensor = scope.find_var(var_name).get_tensor() + if isinstance(var, tuple): + tensor.set_lod(var[1]) + var = var[0] + tensor.set_dims(var.shape) + tensor.set(var, place) + elif isinstance(var, float): + scope.find_var(var_name).set_float(var) + elif isinstance(var, int): + scope.find_var(var_name).set_int(var) + + for in_name, in_dup in 
Operator.get_op_inputs(op.type()): + if in_name in inputs: + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] + __set_input__(sub_in_name, sub_in_val) + else: + __set_input__(in_name, inputs[in_name]) + + +def append_input_output(block, op_proto, np_list, is_input, dtype): + '''Insert VarDesc and generate Python variable instance''' + proto_list = op_proto.inputs if is_input else op_proto.outputs + + def create_var(block, name, np_list, var_proto): + dtype = None + shape = None + lod_level = None + if name not in np_list: + assert var_proto.intermediate, "{} not found".format(name) + else: + np_value = np_list[name] + if isinstance(np_value, tuple): + dtype = np_value[0].dtype + # output shape, lod should be infered from input. + if is_input: + shape = list(np_value[0].shape) + lod_level = len(np_value[1]) + else: + dtype = np_value.dtype + if is_input: + shape = list(np_value.shape) + lod_level = 0 + return block.create_var( + dtype=dtype, shape=shape, lod_level=lod_level, name=name) + + var_dict = {} + for var_proto in proto_list: + var_name = str(var_proto.name) + if is_input: + if (var_name not in np_list) and var_proto.dispensable: + continue + assert (var_name in np_list) or (var_proto.dispensable), \ + "Missing {} as input".format(var_name) + if var_proto.duplicable: + assert isinstance(np_list[var_name], list), \ + "Duplicable {} should be set as list".format(var_name) + var_list = [] + for (name, np_value) in np_list[var_name]: + var_list.append( + create_var(block, name, {name: np_value}, var_proto)) + var_dict[var_name] = var_list + else: + var_dict[var_name] = create_var(block, var_name, np_list, var_proto) + + return var_dict + + +def append_loss_ops(block, output_names): + mean_inputs = map(block.var, output_names) + # for item in mean_inputs: + # print(item) + # print("Item", item.dtype) + + if len(mean_inputs) == 1: + loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) + op = 
block.append_op( + inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + else: + avg_sum = [] + for cur_loss in mean_inputs: + cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) + op = block.append_op( + inputs={"X": [cur_loss]}, + outputs={"Out": [cur_avg_loss]}, + type="mean") + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + avg_sum.append(cur_avg_loss) + + loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) + op_sum = block.append_op( + inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') + op_sum.desc.infer_var_type(block.desc) + op_sum.desc.infer_shape(block.desc) + + loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) + op_loss = block.append_op( + inputs={"X": loss_sum}, + outputs={"Out": loss}, + type='scale', + attrs={'scale': 1.0 / float(len(avg_sum))}) + op_loss.desc.infer_var_type(block.desc) + op_loss.desc.infer_shape(block.desc) + return loss