提交 173d72b4 编写于 作者: C chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into enable_cpu_on_pe

...@@ -19,4 +19,4 @@ ADD *.whl / ...@@ -19,4 +19,4 @@ ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py dataset.py models/ /workspace/ ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
...@@ -24,10 +24,12 @@ Currently supported `--model` argument include: ...@@ -24,10 +24,12 @@ Currently supported `--model` argument include:
* Run the following command to start a benchmark job locally: * Run the following command to start a benchmark job locally:
```bash ```bash
python fluid_benchmark.py --model mnist --device GPU python fluid_benchmark.py --model mnist --device GPU
``` ```
You can choose to use GPU/CPU training. With GPU training, you can specify You can choose to use GPU/CPU training. With GPU training, you can specify
`--gpus <gpu_num>` to run multi GPU training. `--gpus <gpu_num>` to run multi GPU training.
You can also run the parameter servers in async mode: specify
`--async_mode` to train the model asynchronously.
* Run distributed training with parameter servers: * Run distributed training with parameter servers:
* see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
* start parameter servers: * start parameter servers:
...@@ -44,6 +46,16 @@ Currently supported `--model` argument include: ...@@ -44,6 +46,16 @@ Currently supported `--model` argument include:
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
``` ```
## Prepare the RecordIO file to Achieve Better Performance
Running the following command generates RecordIO files such as "mnist.recordio" under the path
you choose, using the batch_size you choose. You can use batch_size=1 so that a later reader can
change the batch size at any time using `fluid.batch`.
```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```
## Run Distributed Benchmark on Kubernetes Cluster ## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
__all__ = ['parse_args', ]
# Names accepted by the `--model` option.
BENCHMARK_MODELS = [
    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
]


def parse_args(argv=None):
    """Parse the command-line options of the Fluid benchmark scripts.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse falls back to ``sys.argv[1:]``, which is the
            behavior existing callers rely on.

    Returns:
        The ``argparse.Namespace`` holding every option defined below.
    """
    parser = argparse.ArgumentParser('Fluid model benchmarks.')
    parser.add_argument(
        '--model',
        type=str,
        choices=BENCHMARK_MODELS,
        default='resnet',
        help='The model to run benchmark with.')
    parser.add_argument(
        '--batch_size', type=int, default=32, help='The minibatch size.')
    # args related to learning rate
    parser.add_argument(
        '--learning_rate', type=float, default=0.001, help='The learning rate.')
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations', type=int, default=80, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=100, help='The number of passes.')
    parser.add_argument(
        '--data_format',
        type=str,
        default='NCHW',
        choices=['NCHW', 'NHWC'],
        help='The data data_format, now only support NCHW.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--gpus',
        type=int,
        default=1,
        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
    # this option is available only for vgg and resnet.
    parser.add_argument(
        '--cpus',
        type=int,
        default=1,
        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
    # 'imagenet' must be selectable here: the resnet model in this benchmark
    # has a dedicated branch for it, which is unreachable otherwise.
    parser.add_argument(
        '--data_set',
        type=str,
        default='flowers',
        choices=['cifar10', 'flowers', 'imagenet'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    parser.add_argument(
        '--no_test',
        action='store_true',
        help='If set, do not test the testset during training.')
    parser.add_argument(
        '--memory_optimize',
        action='store_true',
        help='If set, optimize runtime memory before start.')
    parser.add_argument(
        '--use_fake_data',
        action='store_true',
        help='If set ommit the actual read data operators.')
    parser.add_argument(
        '--profile', action='store_true', help='If set, profile a few steps.')
    parser.add_argument(
        '--update_method',
        type=str,
        default='local',
        choices=['local', 'pserver', 'nccl2'],
        help='Choose parameter update method, can be local, pserver, nccl2.')
    # NOTE: this is a negative flag; passing it sets no_split_var to True.
    parser.add_argument(
        '--no_split_var',
        action='store_true',
        default=False,
        help='Whether split variables into blocks when update_method is pserver')
    parser.add_argument(
        '--async_mode',
        action='store_true',
        default=False,
        help='Whether start pserver in async mode to support ASGD')
    parser.add_argument(
        '--use_reader_op',
        action='store_true',
        help='Whether to use reader op, and must specify the data path if set this to true.'
    )
    parser.add_argument(
        '--data_path',
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
    args = parser.parse_args(argv)
    return args
...@@ -24,92 +24,7 @@ import paddle.fluid.core as core ...@@ -24,92 +24,7 @@ import paddle.fluid.core as core
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
BENCHMARK_MODELS = [ from args import *
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
]
def parse_args():
parser = argparse.ArgumentParser('Fluid model benchmarks.')
parser.add_argument(
'--model',
type=str,
choices=BENCHMARK_MODELS,
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data data_format, now only support NCHW.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
parser.add_argument(
'--cpus',
type=int,
default=1,
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
action='store_true',
help='If set, do not test the testset during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set ommit the actual read data operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
args = parser.parse_args()
return args
def append_nccl2_prepare(trainer_id): def append_nccl2_prepare(trainer_id):
...@@ -144,7 +59,7 @@ def append_nccl2_prepare(trainer_id): ...@@ -144,7 +59,7 @@ def append_nccl2_prepare(trainer_id):
"nccl-based dist train.") "nccl-based dist train.")
def dist_transpile(trainer_id): def dist_transpile(trainer_id, args):
if trainer_id < 0: if trainer_id < 0:
return None, None return None, None
...@@ -166,7 +81,12 @@ def dist_transpile(trainer_id): ...@@ -166,7 +81,12 @@ def dist_transpile(trainer_id):
training_role = os.getenv("PADDLE_TRAINING_ROLE") training_role = os.getenv("PADDLE_TRAINING_ROLE")
t = distribute_transpiler.DistributeTranspiler() t = distribute_transpiler.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode,
slice_var_up=not args.no_split_var)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint) pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(current_endpoint, pserver_startup_program = t.get_startup_program(current_endpoint,
...@@ -210,26 +130,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, ...@@ -210,26 +130,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_prog) exe.run(startup_prog)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues() if not args.use_reader_op:
if var.is_data feed_var_list = [
] var for var in train_prog.global_block().vars.itervalues()
feeder = fluid.DataFeeder(feed_var_list, place) if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
iters, num_samples, start_time = 0, 0, time.time() iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
train_losses = [] train_losses = []
for batch_id, data in enumerate(train_reader()): if not args.use_reader_op:
reader_generator = train_reader()
batch_id = 0
data = None
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if iters == args.iterations:
break
if iters == args.skip_batch_num: if iters == args.skip_batch_num:
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
if iters == args.iterations:
break if args.use_reader_op:
loss = exe.run(train_prog, try:
feed=feeder.feed(data), loss = exe.run(train_prog, fetch_list=[avg_loss])
fetch_list=[avg_loss]) except fluid.core.EnforceNotMet as ex:
break
else:
loss = exe.run(train_prog,
feed=feeder.feed(data),
fetch_list=[avg_loss])
iters += 1 iters += 1
num_samples += len(data) batch_id += 1
# FIXME(wuyi): For use_reader_op, if the current
# pass is not the last, the last batch of this pass
# is also equal to args.batch_size.
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
train_losses.append(loss) train_losses.append(loss)
print("Pass: %d, Iter: %d, Loss: %f\n" % print("Pass: %d, Iter: %d, Loss: %f\n" %
(pass_id, iters, np.mean(train_losses))) (pass_id, iters, np.mean(train_losses)))
...@@ -250,10 +194,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, ...@@ -250,10 +194,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var, batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id): num_trainers, trainer_id):
feed_var_list = [ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
var for var in train_prog.global_block().vars.itervalues() if not args.use_reader_op:
if var.is_data feed_var_list = [
] var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
# generate fake: # generate fake:
if args.use_fake_data: if args.use_fake_data:
for var in feed_var_list: for var in feed_var_list:
...@@ -270,7 +218,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -270,7 +218,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
"value": 1.0, "value": 1.0,
"dtype": var.dtype}) "dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0: if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening #FIXME(wuyi): wait other trainer to start listening
time.sleep(30) time.sleep(30)
...@@ -287,12 +234,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -287,12 +234,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_trainers=num_trainers, num_trainers=num_trainers,
trainer_id=trainer_id) trainer_id=trainer_id)
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
num_samples = 0 num_samples = 0
iters = 0 iters = 0
start_time = time.time() start_time = time.time()
for batch_id, data in enumerate(train_reader()): if not args.use_reader_op:
reader_generator = train_reader()
batch_id = 0
data = None
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if iters == args.iterations:
break
if args.profile and pass_id == 0 and batch_id == 5: if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All") profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10: elif args.profile and pass_id == 0 and batch_id == 10:
...@@ -301,19 +257,25 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -301,19 +257,25 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if iters == args.skip_batch_num: if iters == args.skip_batch_num:
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
if iters == args.iterations: if args.use_fake_data or args.use_reader_op:
break try:
if args.use_fake_data: loss, = exe.run([avg_loss.name])
loss, = exe.run([avg_loss.name]) except fluid.core.EnforceNotMet as ex:
break
else: else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver": if args.update_method == "pserver":
exe.bcast_params() exe.bcast_params()
num_samples += len(data) if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
iters += 1 iters += 1
if batch_id % 1 == 0: if batch_id % 1 == 0:
print("Pass %d, batch %d, loss %s" % print("Pass %d, batch %d, loss %s" %
(pass_id, batch_id, np.array(loss))) (pass_id, batch_id, np.array(loss)))
batch_id += 1
print_train_time(start_time, time.time(), num_samples) print_train_time(start_time, time.time(), num_samples)
if not args.no_test and batch_acc: if not args.no_test and batch_acc:
test_acc = test(startup_exe, infer_prog, test_reader, feeder, test_acc = test(startup_exe, infer_prog, test_reader, feeder,
...@@ -359,7 +321,7 @@ def main(): ...@@ -359,7 +321,7 @@ def main():
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
if args.update_method == "pserver": if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id) train_prog, startup_prog = dist_transpile(trainer_id, args)
if not train_prog: if not train_prog:
raise Exception( raise Exception(
"Must configure correct environments to run dist train.") "Must configure correct environments to run dist train.")
......
...@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor): ...@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
def get_model(args): def get_model(args):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512 embedding_dim = 512
encoder_size = 512 encoder_size = 512
decoder_size = 512 decoder_size = 512
...@@ -221,7 +223,7 @@ def get_model(args): ...@@ -221,7 +223,7 @@ def get_model(args):
train_batch_generator = paddle.batch( train_batch_generator = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=args.batch_size) batch_size=args.batch_size * args.gpus)
test_batch_generator = paddle.batch( test_batch_generator = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
......
...@@ -20,6 +20,7 @@ import numpy as np ...@@ -20,6 +20,7 @@ import numpy as np
import argparse import argparse
import time import time
import cProfile import cProfile
import os
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -65,9 +66,24 @@ def cnn_model(data): ...@@ -65,9 +66,24 @@ def cnn_model(data):
def get_model(args): def get_model(args):
# Input data if args.use_reader_op:
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) filelist = [
label = fluid.layers.data(name='label', shape=[1], dtype='int64') os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1: if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus) places = fluid.layers.get_places(args.cpus)
...@@ -103,7 +119,7 @@ def get_model(args): ...@@ -103,7 +119,7 @@ def get_model(args):
# Reader # Reader
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size) paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size) paddle.dataset.mnist.test(), batch_size=args.batch_size)
return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
...@@ -19,6 +19,7 @@ from __future__ import print_function ...@@ -19,6 +19,7 @@ from __future__ import print_function
import functools import functools
import numpy as np import numpy as np
import time import time
import os
import cProfile, pstats, StringIO import cProfile, pstats, StringIO
...@@ -26,6 +27,7 @@ import paddle ...@@ -26,6 +27,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
from recordio_converter import imagenet_train, imagenet_test
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
...@@ -122,16 +124,48 @@ def get_model(args): ...@@ -122,16 +124,48 @@ def get_model(args):
else: else:
dshape = [32, 32, 3] dshape = [32, 32, 3]
model = resnet_cifar10 model = resnet_cifar10
else: train_reader = paddle.dataset.cifar.train10()
test_reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102 class_dim = 102
if args.data_format == 'NCHW': if args.data_format == 'NCHW':
dshape = [3, 224, 224] dshape = [3, 224, 224]
else: else:
dshape = [224, 224, 3] dshape = [224, 224, 3]
model = resnet_imagenet model = resnet_imagenet
train_reader = paddle.dataset.flowers.train()
input = fluid.layers.data(name='data', shape=dshape, dtype='float32') test_reader = paddle.dataset.flowers.test()
label = fluid.layers.data(name='label', shape=[1], dtype='int64') elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
train_reader = imagenet_train(args.data_path)
test_reader = imagenet_test(args.data_path)
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + dshape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
input, label = fluid.layers.read_file(data_file)
else:
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1: if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus) places = fluid.layers.get_places(args.cpus)
...@@ -162,15 +196,10 @@ def get_model(args): ...@@ -162,15 +196,10 @@ def get_model(args):
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
train_reader = paddle.batch( batched_train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.cifar.train10() train_reader, buf_size=5120),
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), batch_size=args.batch_size * args.gpus)
buf_size=5120), batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
batch_size=args.batch_size)
test_reader = paddle.batch( return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
...@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size): ...@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
def get_model(args): def get_model(args):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512 lstm_size = 512
emb_dim = 512 emb_dim = 512
crop_size = 1500 crop_size = 1500
...@@ -114,7 +117,7 @@ def get_model(args): ...@@ -114,7 +117,7 @@ def get_model(args):
train_reader = batch( train_reader = batch(
paddle.reader.shuffle( paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
batch_size=args.batch_size) batch_size=args.batch_size * args.gpus)
test_reader = batch( test_reader = batch(
paddle.reader.shuffle( paddle.reader.shuffle(
crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
......
...@@ -22,6 +22,7 @@ import paddle.fluid as fluid ...@@ -22,6 +22,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import argparse import argparse
import functools import functools
import os
def vgg16_bn_drop(input): def vgg16_bn_drop(input):
...@@ -65,9 +66,25 @@ def get_model(args): ...@@ -65,9 +66,25 @@ def get_model(args):
else: else:
data_shape = [224, 224, 3] data_shape = [224, 224, 3]
# Input data if args.use_reader_op:
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') filelist = [
label = fluid.layers.data(name='label', shape=[1], dtype='int64') os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program # Train program
net = vgg16_bn_drop(images) net = vgg16_bn_drop(images)
...@@ -95,7 +112,7 @@ def get_model(args): ...@@ -95,7 +112,7 @@ def get_model(args):
paddle.dataset.cifar.train10() paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120), buf_size=5120),
batch_size=args.batch_size) batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.dataset.cifar.test10() paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image
def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
                       shape_label):
    """Batch the samples of `py_reader` and write them to one RecordIO file.

    Args:
        py_reader: Callable that is invoked with no arguments; its result is
            handed to `paddle.batch`.
        outfilepath: Path passed to the RecordIO file converter.
        batch_size: Batch size passed to `paddle.batch`.
        shape_data: Shape used to declare the 'image' feed variable.
        shape_label: Shape used to declare the int64 'label' feed variable.

    Returns:
        Whatever `convert_reader_to_recordio_file` returns
        (presumably the number of batches written -- confirm against Paddle).
    """
    # The feed variables are declared inside fresh programs so that they do
    # not end up in the global default program.
    scratch_main = fluid.Program()
    scratch_startup = fluid.Program()
    with fluid.program_guard(scratch_main, scratch_startup):
        batched_reader = paddle.batch(py_reader(), batch_size=batch_size)
        # Feed order is image first, then label.
        image_var = fluid.layers.data(name='image', shape=shape_data)
        label_var = fluid.layers.data(
            name='label', shape=shape_label, dtype='int64')
        data_feeder = fluid.DataFeeder(
            feed_list=[image_var, label_var], place=fluid.CPUPlace())
        return fluid.recordio_writer.convert_reader_to_recordio_file(
            outfilepath, batched_reader, data_feeder)
def prepare_mnist(outpath, batch_size):
    """Convert the MNIST training set to `<outpath>/mnist.recordio`.

    Images are declared with shape [784] and labels with shape [1].
    """
    recordio_path = os.path.join(outpath, "mnist.recordio")
    convert_2_recordio(
        py_reader=mnist.train,
        outfilepath=recordio_path,
        batch_size=batch_size,
        shape_data=[784],
        shape_label=[1])
def prepare_cifar10(outpath, batch_size):
    """Convert the CIFAR-10 training set to `<outpath>/cifar.recordio`.

    Images are declared with shape [3, 32, 32] and labels with shape [1].
    """
    recordio_path = os.path.join(outpath, "cifar.recordio")
    convert_2_recordio(
        py_reader=cifar.train10,
        outfilepath=recordio_path,
        batch_size=batch_size,
        shape_data=[3, 32, 32],
        shape_label=[1])
def prepare_flowers(outpath, batch_size):
    """Convert the flowers training set to `<outpath>/flowers.recordio`.

    Images are declared with shape [3, 224, 224] and labels with shape [1].
    """
    recordio_path = os.path.join(outpath, "flowers.recordio")
    convert_2_recordio(
        py_reader=flowers.train,
        outfilepath=recordio_path,
        batch_size=batch_size,
        shape_data=[3, 224, 224],
        shape_label=[1])
def default_mapper(sample):
    """Turn one (image, label) sample into (flat float32 image, label).

    The image goes through `image.simple_transform` with the arguments
    256, 224, True and a per-channel mean (presumably resize size, crop size
    and a training flag -- confirm against paddle.dataset.image), and is then
    flattened and cast to float32. The label is passed through unchanged.
    """
    raw_image, label = sample
    transformed = image.simple_transform(
        raw_image, 256, 224, True, mean=[103.94, 116.78, 123.68])
    flattened = transformed.flatten()
    return flattened.astype('float32'), label
def imagenet_train(data_dir):
    """Build a reader creator over the ImageNet training images.

    `data_dir` must contain exactly the entries "train", "train.txt", "val",
    "val_set", "val.txt" and "unzip.sh". "train.txt" is read as one
    "<image file> <integer label>" pair per line, separated by one space.

    Args:
        data_dir: Root directory of the ImageNet data.

    Returns:
        The result of `paddle.reader.map_readers(default_mapper, ...)` applied
        to a reader that yields `[image, [label]]` for every listed file, in
        an order shuffled once when this function is called.

    Raises:
        Exception: If the directory listing differs from the expected set.
    """
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "train.txt")) as fn:
        for line in fn:
            # Strip only the line terminator. Blindly dropping the last
            # character would truncate the label of a final line that has no
            # trailing newline.
            line = line.rstrip("\r\n")
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            img, lbl = line.split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)
    # shuffle all, this is slow
    random.shuffle(imgfilelist)

    def train_reader():
        for imgfile in imgfilelist:
            # The file name is lower-cased for the on-disk lookup, but the
            # label is looked up with the name exactly as listed.
            data = image.load_image(
                os.path.join(data_dir, "train", imgfile.lower()))
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, train_reader)
def imagenet_test(data_dir):
    """Build a reader creator over the ImageNet validation images.

    `data_dir` must contain exactly the entries "train", "train.txt", "val",
    "val_set", "val.txt" and "unzip.sh". "val.txt" is read as one
    "<image file> <integer label>" pair per line, separated by one space.

    Args:
        data_dir: Root directory of the ImageNet data.

    Returns:
        The result of `paddle.reader.map_readers(default_mapper, ...)` applied
        to a reader that yields `[image, [label]]` for every listed file, in
        file order.

    Raises:
        Exception: If the directory listing differs from the expected set.
    """
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "val.txt")) as fn:
        for line in fn:
            # Strip only the line terminator. Blindly dropping the last
            # character would truncate the label of a final line that has no
            # trailing newline.
            line = line.rstrip("\r\n")
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            img, lbl = line.split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)

    def test_reader():
        for imgfile in imgfilelist:
            # The listed extension is replaced by "jpeg": everything after
            # the first "." of the listed name is dropped.
            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
            image_path = ".".join([base_path, "jpeg"])
            data = image.load_image(image_path)
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, test_reader)
# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
        filename,
        batch_per_file,
        reader_creator,
        feeder,
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
    """Write the batches of a reader into a series of RecordIO shard files.

    Shards are named "<stem>-<5-digit index><ext>" after `filename`; each one
    holds up to `batch_per_file` batches. Batches left over after the last
    full shard are written to a final, smaller shard so no data is dropped.

    Args:
        filename: Template path; its extension must be ".recordio".
        batch_per_file: Maximum number of batches per shard (must be >= 1).
        reader_creator: Callable returning an iterable of batches.
        feeder: Object whose `feed(batch)` returns a mapping from feed name
            to tensor and which exposes `feed_names`.
        compressor: Passed through to `create_recordio_writer`.
        max_num_records: Passed through to `create_recordio_writer`.
        feed_order: Names of the tensors to append per batch, in order;
            defaults to `feeder.feed_names`.

    Returns:
        The total number of batches written across all shards.

    Raises:
        ValueError: If `batch_per_file` is smaller than 1.
    """
    if feed_order is None:
        feed_order = feeder.feed_names
    f_name, f_ext = os.path.splitext(filename)
    assert f_ext == ".recordio", "filename must end with .recordio"
    if batch_per_file < 1:
        raise ValueError("batch_per_file must be at least 1")

    def write_shard(shard_idx, batches):
        # Write `batches` to one shard file and return how many were written.
        shard_name = "%s-%05d%s" % (f_name, shard_idx, f_ext)
        with fluid.recordio_writer.create_recordio_writer(
                shard_name, compressor, max_num_records) as writer:
            for batch in batches:
                res = feeder.feed(batch)
                for each in feed_order:
                    writer.append_tensor(res[each])
                writer.complete_append_tensor()
        # Single formatted argument so the output is the same with or without
        # print_function.
        print("written file: %s" % shard_name)
        return len(batches)

    pending = []
    f_idx = 0
    counter = 0
    for batch in reader_creator():
        pending.append(batch)
        if len(pending) >= batch_per_file:
            counter += write_shard(f_idx, pending)
            pending = []
            f_idx += 1
    # Flush the tail: without this, up to batch_per_file batches (or the whole
    # dataset, when it is small) would silently never reach disk.
    if pending:
        counter += write_shard(f_idx, pending)
    return counter
def prepare_imagenet(inpath, outpath, batch_size):
    """Convert the ImageNet training set under ``inpath`` to RecordIO files.

    The training reader is batched with ``batch_size`` and fed through a CPU
    ``DataFeeder`` declaring an ``image`` input of shape ``[3, 224, 224]`` and
    an ``int64`` ``label`` input of shape ``[1]``. Files are written as
    ``imagenet.recordio`` shards inside ``outpath``, 10000 batches per file.
    """
    batched_reader = paddle.batch(
        imagenet_train(inpath), batch_size=batch_size)

    image_var = fluid.layers.data(name="image", shape=[3, 224, 224])
    label_var = fluid.layers.data(name="label", shape=[1], dtype='int64')
    feeder = fluid.DataFeeder(
        feed_list=[image_var, label_var], place=fluid.CPUPlace())

    recordio_path = os.path.join(outpath, "imagenet.recordio")
    convert_reader_to_recordio_files(
        recordio_path, 10000, batched_reader, feeder)
# API注释撰写标准 # API注释撰写标准
- [API注释模块](#API注释模块) - [API注释撰写标准](#api)
- [格式及示例](#格式及示例) - [API注释模块](#api)
- [完整示例](#完整示例) - [格式及示例](#)
- [完整示例](#)
## API注释模块 ## API注释模块
...@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ...@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
## 完整示例 ## 完整示例
fc 的完整注释见[示例](src/fc.py) fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)
# API Doc Standard # API Doc Standard
- [API Doc Structure](#API Doc Structure) - [API Doc Standard](#api-doc-standard)
- [Format and Examples](#Format and Examples) - [API Doc Structure](#api-doc-structure)
- [Complete Example](#Complete Example) - [Format and Examples](#format-and-examples)
- [Complete Example](#complete-example)
## API Doc Structure ## API Doc Structure
...@@ -223,4 +224,4 @@ Format and examples of each part of API documentation are as follows: (take fc f ...@@ -223,4 +224,4 @@ Format and examples of each part of API documentation are as follows: (take fc f
## Complete Example ## Complete Example
Complete Example of fc please see [here](src/fc.py) Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)
...@@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 ...@@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- 学习 Docker 有多难? - 学习 Docker 有多难?
理解 Docker 并不难,大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
- 我可以用 IDE 吗? - 我可以用 IDE 吗?
...@@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 ...@@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- 可以并行编译吗? - 可以并行编译吗?
是的。我们的 Docker image 运行一个 `Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
- Docker 需要 sudo - Docker 需要 sudo
...@@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 ...@@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- 在 Windows/MacOS 上编译很慢 - 在 Windows/MacOS 上编译很慢
Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。 Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
- 磁盘不够 - 磁盘不够
本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
.. _compile_deps: .. _compile_deps:
...@@ -195,7 +195,7 @@ BLAS ...@@ -195,7 +195,7 @@ BLAS
PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和 PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
`OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, `OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,
还会下载MKL-DNN数学库,详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。 还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
如果关闭MKL,则会使用OpenBLAS作为BLAS库。 如果关闭MKL,则会使用OpenBLAS作为BLAS库。
......
...@@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library") ...@@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
set(inference_deps paddle_inference_api paddle_fluid_api) set(inference_deps paddle_inference_api paddle_fluid_api)
# if anakin is set enable anakin api implementation # if anakin is set enable anakin api implementation
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
set(ANAKIN_FOUND ON) set(ANAKIN_FOUND ON)
else() else()
set(ANAKIN_FOUND OFF) set(ANAKIN_FOUND OFF)
endif() endif()
# Add root_dir and every directory nested below it to the include path.
function(fetch_include_recursively root_dir)
  if (IS_DIRECTORY ${root_dir})
    include_directories(${root_dir})
  endif()

  # List the direct children by name, then descend into each one that is
  # itself a directory.
  file(GLOB child_names RELATIVE ${root_dir} ${root_dir}/*)
  foreach(child_name ${child_names})
    set(child_path ${root_dir}/${child_name})
    if (IS_DIRECTORY ${child_path})
      fetch_include_recursively(${child_path})
    endif()
  endforeach()
endfunction()
if (ANAKIN_FOUND) if (ANAKIN_FOUND)
# Anakin's code style doesn't follow google c style. # Anakin's code style doesn't follow google c style.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
-Wno-error=reorder
-Wno-error=format
-Wno-error=switch
-Wno-error=return-type
-Wno-error=non-virtual-dtor
-Wno-error=cpp")
message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
include_directories("${ANAKIN_INCLUDE}") fetch_include_recursively(${ANAKIN_INCLUDE})
# Anakin's source path is a mass, need to set sub-directories trivially.
include_directories("${ANAKIN_INCLUDE}/saber") link_directories(${ANAKIN_LIBRARY})
link_directories("${ANAKIN_LIBRARY}")
nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc) nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
target_link_libraries(inference_anakin_api anakin) target_link_libraries(inference_anakin_api anakin anakin_saber_common)
list(APPEND inference_deps inference_anakin_api) list(APPEND inference_deps inference_anakin_api)
endif() endif()
...@@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME) ...@@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
...@@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl ...@@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
if (ANAKIN_FOUND) if (ANAKIN_FOUND)
nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
DEPS ${inference_deps} protobuf) DEPS ${inference_deps})
endif() endif()
if(WITH_TESTING) if(WITH_TESTING)
......
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include <thread>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle { namespace paddle {
namespace demo { namespace demo {
...@@ -61,13 +61,67 @@ void Main(bool use_gpu) { ...@@ -61,13 +61,67 @@ void Main(bool use_gpu) {
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
} }
// TODO(Superjomn): this is should be free automatically
free(outputs[0].data.data);
}
}
// Runs the word2vec demo concurrently on `num_threads` threads.
//
// One predictor is created from a NativeConfig; every thread then clones it
// and runs 3 batches of the same dummy input (four slots, each a {4, 1}
// INT64 tensor holding 1, 2, 3, 4), logging up to the first five floats of
// the single expected output.
//
// num_threads: number of worker threads to spawn and join.
// use_gpu:     copied into NativeConfig::use_gpu.
void MainThreads(int num_threads, bool use_gpu) {
// Multi-threads only support on CPU
// NOTE(review): the tests guarded by PADDLE_WITH_CUDA also call this function
// with use_gpu = true -- confirm whether the statement above still holds.
// 0. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto main_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; ++tid) {
// `tid` is captured by value so each thread keeps its own id; everything
// else (notably main_predictor) is captured by reference.
threads.emplace_back([&, tid]() {
// 1. clone a predictor which shares the same parameters
auto predictor = main_predictor->Clone();
constexpr int num_batches = 3;
for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = buf,
.dtype = PaddleDType::INT64};
// All four input slots are filled with copies of the same tensor.
std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs;
// 3. Run
CHECK(predictor->Run(inputs, &outputs));
// 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length;
// data.length is divided by sizeof(float) to get the element count.
const size_t num_elements = outputs.front().data.length / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
}
// The output buffer is released by hand here (presumably it was
// malloc-allocated by Run -- confirm).
free(outputs[0].data.data);
}
});
}
// Wait for every worker before returning.
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
} }
} }
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
// Multi-threaded word2vec demo with use_gpu = false, using 1 and 4 threads.
TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
// Multi-threaded word2vec demo with use_gpu = true, using 1 and 4 threads.
TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
#endif #endif
} // namespace demo } // namespace demo
......
...@@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config { ...@@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config {
// Similarly, each engine kind should map to a unique predictor implementation. // Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative> template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle } // namespace paddle
...@@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( ...@@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
} }
bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
// TODO(Superjomn) Tell anakin to support return code. if (!(graph_.load(config.model_file))) {
engine_.Build(config.model_file, config.max_batch_size); return false;
}
graph_.ResetBatchSize("input_0", config.max_batch_size);
// optimization for graph
if (!(graph_.Optimize())) {
return false;
}
// construct executer
executor_.init(graph_);
return true; return true;
} }
...@@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run(
<< "'s type is not float"; << "'s type is not float";
return false; return false;
} }
engine_.SetInputFromCPU( auto d_tensor_in_p = executor_.get_in(input.name);
input.name, static_cast<float *>(input.data.data), input.data.length); float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p,
static_cast<float *>(input.data.data),
d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error";
return false;
}
} }
// TODO(Superjomn) Tell anakin to support return code. executor_.prediction();
engine_.Execute();
if (output_data->empty()) { if (output_data->empty()) {
LOG(ERROR) << "At least one output should be set with tensors' names."; LOG(ERROR) << "At least one output should be set with tensors' names.";
return false; return false;
} }
for (auto &output : *output_data) { for (auto &output : *output_data) {
auto *tensor = engine_.GetOutputInGPU(output.name); auto *tensor = executor_.get_out(output.name);
output.shape = tensor->shape(); output.shape = tensor->shape();
// Copy data from GPU -> CPU // Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data, if (cudaMemcpy(output.data.data,
tensor->data(), tensor->mutable_data(),
tensor->size(), tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) { cudaMemcpyDeviceToHost) != 0) {
LOG(ERROR) << "copy data from GPU to CPU error"; LOG(ERROR) << "copy data from GPU to CPU error";
return false; return false;
...@@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run(
return true; return true;
} }
// TODO(Superjomn) To implement latter. anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
&PaddleInferenceAnakinPredictor::get_executer() {
return executor_;
}
// the cloned new Predictor of anakin share the same net weights from original
// Predictor
std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() { std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
return nullptr; VLOG(3) << "Anakin Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
// construct executer from other graph
auto anakin_predictor_p =
dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
if (!anakin_predictor_p) {
LOG(ERROR) << "fail to call Init";
return nullptr;
}
anakin_predictor_p->get_executer().init(graph_);
return std::move(cls);
} }
// A factory to help create difference predictor. // A factory to help create difference predictor.
...@@ -74,6 +105,7 @@ template <> ...@@ -74,6 +105,7 @@ template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>( CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
const AnakinConfig &config) { const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create.";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor(config)); new PaddleInferenceAnakinPredictor(config));
return x; return x;
......
...@@ -20,32 +20,42 @@ limitations under the License. */ ...@@ -20,32 +20,42 @@ limitations under the License. */
#pragma once #pragma once
// NOTE This header file do not have namespace. // NOTE This header file do not have namespace.
// TODO(Superjomn) Tell Anakin to provide better APIs. //#include <test/framework/net/paddle_api.h>
#include <test/framework/net/paddle_api.h>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
#include "framework/core/net/net.h"
#include "saber/saber_types.h"
namespace paddle { namespace paddle {
class PaddleInferenceAnakinPredictor : public PaddlePredictor { class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public: public:
PaddleInferenceAnakinPredictor() {}
PaddleInferenceAnakinPredictor(const AnakinConfig& config); PaddleInferenceAnakinPredictor(const AnakinConfig& config);
// NOTE Unlike the native engine, the buffers of anakin engine's output_data // NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first. // should be allocated first.
// TODO(Superjomn) should unify all the behaviors of output_data accross all
// the engines.
bool Run(const std::vector<PaddleTensor>& inputs, bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) override; std::vector<PaddleTensor>* output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override; std::unique_ptr<PaddlePredictor> Clone() override;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
get_executer();
~PaddleInferenceAnakinPredictor() override{};
private: private:
bool Init(const AnakinConfig& config); bool Init(const AnakinConfig& config);
anakin::AnakinEngine<anakin::NV, anakin::graph::Graph<anakin::NV,
anakin::saber::AK_FLOAT, anakin::saber::AK_FLOAT,
anakin::Precision::FP32> anakin::Precision::FP32>
engine_; graph_;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
executor_;
AnakinConfig config_;
}; };
} // namespace paddle } // namespace paddle
...@@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h" #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle { namespace paddle {
TEST(inference, anakin) { AnakinConfig GetConfig() {
AnakinConfig config; AnakinConfig config;
config.model_file = "./mobilenet_v2.anakin.bin";
config.device = 0;
config.max_batch_size = 1;
return config;
}
auto engine = TEST(inference, anakin) {
AnakinConfig config = GetConfig();
auto predictor =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config); CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
float data[1 * 3 * 224 * 224] = {1.0f};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "input_0",
.shape = std::vector<int>({1, 3, 224, 224}),
.data = buf,
.dtype = PaddleDType::FLOAT32};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
float data_out[1000];
PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
PaddleTensor tensor_out{.name = "prob_out",
.shape = std::vector<int>({1000, 1}),
.data = buf_out,
.dtype = PaddleDType::FLOAT32};
std::vector<PaddleTensor> outputs(1, tensor_out);
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data);
for (size_t j = 0; j < 1000; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j];
}
} }
} // namespace paddle } // namespace paddle
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
...@@ -45,14 +47,19 @@ NativeConfig GetConfig() { ...@@ -45,14 +47,19 @@ NativeConfig GetConfig() {
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA
config.use_gpu = true; config.use_gpu = true;
#else
config.use_gpu = false;
#endif
config.device = 0; config.device = 0;
return config; return config;
} }
TEST(paddle_inference_api_impl, word2vec) { void MainWord2Vec(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
auto predictor = CreatePaddlePredictor<NativeConfig>(config); auto predictor = CreatePaddlePredictor<NativeConfig>(config);
config.use_gpu = use_gpu;
framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}}; framework::LoD lod{{0, 1}};
...@@ -100,11 +107,11 @@ TEST(paddle_inference_api_impl, word2vec) { ...@@ -100,11 +107,11 @@ TEST(paddle_inference_api_impl, word2vec) {
free(outputs[0].data.data); free(outputs[0].data.data);
} }
TEST(paddle_inference_api_impl, image_classification) { void MainImageClassification(bool use_gpu) {
int batch_size = 2; int batch_size = 2;
bool use_mkldnn = false;
bool repeat = false; bool repeat = false;
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model"; FLAGS_dirname + "image_classification_resnet.inference.model";
...@@ -126,12 +133,8 @@ TEST(paddle_inference_api_impl, image_classification) { ...@@ -126,12 +133,8 @@ TEST(paddle_inference_api_impl, image_classification) {
std::vector<framework::LoDTensor*> cpu_fetchs1; std::vector<framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1); cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace, false, true>(config.model_dir, TestInference<platform::CPUPlace, false, true>(
cpu_feeds, config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);
cpu_fetchs1,
repeat,
is_combined,
use_mkldnn);
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> paddle_tensor_feeds; std::vector<PaddleTensor> paddle_tensor_feeds;
...@@ -149,4 +152,143 @@ TEST(paddle_inference_api_impl, image_classification) { ...@@ -149,4 +152,143 @@ TEST(paddle_inference_api_impl, image_classification) {
free(data); free(data);
} }
// Runs word2vec inference from several threads, one job per thread, and
// compares each thread's output with a reference result.
//
// use_gpu is copied into NativeConfig::use_gpu for the predictor under test;
// the reference results are computed with TestInference<platform::CPUPlace>.
void MainThreadsWord2Vec(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);

  // Build the inputs of every job together with its reference output.
  constexpr int num_jobs = 3;
  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
  std::vector<framework::LoDTensor> refs(num_jobs);
  for (size_t job_id = 0; job_id < jobs.size(); ++job_id) {
    // Each job consists of 4 words.
    auto& words = jobs[job_id];
    words.resize(4);
    for (auto& word : words) {
      framework::LoD lod{{0, 1}};
      int64_t dict_size = 2073;  // The size of dictionary
      SetupLoDTensor(&word, lod, static_cast<int64_t>(0), dict_size - 1);
      paddle_tensor_feeds[job_id].push_back(LodTensorToPaddleTensor(&word));
    }

    // Reference result of this job.
    std::vector<paddle::framework::LoDTensor*> ref_feeds;
    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[job_id]);
    for (auto& word : words) {
      ref_feeds.push_back(&word);
    }
    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
  }

  // Spawn one thread per job; every thread works on its own cloned predictor.
  std::vector<std::thread> workers;
  for (int tid = 0; tid < num_jobs; ++tid) {
    workers.emplace_back([&, tid]() {
      auto predictor = main_predictor->Clone();
      std::vector<PaddleTensor> local_outputs;
      ASSERT_TRUE(predictor->Run(paddle_tensor_feeds[tid], &local_outputs));

      // Every output value must lie strictly inside (-1, 1).
      ASSERT_EQ(local_outputs.size(), 1UL);
      const size_t len = local_outputs[0].data.length;
      const size_t num_floats = len / sizeof(float);
      float* data = static_cast<float*>(local_outputs[0].data.data);
      for (size_t j = 0; j < num_floats; ++j) {
        ASSERT_LT(data[j], 1.0);
        ASSERT_GT(data[j], -1.0);
      }

      // The output must match the reference element by element.
      auto& ref = refs[tid];
      float* ref_data = ref.data<float>();
      EXPECT_EQ(ref.numel(), static_cast<int64_t>(num_floats));
      for (int i = 0; i < ref.numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
      }

      free(data);
    });
  }
  for (auto& worker : workers) {
    worker.join();
  }
}
// Runs ResNet image-classification inference from several threads, one job
// (a single batch of size 1) per thread, and compares each thread's output
// with a reference result.
//
// use_gpu is copied into NativeConfig::use_gpu for the predictor under test;
// the reference results are computed with TestInference<platform::CPUPlace>.
void MainThreadsImageClassification(bool use_gpu) {
  constexpr int num_jobs = 4;  // each job run 1 batch
  constexpr int batch_size = 1;
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
      FLAGS_dirname + "image_classification_resnet.inference.model";
  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);

  std::vector<framework::LoDTensor> jobs(num_jobs);
  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
  std::vector<framework::LoDTensor> refs(num_jobs);
  for (size_t i = 0; i < jobs.size(); ++i) {
    // prepare inputs
    std::vector<std::vector<int64_t>> feed_target_shapes =
        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
    // Override the leading dimension of the first feed with the batch size.
    feed_target_shapes[0][0] = batch_size;
    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));

    // get reference result of each job
    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
  }

  // create threads and each thread run 1 job
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_jobs; ++tid) {
    threads.emplace_back([&, tid]() {
      auto predictor = main_predictor->Clone();
      auto& local_inputs = paddle_tensor_feeds[tid];
      std::vector<PaddleTensor> local_outputs;
      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));

      // check outputs correctness
      ASSERT_EQ(local_outputs.size(), 1UL);
      const size_t len = local_outputs[0].data.length;
      float* data = static_cast<float*>(local_outputs[0].data.data);
      float* ref_data = refs[tid].data<float>();
      // numel() is signed; cast the unsigned element count explicitly so the
      // comparison is not a signed/unsigned mix (same as the word2vec test).
      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
      for (int i = 0; i < refs[tid].numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
      }
      free(data);
    });
  }
  for (int i = 0; i < num_jobs; ++i) {
    threads[i].join();
  }
}
// CPU test registrations: each model has a single-threaded case and a
// multi-threaded (*_threads) case.
TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
TEST(inference_api_native, word2vec_cpu_threads) {
  MainThreadsWord2Vec(false /*use_gpu*/);
}
// The single-threaded case must call MainImageClassification; calling the
// multi-threaded driver here would leave MainImageClassification untested and
// duplicate the *_threads case below.
TEST(inference_api_native, image_classification_cpu) {
  MainImageClassification(false /*use_gpu*/);
}
TEST(inference_api_native, image_classification_cpu_threads) {
  MainThreadsImageClassification(false /*use_gpu*/);
}
#ifdef PADDLE_WITH_CUDA
// GPU test registrations, compiled only when PADDLE_WITH_CUDA is defined:
// each model has a single-threaded case and a multi-threaded (*_threads) case.
TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
TEST(inference_api_native, word2vec_gpu_threads) {
  MainThreadsWord2Vec(true /*use_gpu*/);
}
// The single-threaded case must call MainImageClassification; calling the
// multi-threaded driver here would leave MainImageClassification untested and
// duplicate the *_threads case below.
TEST(inference_api_native, image_classification_gpu) {
  MainImageClassification(true /*use_gpu*/);
}
TEST(inference_api_native, image_classification_gpu_threads) {
  MainThreadsImageClassification(true /*use_gpu*/);
}
#endif
} // namespace paddle } // namespace paddle
...@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope ...@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto glog lod_rank_table feed_fetch_method) framework_proto glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
...@@ -28,6 +28,9 @@ struct DataTypeMap { ...@@ -28,6 +28,9 @@ struct DataTypeMap {
}; };
static DataTypeMap* InitDataTypeMap(); static DataTypeMap* InitDataTypeMap();
// C++11 removes the need for manual locking. Concurrent execution shall wait if
// a static local variable is already being initialized.
// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
static DataTypeMap& gDataTypeMap() { static DataTypeMap& gDataTypeMap() {
static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
return *g_data_type_map_; return *g_data_type_map_;
......
...@@ -8,6 +8,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place ...@@ -8,6 +8,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
...@@ -31,7 +32,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ...@@ -31,7 +32,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
...@@ -42,7 +42,7 @@ void FuseVarsOpHandle::RunImpl() { ...@@ -42,7 +42,7 @@ void FuseVarsOpHandle::RunImpl() {
out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel; s += numel;
} }
this->RunAndRecordEvent([this] {}); this->RunAndRecordEvent([] {});
} }
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
......
...@@ -86,7 +86,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars( ...@@ -86,7 +86,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
// TODO(Yancey1989): use a graceful method to find send op, // TODO(Yancey1989): use a graceful method to find send op,
// instead of the the hard code string // instead of the the hard code string
if (op->Type() == "send_vars") { if (op->Type() == "send") {
auto op_vars = op->InputArgumentNames(); auto op_vars = op->InputArgumentNames();
send_vars.reserve(send_vars.size() + send_vars.reserve(send_vars.size() +
std::distance(op_vars.begin(), op_vars.end())); std::distance(op_vars.begin(), op_vars.end()));
...@@ -471,22 +471,21 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, ...@@ -471,22 +471,21 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
auto &p = places_[0]; result->ops_.emplace_back(
auto *s = local_scopes_[0]; new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type()));
if (op.Type() == "send_barrier") { if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send_vars"); ConnectOp(result, result->ops_.back().get(), "send");
} else if (op.Type() == "recv") { } else if (op.Type() == "recv") {
ConnectOp(result, result->ops_.back().get(), "send_barrier"); ConnectOp(result, result->ops_.back().get(), "send_barrier");
} else if (op.Type() == "fetch_barrier") { } else if (op.Type() == "fetch_barrier") {
ConnectOp(result, result->ops_.back().get(), "recv"); ConnectOp(result, result->ops_.back().get(), "recv");
} else if (op.Type() == "send_vars") { } else if (op.Type() == "send") {
// do nothing // do nothing
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"rpc op should be in [" "rpc op should be in ["
"send_vars, send_barrier. recv, fetch_barrier]"); "send, send_barrier. recv, fetch_barrier]");
} }
// TODO(Yancey1989): schedule rpc op on different place may // TODO(Yancey1989): schedule rpc op on different place may
......
...@@ -19,12 +19,12 @@ namespace framework { ...@@ -19,12 +19,12 @@ namespace framework {
namespace details { namespace details {
RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
const Scope *local_scope, const platform::Place &place, const Scope *local_scope, const std::string &name,
const std::string &name) const platform::Place &place)
: op_(framework::OpRegistry::CreateOp(op_desc)), : op_(framework::OpRegistry::CreateOp(op_desc)),
local_scope_(local_scope), local_scope_(local_scope),
place_(place), name_(name),
name_(name) {} place_(place) {}
void RPCOpHandle::RunImpl() { void RPCOpHandle::RunImpl() {
// TODO(wuyi): need further analysis whether wait VarDummyHandle. // TODO(wuyi): need further analysis whether wait VarDummyHandle.
......
...@@ -29,7 +29,7 @@ namespace details { ...@@ -29,7 +29,7 @@ namespace details {
struct RPCOpHandle : public OpHandleBase { struct RPCOpHandle : public OpHandleBase {
RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
const platform::Place& place, const std::string& name); const std::string& name, const platform::Place& place);
std::string Name() const override; std::string Name() const override;
...@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase { ...@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase {
private: private:
std::unique_ptr<OperatorBase> op_; std::unique_ptr<OperatorBase> op_;
const Scope* local_scope_; const Scope* local_scope_;
const platform::Place& place_;
const std::string name_; const std::string name_;
platform::Place place_;
}; };
} // namespace details } // namespace details
......
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include <utility>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/graph_builder_factory.h" #include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
#include <fstream> #include <fstream>
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include "paddle/fluid/framework/details/ssa_graph_printer.h" #include "paddle/fluid/framework/details/ssa_graph_printer.h"
namespace paddle { namespace paddle {
...@@ -40,6 +41,8 @@ std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() { ...@@ -40,6 +41,8 @@ std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
res.reset(new SSAGraghBuilderWithPrinter( res.reset(new SSAGraghBuilderWithPrinter(
std::move(fout), std::move(graphviz_printer), std::move(res))); std::move(fout), std::move(graphviz_printer), std::move(res)));
} }
res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
return res; return res;
} }
} // namespace details } // namespace details
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph.h"
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
namespace paddle {
namespace framework {
namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->generated_op_ == nullptr) {
ready_vars.emplace(var);
}
};
for (auto &var_map : graph->vars_) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair.get());
}
}
}
for (auto &var : graph->dep_vars_) {
insert_pending_var(var.get());
}
for (auto &op : graph->ops_) {
if (op->Inputs().empty()) {
ready_ops.insert(op.get());
} else {
pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
}
}
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
}
}
set.clear();
};
while (!pending_vars.empty()) {
run_all_ops(ready_ops);
if (ready_vars.empty()) {
return false;
}
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->pending_ops_) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
}
}
}
ready_vars.clear();
}
return true;
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
namespace framework {
namespace details {
class SSAGraph;
// Decorator over another SSAGraphBuilder: delegates graph construction to the
// wrapped builder and then enforces that the result passes IsValidGraph().
class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
 public:
  // Takes ownership of `builder`, the builder whose output will be checked.
  explicit SSAGraghBuilderWithChecker(
      std::unique_ptr<SSAGraphBuilder>&& builder)
      : builder_(std::move(builder)) {}

  // Builds the graph with the wrapped builder and returns it unchanged.
  // Fails via PADDLE_ENFORCE when IsValidGraph() rejects the graph.
  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
    auto graph = builder_->Build(program);
    // Attach a message so a failed validation is diagnosable from the error.
    PADDLE_ENFORCE(IsValidGraph(graph.get()),
                   "Invalid SSA graph: IsValidGraph() check failed.");
    return graph;
  }

  // Returns whether `graph` passes the validity check (defined out of line).
  bool IsValidGraph(const SSAGraph* graph) const;

 private:
  // The wrapped builder that actually constructs the graph.
  std::unique_ptr<SSAGraphBuilder> builder_;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -185,6 +185,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ...@@ -185,6 +185,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
ready_vars->Push(var); ready_vars->Push(var);
} }
} }
void ThreadedSSAGraphExecutor::RunOp( void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) { BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] { auto op_run = [ready_var_q, op, this] {
......
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, ...@@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id); platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
auto ctx = Prepare(pdesc, block_id); auto ctx = Prepare(pdesc, block_id);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
} }
...@@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
const std::string& feed_holder_name, const std::string& feed_holder_name,
const std::string& fetch_holder_name) { const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId); platform::RecordBlock b(kProgramId);
if (FLAGS_use_mkldnn) EnableMKLDNN(program);
bool has_feed_ops = bool has_feed_ops =
has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
bool has_fetch_ops = bool has_fetch_ops =
...@@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
unique_ptr_of_copy_program.reset(new ProgramDesc(program)); unique_ptr_of_copy_program.reset(new ProgramDesc(program));
copy_program = unique_ptr_of_copy_program.get(); copy_program = unique_ptr_of_copy_program.get();
} }
auto* global_block = copy_program->MutableBlock(0); auto* global_block = copy_program->MutableBlock(0);
if (!has_feed_ops) { if (!has_feed_ops) {
...@@ -378,5 +380,19 @@ void Executor::RunPreparedContext( ...@@ -378,5 +380,19 @@ void Executor::RunPreparedContext(
} }
} }
void Executor::EnableMKLDNN(const ProgramDesc& program) {
#ifdef PADDLE_WITH_MKLDNN
VLOG(3) << "use_mkldnn=True";
for (size_t bid = 0; bid < program.Size(); ++bid) {
auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
for (auto* op : block->AllOps()) {
if (op->HasAttr("use_mkldnn")) {
op->SetAttr("use_mkldnn", true);
}
}
}
#endif
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -81,6 +81,8 @@ class Executor { ...@@ -81,6 +81,8 @@ class Executor {
const std::string& feed_holder_name = "feed", const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch"); const std::string& fetch_holder_name = "fetch");
void EnableMKLDNN(const ProgramDesc& program);
private: private:
const platform::Place place_; const platform::Place place_;
}; };
......
...@@ -156,15 +156,15 @@ class OpKernelRegistrar : public Registrar { ...@@ -156,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
/** /**
* Macro to register OperatorKernel. * Macro to register OperatorKernel.
*/ */
#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ #define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ __reg_op_kernel_##op_type##_##library_type##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \ "REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \ static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \
#LIBRARY_TYPE); \ #library_type); \
int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ int TouchOpKernelRegistrar_##op_type##_##library_type() { \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \
return 0; \ return 0; \
} }
......
...@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { ...@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
} }
} }
// Returns true only when the input slot `name` exists on the op, holds
// exactly one argument, that argument is not the empty-variable placeholder,
// and the variable can be found in the scope. Enforces that a non-empty slot
// holds exactly one argument.
bool ExecutionContext::HasInput(const std::string& name) const {
  if (!op_.HasInputs(name)) return false;

  const auto& arg_names = Inputs(name);
  size_t count = arg_names.size();
  if (count == 0) return false;

  PADDLE_ENFORCE_EQ(count, 1UL,
                    "Input %s should not have more than one inputs", name);

  const auto& arg_name = arg_names[0];
  // The placeholder name means "no variable bound to this slot".
  if (arg_name == kEmptyVarName) return false;
  return scope_.FindVar(arg_name) != nullptr;
}
// Returns true only when the output slot `name` exists on the op, holds
// exactly one argument, that argument is not the empty-variable placeholder,
// and the variable can be found in the scope. Enforces that a non-empty slot
// holds exactly one argument.
bool ExecutionContext::HasOutput(const std::string& name) const {
  if (!op_.HasOutputs(name)) {
    return false;
  }
  auto& outs = Outputs(name);
  size_t length = outs.size();
  if (length == 0) {
    return false;
  }
  // The message previously said "inputs" (copied from HasInput), which
  // pointed at the wrong side of the op when this check failed.
  PADDLE_ENFORCE_EQ(length, 1UL,
                    "Output %s should not have more than one outputs", name);
  const auto& arg = outs[0];
  // The placeholder name means "no variable bound to this slot".
  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
  return var != nullptr;
}
template <> template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const { const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
auto* var = InputVar(name); auto* var = InputVar(name);
...@@ -661,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -661,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
} }
if (t != nullptr) { if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type())); int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(tmp == data_type || data_type == -1, PADDLE_ENFORCE(
"DataType of Paddle Op %s must be the same.", Type()); tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
data_type, tmp);
data_type = tmp; data_type = tmp;
} }
} }
......
...@@ -191,9 +191,9 @@ class ExecutionContext { ...@@ -191,9 +191,9 @@ class ExecutionContext {
return op_.Attr<T>(name); return op_.Attr<T>(name);
} }
bool HasInput(const std::string& name) const { return op_.HasInputs(name); } bool HasInput(const std::string& name) const;
bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); } bool HasOutput(const std::string& name) const;
size_t InputSize(const std::string& name) const { size_t InputSize(const std::string& name) const {
return op_.Inputs(name).size(); return op_.Inputs(name).size();
......
...@@ -22,8 +22,8 @@ limitations under the License. */ ...@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
#include "paddle/fluid/framework/details/graph_builder_factory.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -151,7 +151,8 @@ class TRTConvertValidation { ...@@ -151,7 +151,8 @@ class TRTConvertValidation {
// Compare two output // Compare two output
ASSERT_FALSE(fluid_out.empty()); ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out.size(); i++) { for (size_t i = 0; i < fluid_out.size(); i++) {
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6); // Loose the threshold for CI in different machine model.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
} }
} }
} }
......
...@@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); ...@@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_int32(repeat, 1, "Running the inference program repeat times");
DEFINE_bool(skip_cpu, false, "Skip the cpu test"); DEFINE_bool(skip_cpu, false, "Skip the cpu test");
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
TEST(inference, image_classification) { TEST(inference, image_classification) {
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
...@@ -59,10 +58,8 @@ TEST(inference, image_classification) { ...@@ -59,10 +58,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
LOG(INFO) << "Batch size is " << FLAGS_batch_size; LOG(INFO) << "Batch size is " << FLAGS_batch_size;
LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn;
TestInference<paddle::platform::CPUPlace, false, true>( TestInference<paddle::platform::CPUPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined, dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
FLAGS_use_mkldnn);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
} }
......
...@@ -27,7 +27,6 @@ limitations under the License. */ ...@@ -27,7 +27,6 @@ limitations under the License. */
DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(model_path, "", "Directory of the inference model.");
DEFINE_string(data_file, "", "File of input index data."); DEFINE_string(data_file, "", "File of input index data.");
DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_int32(repeat, 100, "Running the inference program repeat times");
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
DEFINE_int32(num_threads, 1, "Number of threads should be used"); DEFINE_int32(num_threads, 1, "Number of threads should be used");
...@@ -190,9 +189,6 @@ TEST(inference, nlp) { ...@@ -190,9 +189,6 @@ TEST(inference, nlp) {
std::unique_ptr<paddle::framework::ProgramDesc> inference_program; std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
/*model combined*/ false); /*model combined*/ false);
if (FLAGS_use_mkldnn) {
EnableMKLDNN(inference_program);
}
// always prepare context // always prepare context
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx; std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
ctx = executor.Prepare(*inference_program, 0); ctx = executor.Prepare(*inference_program, 0);
......
...@@ -22,6 +22,8 @@ limitations under the License. */ ...@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DECLARE_bool(use_mkldnn);
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor* input, void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, T lower, T upper) { paddle::framework::DDim dims, T lower, T upper) {
...@@ -133,24 +135,11 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes( ...@@ -133,24 +135,11 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return feed_target_shapes; return feed_target_shapes;
} }
void EnableMKLDNN(
const std::unique_ptr<paddle::framework::ProgramDesc>& program) {
for (size_t bid = 0; bid < program->Size(); ++bid) {
auto* block = program->MutableBlock(bid);
for (auto* op : block->AllOps()) {
if (op->HasAttr("use_mkldnn")) {
op->SetAttr("use_mkldnn", true);
}
}
}
}
template <typename Place, bool CreateVars = true, bool PrepareContext = false> template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1, const bool is_combined = false, const int repeat = 1, const bool is_combined = false) {
const bool use_mkldnn = false) {
// 1. Define place, executor, scope // 1. Define place, executor, scope
auto place = Place(); auto place = Place();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
...@@ -182,9 +171,6 @@ void TestInference(const std::string& dirname, ...@@ -182,9 +171,6 @@ void TestInference(const std::string& dirname,
"init_program", "init_program",
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined); inference_program = InitProgram(&executor, scope, dirname, is_combined);
if (use_mkldnn) {
EnableMKLDNN(inference_program);
}
} }
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
...@@ -210,7 +196,10 @@ void TestInference(const std::string& dirname, ...@@ -210,7 +196,10 @@ void TestInference(const std::string& dirname,
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
} }
// 6. Run the inference program // 6. If export Flags_use_mkldnn=True, use mkldnn related ops.
if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program);
// 7. Run the inference program
{ {
if (!CreateVars) { if (!CreateVars) {
// If users don't want to create and destroy variables every time they // If users don't want to create and destroy variables every time they
......
...@@ -166,8 +166,6 @@ function(op_library TARGET) ...@@ -166,8 +166,6 @@ function(op_library TARGET)
# NOTE(*): activation use macro to regist the kernels, set use_op manually. # NOTE(*): activation use macro to regist the kernels, set use_op manually.
if(${TARGET} STREQUAL "activation") if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n") file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "reduce")
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
elseif(${TARGET} STREQUAL "fake_dequantize") elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
else() else()
...@@ -191,16 +189,14 @@ if(WITH_DISTRIBUTE) ...@@ -191,16 +189,14 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
...@@ -210,15 +206,14 @@ if(WITH_DISTRIBUTE) ...@@ -210,15 +206,14 @@ if(WITH_DISTRIBUTE)
# listen_and_serv_op sum_op executor SERIAL) # listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU) if(WITH_GPU)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL)
listen_and_serv_op executor SERIAL)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif() endif()
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op) set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
endif() endif()
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
......
...@@ -24,12 +24,12 @@ namespace operators { ...@@ -24,12 +24,12 @@ namespace operators {
: public ::paddle::framework::OpProtoAndCheckerMaker { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
public: \ public: \
void Make() override { \ void Make() override { \
AddInput("X", "Input of " #OP_NAME "operator"); \ AddInput("X", "Input of " #OP_NAME " operator"); \
AddOutput("Out", "Output of" #OP_NAME "operator"); \ AddOutput("Out", "Output of " #OP_NAME " operator"); \
AddAttr<bool>("use_mkldnn", \ AddAttr<bool>("use_mkldnn", \
"(bool, default false) Only used in mkldnn kernel") \ "(bool, default false) Only used in mkldnn kernel") \
.SetDefault(false); \ .SetDefault(false); \
AddComment(#OP_COMMENT); \ AddComment(OP_COMMENT); \
} \ } \
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
// Registers the "arg_max" operator with its op class (ArgMinMaxOp), its
// proto maker (ArgMaxOpMaker) and EmptyGradOpMaker as the gradient maker
// (presumably meaning the op has no gradient -- confirm in op_registry).
REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp,
paddle::operators::ArgMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
// Registers the CPU kernels of "arg_max", one ArgMaxKernel instantiation per
// supported element type: float, double, int64_t, int32_t, int16_t, size_t
// and uint8_t.
REGISTER_OP_CPU_KERNEL(
arg_max,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
// Registers the CUDA kernels of "arg_max", one ArgMaxKernel instantiation per
// supported element type: float, double, int64_t, int32_t, int16_t, size_t
// and uint8_t. The type list mirrors the CPU registration of the same op.
REGISTER_OP_CUDA_KERNEL(
arg_max,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
size_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace operators {
// Tag used as a template argument to select between the arg-min and the
// arg-max variant of ArgMinMaxFunctor / ArgMinMaxKernel.
enum ArgMinMaxType { kArgMin, kArgMax };
// Primary template with an empty body. The callable functors are the partial
// specializations (one per ArgMinMaxType value) generated below by
// DECLARE_ARG_MIN_MAX_FUNCTOR.
template <typename DeviceContext, typename T, typename Tout, int64_t Rank,
ArgMinMaxType argMinMaxValue>
struct ArgMinMaxFunctor {};
// Generates the partial specialization of ArgMinMaxFunctor for one
// ArgMinMaxType value.
//   eigen_op_type        - name of the Eigen tensor method invoked on the
//                          input (`argmin` / `argmax` in the uses below).
//   enum_argminmax_value - the ArgMinMaxType enumerator being specialized.
// The generated operator() views `in` as a Rank-dimensional Eigen tensor and
// `*out` as a (Rank - 1)-dimensional one, calls eigen_op_type(axis) on the
// input, casts the result to Tout and assigns it on ctx's Eigen device.
// `axis` is forwarded unchanged; this functor does not normalize negative
// values.
#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \
template <typename DeviceContext, typename T, typename Tout, int64_t Rank> \
struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank, \
enum_argminmax_value> { \
void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
framework::LoDTensor* out, int64_t axis) { \
auto in_eigen = framework::EigenTensor<T, Rank>::From(in); \
auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out); \
out_eigen.device(*(ctx.eigen_device())) = \
in_eigen.eigen_op_type(axis).template cast<Tout>(); \
} \
}
// Instantiate the two specializations: kArgMin -> Eigen argmin,
// kArgMax -> Eigen argmax.
DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
// Kernel shared by the arg_min and arg_max operators.
//
// Template parameters:
//   DeviceContext      - device context type the kernel runs on.
//   T                  - element type of the input tensor "X".
//   Tout               - element type of the index tensor "Out".
//   EnumArgMinMaxValue - kArgMin or kArgMax; selects the functor to run.
template <typename DeviceContext, typename T, typename Tout,
          ArgMinMaxType EnumArgMinMaxValue>
class ArgMinMaxKernel : public framework::OpKernel<T> {
 public:
  // Reads input "X" and attribute "axis", allocates "Out" with element type
  // Tout on the execution place, and dispatches to the ArgMinMaxFunctor
  // specialization matching the rank of "X".
  // Throws via PADDLE_THROW when the rank of "X" is not in [1, 6].
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& x = *(ctx.Input<framework::LoDTensor>("X"));
    auto& out = *(ctx.Output<framework::LoDTensor>("Out"));
    out.mutable_data<Tout>(ctx.GetPlace());

    int64_t axis = ctx.Attr<int64_t>("axis");
    // ArgMinMaxOp::InferShape accepts axis in [-Rank(X), Rank(X)) and maps a
    // negative value to axis + Rank(X) when it computes the output shape.
    // Apply the same mapping here, otherwise the functor would receive the
    // raw negative value and reduce along a dimension that does not match
    // the inferred output shape.
    if (axis < 0) axis += x.dims().size();

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

// The Eigen tensor rank is a compile-time constant, so each supported runtime
// rank needs its own functor instantiation.
#define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
  ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
      functor##rank;                                                 \
  functor##rank(dev_ctx, x, &out, axis)

    switch (x.dims().size()) {
      case 1:
        CALL_ARG_MINMAX_FUNCTOR(1);
        break;
      case 2:
        CALL_ARG_MINMAX_FUNCTOR(2);
        break;
      case 3:
        CALL_ARG_MINMAX_FUNCTOR(3);
        break;
      case 4:
        CALL_ARG_MINMAX_FUNCTOR(4);
        break;
      case 5:
        CALL_ARG_MINMAX_FUNCTOR(5);
        break;
      case 6:
        CALL_ARG_MINMAX_FUNCTOR(6);
        break;
      default:
        PADDLE_THROW(
            "%s operator doesn't support tensors whose ranks are greater "
            "than 6.",
            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
        break;
    }
#undef CALL_ARG_MINMAX_FUNCTOR
  }
};
// arg_min kernel: ArgMinMaxKernel fixed to kArgMin with int64_t as the output
// (index) element type.
template <typename DeviceContext, typename T>
using ArgMinKernel =
ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMin>;
// arg_max kernel: ArgMinMaxKernel fixed to kArgMax with int64_t as the output
// (index) element type.
template <typename DeviceContext, typename T>
using ArgMaxKernel =
ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMax>;
// Operator class shared by arg_min and arg_max. Only shape inference is
// defined here.
class ArgMinMaxOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Verifies that input "X" and output "Out" are present and that attribute
  // "axis" lies in [-Rank(X), Rank(X)), then sets the shape of "Out" to the
  // shape of "X" with the `axis` dimension removed.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");

    const auto& x_dims = ctx->GetInputDim("X");
    int64_t axis = ctx->Attrs().Get<int64_t>("axis");
    PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(),
                   "'axis' must be inside [-Rank(X), Rank(X))");

    const int64_t rank = x_dims.size();
    // A negative axis counts back from the last dimension.
    const int64_t reduced_dim = (axis < 0) ? axis + rank : axis;

    // Copy every dimension except the one being reduced.
    std::vector<int64_t> out_shape;
    for (int64_t d = 0; d < rank; ++d) {
      if (d == reduced_dim) continue;
      out_shape.push_back(x_dims[d]);
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
  }
};
// Proto maker shared by the arg_min and arg_max operators. Subclasses supply
// the two strings that are substituted into the generated operator comment.
class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
 protected:
  // Operator name used as the title of the comment (first %s below).
  virtual const char* OpName() const = 0;
  // Word describing which elements are located (second %s below).
  virtual const char* Name() const = 0;

 public:
  // Declares input "X", output "Out", the int64_t attribute "axis" and the
  // operator comment built from OpName() and Name().
  void Make() override {
    AddInput("X", "Input tensor.");
    AddOutput("Out", "Output tensor.");
    AddAttr<int64_t>("axis", "The axis in which to compute the arg indices.");
    AddComment(string::Sprintf(R"DOC(
%s Operator.
Computes the indices of the %s elements of the input tensor's element
along the provided axis.
)DOC",
                               OpName(), Name()));
  }
};
// Proto maker for arg_min: fills the BaseArgMinMaxOpMaker comment template
// with "ArgMin" (operator title) and "min" (element description).
class ArgMinOpMaker : public BaseArgMinMaxOpMaker {
protected:
const char* OpName() const override { return "ArgMin"; }
const char* Name() const override { return "min"; }
};
// Proto maker for arg_max: fills the BaseArgMinMaxOpMaker comment template
// with "ArgMax" (operator title) and "max" (element description).
class ArgMaxOpMaker : public BaseArgMinMaxOpMaker {
protected:
const char* OpName() const override { return "ArgMax"; }
const char* Name() const override { return "max"; }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
// Registers the `arg_min` operator with ArgMinMaxOp as its operator class and
// ArgMinOpMaker as its proto maker.
// NOTE(review): EmptyGradOpMaker presumably means no gradient operator is
// generated for arg_min - confirm against the framework definition.
REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp,
paddle::operators::ArgMinOpMaker,
paddle::framework::EmptyGradOpMaker);
// Registers the CPU kernels of the `arg_min` operator: one ArgMinKernel
// instantiation on CPUDeviceContext per supported input element type
// (float, double, int64_t, int32_t, int16_t, size_t, uint8_t).
REGISTER_OP_CPU_KERNEL(
arg_min,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
int64_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
int32_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h"
// Registers the CUDA kernels of the `arg_min` operator: one ArgMinKernel
// instantiation on CUDADeviceContext per supported input element type
// (float, double, int64_t, int32_t, int16_t, size_t, uint8_t).
REGISTER_OP_CUDA_KERNEL(
arg_min,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
size_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
...@@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { ...@@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() final { void Make() final {
AddInput("Input", AddInput(
"(Tensor) Tensor " "Input",
"whose input_dim_idx'th dimension specifies the batch_size"); "Tensor whose input_dim_idx'th dimension specifies the batch_size");
AddOutput("Out", AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled " "Tensor of specified shape will be filled "
"with the specified value"); "with the specified value");
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output"); AddAttr<std::vector<int>>("shape", "The shape of the output");
AddAttr<int>("input_dim_idx", AddAttr<int>("input_dim_idx",
"(int, default 0) The index of input's batch size dimension") "default 0. The index of input's batch size dimension")
.SetDefault(0); .SetDefault(0);
AddAttr<int>("output_dim_idx", AddAttr<int>("output_dim_idx",
"(int, default 0) The index of output's batch size dimension") "default 0. The index of output's batch size dimension")
.SetDefault(0); .SetDefault(0);
Apply(); Apply();
} }
......
...@@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", AddInput("X",
"(Tensor) The input tensor of bilinear interpolation, " "The input tensor of bilinear interpolation, "
"This is a 4-D tensor with shape of (N x C x h x w)"); "This is a 4-D tensor with shape of (N x C x h x w)");
AddInput("OutSize", AddInput("OutSize",
"(Tensor) This is a 1-D tensor with two number. " "This is a 1-D tensor with two number. "
"The first number is height and the second number is width.") "The first number is height and the second number is width.")
.AsDispensable(); .AsDispensable();
AddOutput("Out", AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
"(Tensor) The dimension of output is (N x C x out_h x out_w]");
AddAttr<int>("out_h", "(int) output height of bilinear interpolation op."); AddAttr<int>("out_h", "output height of bilinear interpolation op.");
AddAttr<int>("out_w", "(int) output width of bilinear interpolation op."); AddAttr<int>("out_w", "output width of bilinear interpolation op.");
AddComment(R"DOC( AddComment(R"DOC(
Bilinear interpolation is an extension of linear interpolation for Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and interpolating functions of two variables (e.g. H-direction and
......
...@@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel { ...@@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", y_dim); ctx->SetOutputDim("Out", y_dim);
} }
} }
// Selects the kernel from the element type of input "X" and the device
// context of the current execution.
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  const auto* x = ctx.Input<framework::LoDTensor>("X");
  const auto x_data_type = framework::ToDataType(x->type());
  return framework::OpKernelType(x_data_type, ctx.device_context());
}
}; };
class CropOpMaker : public framework::OpProtoAndCheckerMaker { class CropOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
"The input used as reference for cropping, " "The input used as reference for cropping, "
"which is of the same dimensions as X.") "which is of the same dimensions as X.")
.AsDispensable(); .AsDispensable();
AddInput("Offsets",
"The input used to describe offsets in runtime, which is a "
"1-D vector whose size equals to the rank of input 'X'. The "
"elements data type must be int.")
.AsDispensable();
AddOutput("Out", AddOutput("Out",
"The output of crop op, " "The output of crop op, "
"which is of the same dimensions as X."); "which is of the same dimensions as X.");
AddAttr<std::vector<int>>("offsets", AddAttr<std::vector<int>>("offsets",
"A list<int> describing offsets to be cropped. " "A list<int> describing offsets to be cropped. "
"The size of offsets list should be the same as " "The size of offsets list should be the same as "
"the dimension size of input X."); "the dimension size of input X.")
.SetDefault(std::vector<int>());
AddAttr<std::vector<int>>("shape", AddAttr<std::vector<int>>("shape",
"A list<int> describing the shape of output. " "A list<int> describing the shape of output. "
"The size of shape list should be the same as " "The size of shape list should be the same as "
...@@ -77,6 +90,17 @@ Crop Operator. ...@@ -77,6 +90,17 @@ Crop Operator.
Crop input into output, as specified by offsets and shape. Crop input into output, as specified by offsets and shape.
There are two ways to set the offsets:
1. In runtime: Using the input 'Offsets', which is a Variable and can be
output of other operators. This way is suitable for
dynamic offsets.
2. In network configuration: Using the attribute 'offsets', which will be
set in Python configure script. This way is
suitable for fixed offsets.
You CANNOT use these two ways at the same time. An exception will be raised
if input 'Offsets' is configured and meanwhile the attribute 'offsets' is
not empty.
There are two ways to set shape: There are two ways to set shape:
1. reference input: crop input X into the same shape as reference input. 1. reference input: crop input X into the same shape as reference input.
The dimension of reference input should The dimension of reference input should
...@@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel { ...@@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(x_grad_name, x_dims); ctx->SetOutputDim(x_grad_name, x_dims);
} }
} }
// Selects the kernel from the element type of the gradient of "Out" and the
// device context of the current execution.
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  const auto* out_grad =
      ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
  const auto grad_data_type = framework::ToDataType(out_grad->type());
  return framework::OpKernelType(grad_data_type, ctx.device_context());
}
}; };
} // namespace operators } // namespace operators
......
...@@ -27,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor, ...@@ -27,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>; using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using framework::Tensor; using framework::Tensor;
// Returns the crop offsets, one per dimension of input "X".
//
// The offsets come from the attribute "offsets" unless the optional input
// tensor "Offsets" is present. When the input is present the attribute must
// be empty, and the tensor must be 1-D with as many elements as "X" has
// dimensions. A tensor that is not on a CPU place is first copied to the CPU.
// Any violated check raises through PADDLE_ENFORCE / PADDLE_ENFORCE_EQ.
static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
  int rank = ctx.Input<Tensor>("X")->dims().size();

  // Static configuration: take the attribute as-is after checking its size.
  if (!ctx.HasInput("Offsets")) {
    std::vector<int> res = ctx.Attr<std::vector<int>>("offsets");
    PADDLE_ENFORCE_EQ(
        rank, res.size(),
        "Offsets size should be equal to dimension size of input tensor.");
    return res;
  }

  // Runtime configuration: the two sources are mutually exclusive.
  PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
                 "Input 'Offsets' and attribute 'offsets' should not be used "
                 "at the same time.");
  const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
  PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
  PADDLE_ENFORCE_EQ(
      rank, offsets_tensor->dims()[0],
      "Offsets size should be equal to dimension size of input tensor.");

  // Read from the tensor directly when it is on the CPU, otherwise from a
  // synchronous CPU copy that lives until the values have been extracted.
  framework::Tensor cpu_tmp_tensor;
  const Tensor* host_tensor = offsets_tensor;
  if (!platform::is_cpu_place(offsets_tensor->place())) {
    framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
                              &cpu_tmp_tensor);
    host_tensor = &cpu_tmp_tensor;
  }
  const int* offsets_data = host_tensor->data<int>();
  return std::vector<int>(offsets_data, offsets_data + rank);
}
template <typename T> template <typename T>
class CropKernel : public framework::OpKernel<T> { class CropKernel : public framework::OpKernel<T> {
public: public:
...@@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> { ...@@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(context.GetPlace()); T* out_data = out->mutable_data<T>(context.GetPlace());
auto x_stride = framework::stride(x->dims()); auto x_stride = framework::stride(x->dims());
auto out_stride = framework::stride(out->dims()); auto out_stride = framework::stride(out->dims());
auto offsets = context.Attr<std::vector<int>>("offsets"); auto offsets = GetOffsets(context);
PADDLE_ENFORCE_EQ(
x->dims().size(), static_cast<int64_t>(offsets.size()),
"Offsets size should be equal to dimension size of input tensor.");
int64_t offset = 0; int64_t offset = 0;
for (size_t i = 0; i < offsets.size(); ++i) { for (size_t i = 0; i < offsets.size(); ++i) {
offset += (x_stride[i] * offsets[i]); offset += (x_stride[i] * offsets[i]);
...@@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) { ...@@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
if (d_x != nullptr) { if (d_x != nullptr) {
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out")); auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(context.GetPlace()); d_x->mutable_data<T>(context.GetPlace());
auto offsets = context.Attr<std::vector<int>>("offsets"); auto offsets = GetOffsets(context);
Eigen::array<std::pair<int, int>, D> paddings; Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < D; ++i) { for (size_t i = 0; i < D; ++i) {
paddings[i].first = offsets[i]; paddings[i].first = offsets[i];
......
...@@ -80,7 +80,6 @@ class RequestHandler { ...@@ -80,7 +80,6 @@ class RequestHandler {
} }
framework::ProgramDesc* program() { return program_; } framework::ProgramDesc* program() { return program_; }
framework::Executor* executor() { return executor_; } framework::Executor* executor() { return executor_; }
std::vector<framework::Variable*>& sparse_vars() { return sparse_vars_; }
// This function processes user's rpc request. // This function processes user's rpc request.
// The implemention is in request_handler_impl. // The implemention is in request_handler_impl.
...@@ -113,13 +112,7 @@ class RequestHandler { ...@@ -113,13 +112,7 @@ class RequestHandler {
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>* std::shared_ptr<framework::ExecutorPrepareContext>>*
grad_to_prepared_ctx_; grad_to_prepared_ctx_;
// Record received sparse variables, so that
// we could reset those after execute optimize program
std::vector<framework::Variable*> sparse_vars_;
RPCServer* rpc_server_; RPCServer* rpc_server_;
std::mutex sparse_var_mutex_;
}; };
} // namespace detail } // namespace detail
......
...@@ -63,16 +63,22 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -63,16 +63,22 @@ bool RequestSendHandler::Handle(const std::string& varname,
PADDLE_THROW("sync: Can not find server side var"); PADDLE_THROW("sync: Can not find server side var");
return false; return false;
} }
if (invar->IsType<framework::SelectedRows>()) { if (invar->IsType<framework::SelectedRows>()) {
std::unique_lock<std::mutex> lock(sparse_var_mutex_); std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
sparse_vars_.push_back(invar); sparse_vars_.push_back(invar);
} }
} }
return true; return true;
} }
// Clears the rows of every SelectedRows variable recorded in sparse_vars_ and
// then empties the record itself. mutex_sparse_vars_ is held for the whole
// operation.
void RequestSendHandler::ResetSparseVarRecorder() {
  std::lock_guard<std::mutex> guard(mutex_sparse_vars_);
  for (auto it = sparse_vars_.begin(); it != sparse_vars_.end(); ++it) {
    auto* rows = (*it)->GetMutable<framework::SelectedRows>()->mutable_rows();
    rows->clear();
  }
  sparse_vars_.clear();
}
bool RequestGetHandler::Handle(const std::string& varname, bool RequestGetHandler::Handle(const std::string& varname,
framework::Scope* scope, framework::Scope* scope,
framework::Variable* invar, framework::Variable* invar,
......
...@@ -41,6 +41,11 @@ class RequestSendHandler final : public RequestHandler { ...@@ -41,6 +41,11 @@ class RequestSendHandler final : public RequestHandler {
virtual ~RequestSendHandler() {} virtual ~RequestSendHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope, bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar) override; framework::Variable* var, framework::Variable** outvar) override;
void ResetSparseVarRecorder();
private:
std::mutex mutex_sparse_vars_;
std::vector<framework::Variable*> sparse_vars_;
}; };
class RequestGetHandler final : public RequestHandler { class RequestGetHandler final : public RequestHandler {
......
...@@ -60,6 +60,7 @@ class RPCServer { ...@@ -60,6 +60,7 @@ class RPCServer {
void SetCond(const std::string& rpc_name); void SetCond(const std::string& rpc_name);
void WaitCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name);
void IncreaseBatchBarrier(const std::string rpc_name); void IncreaseBatchBarrier(const std::string rpc_name);
void ResetBarrierCounter(); void ResetBarrierCounter();
protected: protected:
......
...@@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp { ...@@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
protected: protected:
void Apply() override { void Apply() override {
AddAttr<int>("dtype", AddAttr<int>(
"(int, default 5 (FP32)) " "dtype",
"Output data type") "It could be numpy.dtype. Output data type. Default is float32")
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
AddAttr<float>("value", "(float, default 0) The value to be filled") AddAttr<float>("value", "default 0. The value to be filled")
.SetDefault(0.0f); .SetDefault(0.0f);
AddComment(R"DOC( AddComment(R"DOC(
FillConstantBatchSizeLike Operator. This function creates a tensor of specified *shape*, *dtype* and batch size,
and initializes this with a constant supplied in *value*. The batch size is
Fill up a variable with specified constant value. obtained from the `input` tensor.
)DOC"); )DOC");
} }
......
...@@ -43,7 +43,8 @@ TEST(Gather, GatherData) { ...@@ -43,7 +43,8 @@ TEST(Gather, GatherData) {
auto* cpu_place = new paddle::platform::CPUPlace(); auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::platform::CPUDeviceContext ctx(*cpu_place);
paddle::operators::CPUGather<int>(ctx, *src, *index, output); paddle::operators::CPUGather<int>(ctx, *src, *index, output);
delete cpu_place;
cpu_place = NULL;
for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
......
...@@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
"mini-batch. Note: S is equal to the sequence number in a mini-batch. " "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
"The output is no longer a LoDTensor."); "The output is no longer a LoDTensor.");
AddComment(R"DOC( AddComment(R"DOC(
LinearChainCRF Operator.
Conditional Random Field defines an undirected probabilistic graph with nodes Conditional Random Field defines an undirected probabilistic graph with nodes
denoting random variables and edges denoting dependencies between these denoting random variables and edges denoting dependencies between these
variables. CRF learns the conditional probability $P(Y|X)$, where variables. CRF learns the conditional probability $P(Y|X)$, where
......
...@@ -108,9 +108,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, ...@@ -108,9 +108,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr)); std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
// Record received sparse variables, so that
// we could reset those after execute optimize program
std::vector<framework::Variable *> sparse_vars;
while (true) { while (true) {
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient. // the gradients arrives, just add suffix 0~n and merge the gradient.
...@@ -146,18 +143,12 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, ...@@ -146,18 +143,12 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
// Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next
// mini-batch.
// TODO(Yancey1989): move the reset action into an operator, we couldn't
// have any hide logic in the operator.
for (framework::Variable *var : sparse_vars) {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(detail::kRequestGet);
rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet);
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
// reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder();
} // while(true) } // while(true)
} }
......
...@@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase { ...@@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase {
class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddOutput("Out", "(Tensor) The tensor need to be loaded"); AddOutput("Out", "The tensor need to be loaded");
AddAttr<bool>( AddAttr<bool>(
"load_as_fp16", "load_as_fp16",
"(boolean, default false)"
"If true, the tensor will be first loaded and then " "If true, the tensor will be first loaded and then "
"converted to float16 data type. Otherwise, the tensor will be " "converted to float16 data type. Otherwise, the tensor will be "
"directly loaded without data type conversion.") "directly loaded without data type conversion. Default is false.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::string>("file_path", AddAttr<std::string>("file_path",
"(string) " R"(Variable will be loaded from "file_path")")
"Variable will be loaded from \"file_path\".")
.AddCustomChecker( .AddCustomChecker(
[](const std::string &path) { return !path.empty(); }); [](const std::string &path) { return !path.empty(); });
AddComment(R"DOC( AddComment("Load operator will load a tensor variable from disk file.");
Load Operator.
Load operator will load a tensor variable from disk file.
)DOC");
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) { ...@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) {
paddle::platform::CPUDeviceContext context(*cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place);
GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
input2_ptr + 3, 3, 1, input3_ptr + 1, 4); input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
delete cpu_place;
cpu_place = NULL;
EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24); EXPECT_EQ(input3_ptr[1], 24);
......
...@@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase { ...@@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("RankTable", "The lod_rank_table."); AddInput("RankTable", "Input variable which is a LoDRankTable object");
AddOutput("Out", "The max sequence length."); AddOutput("Out", "The max sequence length");
AddComment( AddComment(R"DOC(
R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); Given a LoDRankTable object, this layer returns the max length of
a batch of sequences. In fact, a LoDRankTable object contains a list of
tuples(<sequence index, sequence length>) and the list is already sorted by
sequence length in descending order, so the operator just returns the
sequence length of the first tuple element
)DOC");
} }
}; };
......
...@@ -18,9 +18,14 @@ limitations under the License. */ ...@@ -18,9 +18,14 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using mkldnn::memory; // Note: paddle has also "memory" namespace using framework::DataLayout;
using mkldnn::pooling_forward; using mkldnn::memory;
using mkldnn::pooling_backward; using mkldnn::pooling_backward;
using mkldnn::pooling_forward;
using mkldnn::primitive;
using mkldnn::reorder;
using mkldnn::stream;
using platform::to_void_cast;
// Generate keys for storing/retriving primitives for this operator // Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial // TODO(jczaja): Make hashing function more optimial
...@@ -55,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -55,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const Tensor* input = ctx.Input<Tensor>("X"); const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out"); Tensor* output = ctx.Output<Tensor>("Out");
// Get an unique name from "argument" name of "Out" variable PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
// This name will be used as key when saving info into device context input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
std::string pooling_type = ctx.Attr<std::string>("pooling_type"); std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize"); std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
...@@ -82,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -82,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims()); std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
auto input_format = input->format();
memory::format output_format{memory::format::format_undef};
const std::string key = gethash(src_tz, pooling_type, ksize, strides, const std::string key = gethash(src_tz, pooling_type, ksize, strides,
paddings, ctx.op().Output("Out")); paddings, ctx.op().Output("Out"));
const std::string key_pool_p = key + "@pool_p"; const std::string key_pool_p = key + "@pool_p";
...@@ -94,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -94,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto pool_p = auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p)); std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) { if (pool_p == nullptr) {
// TODO(pzelazko-intel): support more formats auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), input_format);
auto src_md = /* create memory descriptor for pooling without specified format
platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType<T>(), * ('any') which lets a primitive (pooling in this case) choose
mkldnn::memory::format::nchw); * the memory format preferred for best performance
auto dst_md = */
platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType<T>(), auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw); mkldnn::memory::format::any);
std::shared_ptr<pooling_forward::primitive_desc> pool_pd = std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
pooling_type, mkldnn_engine); pooling_type, mkldnn_engine);
...@@ -116,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -116,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// save pool_workspace_memory to be referred in backward path // save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto pool_src_memory_p = std::make_shared<memory>( auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
memory::primitive_desc{src_md, mkldnn_engine}, to_void_cast<T>(input_data));
static_cast<void*>(const_cast<T*>(input_data))); auto dst_memory =
dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);
auto pool_dst_memory_p = std::make_shared<memory>( dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
memory::primitive_desc{dst_md, mkldnn_engine}, dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
static_cast<void*>(output_data));
dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()),
*(dst_memory.get()),
*workspace_memory);
pool_p = std::make_shared<pooling_forward>(
*pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()),
*workspace_memory);
dev_ctx.SetBlob(key_pool_p, pool_p); dev_ctx.SetBlob(key_pool_p, pool_p);
output_format =
(memory::format)dst_memory->get_primitive_desc().desc().data.format;
} else { } else {
// Primitives already exist // Primitives already exist
auto pool_src_memory_p = auto pool_src_memory_p =
...@@ -140,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -140,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p)); std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
PADDLE_ENFORCE(pool_dst_memory_p != nullptr, PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
"Fail to find pooling dst mem_p in device context"); "Fail to find pooling dst mem_p in device context");
pool_src_memory_p->set_data_handle( pool_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
reinterpret_cast<void*>(const_cast<T*>(input_data)));
pool_dst_memory_p->set_data_handle(output_data); pool_dst_memory_p->set_data_handle(output_data);
output_format = (memory::format)pool_dst_memory_p->get_primitive_desc()
.desc()
.data.format;
} }
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{*(pool_p.get())}; std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(output_format);
} }
private: private:
...@@ -194,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -194,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out")); const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X")); Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN &&
in_x->format() != memory::format::format_undef,
"Wrong layout/format set for Input X tensor");
PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN &&
out_grad->format() != memory::format::format_undef,
"Wrong layout/format set for Input output_grad tensor");
std::string pooling_type = ctx.Attr<std::string>("pooling_type"); std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize"); std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
...@@ -212,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -212,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const T* out_grad_data = out_grad->data<T>(); const T* out_grad_data = out_grad->data<T>();
T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace()); T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
memory::format in_x_grad_format{memory::format::format_undef};
std::vector<int> diff_src_tz = std::vector<int> diff_src_tz =
paddle::framework::vectorize2int(in_x_grad->dims()); paddle::framework::vectorize2int(in_x_grad->dims());
...@@ -225,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -225,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_bwd_p = key + "@pool_bwd_p";
const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory = const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory"; key + "@pool_workspace_memory";
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()},
mkldnn_engine},
to_void_cast<T>(out_grad_data));
std::shared_ptr<memory> diff_src_memory;
std::shared_ptr<memory> diff_dst_memory;
auto dst_memory =
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
PADDLE_ENFORCE(dst_memory != nullptr,
"Fail to find dst_memory in device context");
primitive reorder_diff_dst;
bool is_diff_dst_reordered = false;
auto pool_bwd_p = std::static_pointer_cast<pooling_backward>( auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
dev_ctx.GetBlob(key_pool_bwd_p)); dev_ctx.GetBlob(key_pool_bwd_p));
if (pool_bwd_p == nullptr) { if (pool_bwd_p == nullptr) {
auto diff_src_md = // Retrieve src_memory/dst_memory saved in forward pass
platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(), auto src_memory =
mkldnn::memory::format::nchw); std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
auto diff_dst_md = PADDLE_ENFORCE(src_memory != nullptr,
platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType<T>(), "Fail to find src_memory in device context");
mkldnn::memory::format::nchw);
// Retrieve pool_pd/pool_workspace_memory from device context // Retrieve pool_pd/pool_workspace_memory from device context
auto pool_pd = auto pool_pd =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>( std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx.GetBlob(key_pool_pd)); dev_ctx.GetBlob(key_pool_pd));
PADDLE_ENFORCE(pool_pd != nullptr, PADDLE_ENFORCE(pool_pd != nullptr,
"Fail to find pool_pd in device context"); "Fail to find pool_pd in device context");
auto workspace_memory = std::static_pointer_cast<memory>(
auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_pool_workspace_memory)); dev_ctx.GetBlob(key_pool_workspace_memory));
PADDLE_ENFORCE(workspace_memory != nullptr, PADDLE_ENFORCE(workspace_memory != nullptr,
"Fail to find workspace_memory in device context"); "Fail to find workspace_memory in device context");
auto pool_diff_src_memory_p = std::make_shared<memory>(memory( // create memory descriptors for pooling
{diff_src_md, mkldnn_engine}, static_cast<void*>(in_x_grad_data))); auto diff_src_md = src_memory.get()->get_primitive_desc().desc();
dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc();
auto pool_diff_dst_memory_p = std::make_shared<memory>(
memory({diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(out_grad_data))));
dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p);
auto pool_bwd_desc = mkldnn::pooling_backward::desc( auto pool_bwd_desc = mkldnn::pooling_backward::desc(
pooling_type == "max" ? mkldnn::algorithm::pooling_max pooling_type == "max" ? mkldnn::algorithm::pooling_max
...@@ -267,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -267,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
pool_bwd_desc, mkldnn_engine, *pool_pd); pool_bwd_desc, mkldnn_engine, *pool_pd);
// reorder between user_diff_dst and pool diff_dst if needed
diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory =
std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
diff_src_memory = std::make_shared<memory>(
pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data);
dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory);
dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
pool_bwd_p = std::make_shared<pooling_backward>( pool_bwd_p = std::make_shared<pooling_backward>(
pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
*(pool_diff_src_memory_p)); *(diff_src_memory));
dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
} else { } else {
// Primitives already exist // Primitives already exist
auto pool_diff_src_memory_p = std::static_pointer_cast<memory>( diff_src_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_diff_src_mem_p)); dev_ctx.GetBlob(key_pool_diff_src_mem_p));
PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, PADDLE_ENFORCE(diff_src_memory != nullptr,
"Fail to find pooling src mem_p in device context"); "Fail to find pooling src mem_p in device context");
auto pool_diff_dst_memory_p = std::static_pointer_cast<memory>( diff_dst_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, PADDLE_ENFORCE(diff_dst_memory != nullptr,
"Fail to find pooling dst mem_p in device context"); "Fail to find pooling dst mem_p in device context");
pool_diff_src_memory_p->set_data_handle(
reinterpret_cast<void*>(in_x_grad_data)); diff_src_memory->set_data_handle(reinterpret_cast<void*>(in_x_grad_data));
pool_diff_dst_memory_p->set_data_handle(const_cast<T*>(out_grad_data)); diff_dst_memory->set_data_handle(const_cast<T*>(out_grad_data));
// reorder between user_diff_dst and pool diff_dst if needed
if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory =
std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
} }
in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format;
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{*(pool_bwd_p.get())}; std::vector<mkldnn::primitive> pipeline;
if (is_diff_dst_reordered) {
pipeline.push_back(reorder_diff_dst);
}
pipeline.push_back(*(pool_bwd_p.get()));
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
in_x_grad->set_layout(DataLayout::kMKLDNN);
in_x_grad->set_format(in_x_grad_format);
} // Compute() } // Compute()
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNOpKernel<float>); ops::PoolMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNGradOpKernel<float>); ops::PoolMKLDNNGradOpKernel<float>);
...@@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel { ...@@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
...@@ -36,11 +35,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -36,11 +35,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Seed", "The random seed."); AddInput("Seed", "The random seed.");
AddOutput("Out", "The cropped instance batch."); AddOutput("Out", "The cropped instance batch.");
AddOutput("SeedOut", "The random seed after random cropping.") AddOutput("SeedOut", "The random seed after random cropping.")
.AsDispensable(); .AsIntermediate();
AddAttr<std::vector<int>>("shape", "The shape of a cropped instance."); AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
AddComment(R"DOC( AddComment(R"DOC(
This operator takes a batch of instance, and do random cropping on each instance. This operator takes a batch of instance, and do random cropping on each instance.
It means that cropping positions differs on each instance, which is determined It means that cropping positions differs on each instance, which is determined
by an uniform random generator. All cropped instances have the same shape, which by an uniform random generator. All cropped instances have the same shape, which
is determined by the operator's attribute 'shape'. is determined by the operator's attribute 'shape'.
)DOC"); )DOC");
......
...@@ -78,9 +78,15 @@ This operator can get variables from server side. ...@@ -78,9 +78,15 @@ This operator can get variables from server side.
} }
}; };
// Shape-inference functor for the recv operator. Its call operator has an
// empty body, so invoking it leaves the inference context untouched.
class RecvOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* /*ctx*/) const override {
    // Intentionally a no-op: nothing is read from or written to the context.
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker,
ops::RecvOpMaker, ops::RecvOpShapeInference);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
// Declare the reduce_max operator. REGISTER_REDUCE_OP is not defined in this
// file; presumably it comes from the included reduce header -- confirm.
REGISTER_REDUCE_OP(reduce_max);
// CPU forward kernels for reduce_max, one per element type (float, double,
// int, int64_t), all parameterised with ops::MaxFunctor.
REGISTER_OP_CPU_KERNEL(
reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxFunctor>);
// CPU gradient kernels for reduce_max_grad over the same four element types,
// all parameterised with ops::MaxOrMinGradFunctor.
REGISTER_OP_CPU_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
// CUDA forward kernels for reduce_max, one per element type (float, double,
// int, int64_t), all parameterised with ops::MaxFunctor.
REGISTER_OP_CUDA_KERNEL(reduce_max,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MaxFunctor>);
// CUDA gradient kernels for reduce_max_grad over the same four element types,
// all parameterised with ops::MaxOrMinGradFunctor.
REGISTER_OP_CUDA_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h"
// Declare the reduce_mean operator. REGISTER_REDUCE_OP is not defined in this
// file; presumably it comes from the included reduce header -- confirm.
REGISTER_REDUCE_OP(reduce_mean);
// CPU forward kernels for reduce_mean, one per element type (float, double,
// int, int64_t), all parameterised with ops::MeanFunctor.
REGISTER_OP_CPU_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::MeanFunctor>);
// CPU gradient kernels for reduce_mean_grad over the same four element types,
// all parameterised with ops::MeanGradFunctor.
REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h"
// CUDA forward kernels for reduce_mean, one per element type (float, double,
// int, int64_t), all parameterised with ops::MeanFunctor.
REGISTER_OP_CUDA_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MeanFunctor>);
// CUDA gradient kernels for reduce_mean_grad over the same four element
// types, all parameterised with ops::MeanGradFunctor.
REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
// Forward functor for the mean reduction: assigns `x->mean(dim)` to `y`,
// routed through `y->device(place)`.
struct MeanFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    // Name the reduction expression first, then assign it to the output.
    const auto averaged = x->mean(dim);
    y->device(place) = averaged;
  }
};
// Backward functor for the mean reduction: every element of `dx` receives the
// broadcast upstream gradient `dy` divided by `size`.
// `x` and `y` are part of the common functor signature but are not read here.
// NOTE(review): `size` is presumably the number of input elements folded into
// each output element -- confirm against the calling kernel.
struct MeanGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    const auto upstream = dy->broadcast(dim);
    const auto divisor = dx->constant(size);
    dx->device(place) = upstream / divisor;
  }
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
// Forward functor for the max reduction: assigns `x->maximum(dim)` to `y`,
// routed through `y->device(place)`.
struct MaxFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    // Name the reduction expression first, then assign it to the output.
    const auto largest = x->maximum(dim);
    y->device(place) = largest;
  }
};
// Forward functor for the min reduction: assigns `x->minimum(dim)` to `y`,
// routed through `y->device(place)`.
struct MinFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    // Name the reduction expression first, then assign it to the output.
    const auto smallest = x->minimum(dim);
    y->device(place) = smallest;
  }
};
// Backward functor shared by the max and min reductions.
// An input element receives the broadcast upstream gradient when it equals
// the broadcast forward result `y`, and zero otherwise.
struct MaxOrMinGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    // Positions where the input matches the reduced (max or min) value.
    const auto is_extremum = (*x) == y->broadcast(dim);
    const auto upstream = dy->broadcast(dim);
    // When several elements tie for the extremum, the subgradient of each is
    // the set [0, 1]; the gradient is passed to all of them.
    dx->device(place) =
        upstream * is_extremum.select(dx->constant(1), dx->constant(0));
  }
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
// Declare the reduce_min operator. REGISTER_REDUCE_OP is not defined in this
// file; presumably it comes from the included reduce header -- confirm.
REGISTER_REDUCE_OP(reduce_min);
// CPU forward kernels for reduce_min, one per element type (float, double,
// int, int64_t), all parameterised with ops::MinFunctor.
REGISTER_OP_CPU_KERNEL(
reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MinFunctor>);
// CPU gradient kernels for reduce_min_grad over the same four element types,
// all parameterised with ops::MaxOrMinGradFunctor.
REGISTER_OP_CPU_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
// CUDA forward kernels for reduce_min, one per element type (float, double,
// int, int64_t), all parameterised with ops::MinFunctor.
REGISTER_OP_CUDA_KERNEL(reduce_min,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MinFunctor>);
// CUDA gradient kernels for reduce_min_grad over the same four element types,
// all parameterised with ops::MaxOrMinGradFunctor.
REGISTER_OP_CUDA_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_op.h"
#include <algorithm>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
using framework::Tensor;
// Forward operator shared by the reduce_* ops registered in this file.
// It only provides shape inference; the computation lives in the kernels.
class ReduceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Derives the shape of Output(Out) from Input(X) and the attributes
  // `dim`, `keep_dim` and `reduce_all`.
  //
  //  * reduce_all && keep_dim  -> rank(X) dimensions, each of length 1
  //  * reduce_all && !keep_dim -> {1}
  //  * otherwise, keep_dim     -> X's shape with every reduced axis set to 1
  //  * otherwise, !keep_dim    -> X's shape with every reduced axis removed
  //
  // The LoD of X is shared with Out unless the first axis is reduced.
  // Enforce failures are raised for a missing X/Out, rank(X) > 6, or a `dim`
  // entry outside [-rank(X), rank(X)).
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ReduceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ReduceOp should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");

    // Work on a copy of the attribute so negative axes can be normalised.
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      // A value below -rank is still negative after normalisation. Reject it
      // here: it would otherwise be used as an index into dims_vector below.
      PADDLE_ENFORCE(
          dims[i] >= 0,
          "The dim should be in the range [-rank(input), rank(input)).");
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    // After sorting, dims[0] is the smallest reduced axis.
    std::sort(dims.begin(), dims.end());

    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
    if (reduce_all) {
      if (keep_dim) {
        ctx->SetOutputDim(
            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
      } else {
        ctx->SetOutputDim("Out", {1});
      }
      return;
    }

    auto dims_vector = vectorize(x_dims);
    if (keep_dim) {
      for (size_t i = 0; i < dims.size(); ++i) {
        dims_vector[dims[i]] = 1;
      }
    } else {
      // Mark the reduced axes with a sentinel, then drop them in one pass.
      const int kDelFlag = -2;
      for (size_t i = 0; i < dims.size(); ++i) {
        dims_vector[dims[i]] = kDelFlag;
      }
      dims_vector.erase(
          std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
          dims_vector.end());
    }
    auto out_dims = framework::make_ddim(dims_vector);
    ctx->SetOutputDim("Out", out_dims);

    // Only pass LoD when not reducing on the first dim. An empty `dim` list
    // reduces nothing, so the LoD is kept in that case as well; the guard
    // also avoids reading dims[0] from an empty vector.
    if (dims.empty() || dims[0] != 0) {
      ctx->ShareLoD("X", /*->*/ "Out");
    }
  }
};
// Gradient operator shared by the reduce_*_grad ops registered in this file.
// It only provides shape inference; the computation lives in the kernels.
class ReduceGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the inputs and the `dim` attribute, then gives Output(X@GRAD)
  // (when present) the same shape and LoD as Input(X).
  // Enforce failures are raised for a missing X or Out@GRAD, rank(X) > 6, or
  // a `dim` entry outside [-rank(X), rank(X)).
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");

    // `dims` is only validated here. The gradient has X's shape regardless of
    // which axes were reduced, so no sorting of the axes is needed.
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      // Enforce the lower bound that the error message promises: a value
      // below -rank is still negative after normalisation.
      PADDLE_ENFORCE(
          dims[i] >= 0,
          "The dim should be in the range [-rank(input), rank(input)).");
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }

    auto x_grad_name = framework::GradVarName("X");
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
      ctx->ShareLoD("X", /*->*/ x_grad_name);
    }
  }
};
// Proto/checker maker base for the reduce operators. It declares the tensor
// slots (X, Out) and the attributes (dim, keep_dim, reduce_all); subclasses
// supply the two names substituted into the operator comment.
class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() final {
    // Tensor slots.
    AddInput("X",
             "(Tensor) The input tensor. "
             "Tensors with rank at most 6 are supported.");
    AddOutput("Out", "(Tensor) The result tensor.");

    // Attributes, declared in the order dim, keep_dim, reduce_all.
    AddAttr<std::vector<int>>(
        "dim",
        "(list<int>, default {0}) The dimensions to reduce. Must be in the "
        "range [-rank(input), rank(input)). If `dim[i] < 0`, the dims[i] to "
        "reduce is `rank + dims[i]`. Note that reducing on the first dim "
        "will make the LoD info lost.")
        .SetDefault({0});
    AddAttr<bool>(
        "keep_dim",
        "(bool, default false) If true, retain the reduced dimension with "
        "length 1.")
        .SetDefault(false);
    AddAttr<bool>(
        "reduce_all",
        "(bool, default false) If true, output a scalar reduced along all "
        "dimensions.")
        .SetDefault(false);

    // Operator comment: the first %s takes GetOpType(), the second GetName().
    const std::string op_type = GetOpType();
    const std::string op_name = GetName();
    AddComment(string::Sprintf(R"DOC(
%s Operator.
This operator computes the %s of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC",
                               op_type, op_name));
  }

 protected:
  // Name substituted for the second %s of the operator comment.
  virtual std::string GetName() const = 0;
  // Name substituted for the first %s of the operator comment.
  virtual std::string GetOpType() const = 0;
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// REGISTER_REDUCE_OP(op_name) expands to:
//   1. a maker class __<op_name>Maker__ derived from ops::ReduceOpMaker whose
//      GetName() returns "<op_name>" and GetOpType() returns
//      "Reduce <op_name>";
//   2. registration of operator reduce_<op_name> with ops::ReduceOp, that
//      maker and DefaultGradOpDescMaker<true>;
//   3. registration of operator reduce_<op_name>_grad with ops::ReduceGradOp.
// The final statement has no trailing semicolon; the caller supplies it.
#define REGISTER_REDUCE_OP(op_name) \
class __##op_name##Maker__ : public ops::ReduceOpMaker { \
protected: \
virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)
// Register the five reduce operators (reduce_sum, reduce_mean, reduce_max,
// reduce_min, reduce_prod) together with their *_grad counterparts.
REGISTER_REDUCE_OP(sum);
REGISTER_REDUCE_OP(mean);
REGISTER_REDUCE_OP(max);
REGISTER_REDUCE_OP(min);
REGISTER_REDUCE_OP(prod);
// REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor):
// registers the CPU kernels of operator `reduce_type` (ReduceKernel with
// ops::functor) and of `reduce_type`_grad (ReduceGradKernel with
// ops::grad_functor), each instantiated for float, double, int and int64_t.
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL(reduce_type, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
double, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
int, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
int64_t, ops::functor>); \
REGISTER_OP_CPU_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t, \
ops::grad_functor>);
// FOR_EACH_KERNEL_FUNCTOR is defined outside this block; it is expected to
// expand the macro above once per (operator, functor, grad functor) triple.
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/reduce_op.h"
namespace ops = paddle::operators;
// REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor):
// CUDA counterpart of the CPU registration. Registers ReduceKernel
// (ops::functor) for `reduce_type` and ReduceGradKernel (ops::grad_functor)
// for `reduce_type`_grad on CUDADeviceContext, each instantiated for float,
// double, int and int64_t.
#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::functor>); \
REGISTER_OP_CUDA_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::grad_functor>);
// FOR_EACH_KERNEL_FUNCTOR is defined outside this block; it is expected to
// expand the macro above once per (operator, functor, grad functor) triple.
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
...@@ -14,105 +14,20 @@ limitations under the License. */ ...@@ -14,105 +14,20 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <string>
#include <vector> #include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/reduce_op_function.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; #define HANDLE_DIM(NDIM, RDIM) \
using DDim = framework::DDim; if (ndim == NDIM && rdim == RDIM) { \
template <typename T, size_t D, int MajorType = Eigen::RowMajor, ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>( \
typename IndexType = Eigen::DenseIndex> context.template device_context<DeviceContext>(), *input, output, \
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>; dims, keep_dim); \
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim);
}
};
struct SumGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
}
};
struct MeanFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->mean(dim);
}
};
struct MeanGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) / dx->constant(size);
}
};
struct MaxFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim);
}
};
struct MinFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->minimum(dim);
}
};
struct MaxOrMinGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = (*x) == y->broadcast(dim);
auto ones = dx->constant(1);
auto zeros = dx->constant(0);
// If there are multiple minimum or maximum elements, the subgradient of
// each is the set [0, 1], and we pass gradient to all of them here.
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
}
};
struct ProdFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->prod(dim);
}
};
struct ProdGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
}
};
#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceCompute<NDIM, RDIM>(context); \
} }
template <typename DeviceContext, typename T, typename Functor> template <typename DeviceContext, typename T, typename Functor>
...@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all"); bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
if (reduce_all) { if (reduce_all) {
// Flatten and reduce 1-D tensor // Flatten and reduce 1-D tensor
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenVector<T>::Flatten(*input); auto x = EigenVector<T>::Flatten(*input);
auto out = EigenScalar<T>::From(*output); auto out = EigenScalar<T>::From(*output);
auto& place = auto& place =
...@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> {
Functor functor; Functor functor;
functor(place, &x, &out, reduce_dim); functor(place, &x, &out, reduce_dim);
} else { } else {
int ndim = context.Input<Tensor>("X")->dims().size(); int ndim = input->dims().size();
int rdim = context.Attr<std::vector<int>>("dim").size(); int rdim = dims.size();
// comments for accelerating compiling temporarily. // comments for accelerating compiling temporarily.
// HANDLE_DIM(6, 5); // HANDLE_DIM(6, 5);
// HANDLE_DIM(6, 4); // HANDLE_DIM(6, 4);
...@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> {
HANDLE_DIM(1, 1); HANDLE_DIM(1, 1);
} }
} }
private:
template <size_t D, size_t R_D>
void ReduceCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenTensor<T, D>::From(*input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto dims = context.Attr<std::vector<int>>("dim");
auto reduce_dim = Eigen::array<int, R_D>();
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduce_dim[i] = dims[i];
}
// construct the squeezed output tensor
bool keep_dim = context.Attr<bool>("keep_dim");
DDim out_dims = output->dims();
if (keep_dim && x_rank > 1) {
const int kDelFlag = -2;
auto dims_vector = vectorize(out_dims);
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
out_dims = framework::make_ddim(dims_vector);
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
}
}
}; };
template <typename DeviceContext, typename T, typename Functor> template <typename DeviceContext, typename T, typename Functor>
...@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all"); bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace());
if (reduce_all) { if (reduce_all) {
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace());
auto x = EigenVector<T>::Flatten(*input0); auto x = EigenVector<T>::Flatten(*input0);
auto x_reduce = EigenVector<T>::From(*input1); auto x_reduce = EigenVector<T>::From(*input1);
auto x_reduce_grad = EigenVector<T>::From(*input2); auto x_reduce_grad = EigenVector<T>::From(*input2);
...@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broadcast_dim[0]); broadcast_dim[0]);
} else { } else {
int rank = context.Input<Tensor>("X")->dims().size(); int rank = input0->dims().size();
switch (rank) { switch (rank) {
case 1: case 1:
ReduceGradCompute<1>(context); ReduceGradFunctor<DeviceContext, T, 1, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 2: case 2:
ReduceGradCompute<2>(context); ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 3: case 3:
ReduceGradCompute<3>(context); ReduceGradFunctor<DeviceContext, T, 3, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 4: case 4:
ReduceGradCompute<4>(context); ReduceGradFunctor<DeviceContext, T, 4, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 5: case 5:
ReduceGradCompute<5>(context); ReduceGradFunctor<DeviceContext, T, 5, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 6: case 6:
ReduceGradCompute<6>(context); ReduceGradFunctor<DeviceContext, T, 6, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
} }
} }
} }
};
private: class ReduceOp : public framework::OperatorWithKernel {
template <size_t D> public:
void ReduceGradCompute(const framework::ExecutionContext& context) const { using framework::OperatorWithKernel::OperatorWithKernel;
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace()); void InferShape(framework::InferShapeContext* ctx) const override {
auto x = EigenTensor<T, D>::From(*input0); PADDLE_ENFORCE(ctx->HasInput("X"),
auto x_grad = EigenTensor<T, D>::From(*output); "Input(X) of ReduceOp should not be null.");
auto x_rank = static_cast<int>(x.dimensions().size()); PADDLE_ENFORCE(ctx->HasOutput("Out"),
auto dims = context.Attr<std::vector<int>>("dim"); "Output(Out) of ReduceOp should not be null.");
auto x_dims = input0->dims(); auto x_dims = ctx->GetInputDim("X");
auto reduced_dims_v = vectorize(x_dims); auto x_rank = x_dims.size();
Eigen::array<int, D> broadcast_dim; PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) {
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = 1;
}
} else {
const int kDelFlag = -2;
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (dims[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class ReduceGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
int broad_cats_times = 1; void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) { for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i]; if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduced_dims_v[dims[i]] = 1; PADDLE_ENFORCE_LT(
broadcast_dim[dims[i]] = x_dims[dims[i]]; dims[i], x_rank,
broad_cats_times *= x_dims[dims[i]]; "The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
ctx->ShareLoD("X", /*->*/ x_grad_name);
} }
auto reduced_dims = framework::make_ddim(reduced_dims_v); }
auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims); };
auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);
class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 6 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"dim",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keep_dim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
%s Operator.
auto& place = This operator computes the %s of input tensor along the given dimension.
*context.template device_context<DeviceContext>().eigen_device(); The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
Functor functor; )DOC",
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, GetOpType(), GetName()));
broad_cats_times);
} }
protected:
virtual std::string GetName() const = 0;
virtual std::string GetOpType() const = 0;
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ namespace ops = paddle::operators;
__macro(reduce_sum, SumFunctor, SumGradFunctor); \
__macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ #define REGISTER_REDUCE_OP(op_name) \
__macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ class __##op_name##Maker__ : public ops::ReduceOpMaker { \
__macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ protected: \
__macro(reduce_prod, ProdFunctor, ProdGradFunctor); virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
// Local shorthands for the framework tensor and shape types.
using Tensor = framework::Tensor;
using DDim = framework::DDim;
// Alias of framework::EigenTensor for a rank-D tensor; row-major layout and
// Eigen::DenseIndex indexing by default.
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
// Alias of framework::EigenScalar with the same layout/index defaults.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
// Alias of framework::EigenVector with the same layout/index defaults.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Applies the reduction `Functor` to `input` over the axes listed in `dims`
// and writes the result into `output`.
//
// Template parameters:
//   D        rank used for the Eigen view of `input`.
//   R_D      number of reduced axes; the caller is expected to pass a `dims`
//            with exactly R_D entries, since they are copied into an
//            Eigen::array<int, R_D> without a bounds check.
//   Functor  callable invoked as functor(place, &x, &out, reduce_dim).
//
// Parameters:
//   context   device context; its eigen_device() is the evaluation device.
//   input     tensor to reduce.
//   output    destination tensor. Its dims() are read here, so they must
//             already be set; presumably its memory has been allocated by
//             the caller as well (TODO confirm at call sites).
//   dims      axes to reduce; a negative value a means a + rank.
//   keep_dim  when true (and rank > 1), the entries of output->dims() at the
//             reduced axes are dropped to build the shape of the Eigen output
//             view, which always has rank D - R_D.
template <typename DeviceContext, typename T, size_t D, size_t R_D,
          typename Functor>
void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
                   framework::Tensor* output, const std::vector<int>& dims,
                   bool keep_dim) {
  auto x = EigenTensor<T, D>::From(input);
  auto x_rank = static_cast<int>(x.dimensions().size());
  auto reduce_dim = Eigen::array<int, R_D>();
  // Work on a copy so negative axes can be normalised in place.
  std::vector<int> dims_ref = dims;
  for (size_t i = 0; i < dims_ref.size(); ++i) {
    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
    reduce_dim[i] = dims_ref[i];
  }
  // construct the squeezed output tensor
  DDim out_dims = output->dims();
  if (keep_dim && x_rank > 1) {
    // Mark the reduced axes with a sentinel, then erase them in one pass.
    const int kDelFlag = -2;
    auto dims_vector = framework::vectorize(out_dims);
    for (size_t i = 0; i < dims_ref.size(); ++i) {
      dims_vector[dims_ref[i]] = kDelFlag;
    }
    // Qualified std::remove: the unqualified call only resolved through ADL
    // on the iterator type and could otherwise pick up ::remove from <cstdio>.
    dims_vector.erase(
        std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
        dims_vector.end());
    out_dims = framework::make_ddim(dims_vector);
  }
  auto& place = *context.eigen_device();
  Functor functor;
  if (D == 1) {
    // Reducing a rank-1 tensor yields a scalar view.
    auto out = EigenScalar<T>::From(*output);
    functor(place, &x, &out, reduce_dim);
  } else {
    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
    functor(place, &x, &out, reduce_dim);
  }
}
// Computes the gradient of a reduction with respect to its input.
//
//   input0  forward input X (rank D).
//   input1  forward output Out, viewed with X's shape but extent 1 on every
//           reduced axis.
//   input2  gradient of Out, viewed with the same collapsed shape.
//   output  gradient of X, viewed with rank D.
//   dims    reduced axes; a negative value a means a + rank.
//
// `Functor` is invoked as
//   functor(place, &x, &out, &dx, &dout, broadcast_dims, reduced_count)
// where broadcast_dims holds X's extent on reduced axes and 1 elsewhere, and
// reduced_count is the product of the reduced extents.
template <typename DeviceContext, typename T, size_t D, typename Functor>
void ReduceGradFunctor(const DeviceContext& context,
                       const framework::Tensor& input0,
                       const framework::Tensor& input1,
                       const framework::Tensor& input2,
                       framework::Tensor* output,
                       const std::vector<int>& dims) {
  auto x = EigenTensor<T, D>::From(input0);
  auto x_grad = EigenTensor<T, D>::From(*output);
  auto rank = static_cast<int>(x.dimensions().size());
  auto in_dims = input0.dims();

  // Start from "no broadcast" on every axis and X's full shape.
  Eigen::array<int, D> bcast;
  for (size_t axis = 0; axis < D; ++axis) bcast[axis] = 1;
  auto collapsed_shape = framework::vectorize(in_dims);

  int reduced_count = 1;
  for (int raw_axis : dims) {
    // Normalise negative axes without mutating the caller's vector.
    int axis = raw_axis < 0 ? rank + raw_axis : raw_axis;
    collapsed_shape[axis] = 1;
    bcast[axis] = in_dims[axis];
    reduced_count *= in_dims[axis];
  }

  auto collapsed_ddim = framework::make_ddim(collapsed_shape);
  auto x_reduce = EigenTensor<T, D>::From(input1, collapsed_ddim);
  auto x_reduce_grad = EigenTensor<T, D>::From(input2, collapsed_ddim);

  Functor functor;
  functor(*context.eigen_device(), &x, &x_reduce, &x_grad, &x_reduce_grad,
          bcast, reduced_count);
}
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
// Declare the reduce_prod operator through REGISTER_REDUCE_OP (macro defined
// outside this block).
REGISTER_REDUCE_OP(reduce_prod);
// Forward CPU kernels: ReduceKernel with ProdFunctor for float, double, int
// and int64_t.
REGISTER_OP_CPU_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::ProdFunctor>);
// Backward CPU kernels: ReduceGradKernel with ProdGradFunctor for the same
// four element types.
REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
// Forward CUDA kernels of reduce_prod: ReduceKernel with ProdFunctor for
// float, double, int and int64_t.
REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>);
// Backward CUDA kernels of reduce_prod: ReduceGradKernel with
// ProdGradFunctor for the same four element types.
REGISTER_OP_CUDA_KERNEL(
reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
// Forward functor for reduce_prod.
struct ProdFunctor {
// Assigns to *y, evaluated on device `place`, the product of *x taken over
// the axes in `dim`. x and y are pointers to Eigen-style tensor expressions.
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->prod(dim);
}
};
// Backward functor for reduce_prod.
//
// For y = prod(x) over the reduced axes, dy/dx_i = y / x_i, hence
//   dx = broadcast(dy) * broadcast(y) / x.
//
//   place  Eigen device the expression is evaluated on.
//   x      forward input.
//   y      forward output, broadcast by `dim` to x's shape.
//   dx     destination: gradient with respect to x.
//   dy     gradient with respect to y, broadcast by `dim` to x's shape.
//   dim    per-axis broadcast factors.
//   size   unused by this functor; kept for the common functor signature.
//
// NOTE: an element of x equal to zero still divides by zero, as it did with
// the previous reciprocal-based formulation.
struct ProdGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    // Divide by x instead of multiplying by x->inverse(): for integer element
    // types the reciprocal 1 / x truncates to 0 whenever |x| > 1, which
    // zeroed the gradient. The quotient (dy * y) / x keeps it exact for
    // integers and saves one rounding step for floating-point types.
    dx->device(place) = (dy->broadcast(dim) * y->broadcast(dim)) / (*x);
  }
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
// Declare the reduce_sum operator through REGISTER_REDUCE_OP (macro defined
// outside this block).
REGISTER_REDUCE_OP(reduce_sum);
// Forward CPU kernels: ReduceKernel with SumFunctor for float, double, int
// and int64_t.
REGISTER_OP_CPU_KERNEL(
reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumFunctor>);
// Backward CPU kernels: ReduceGradKernel with SumGradFunctor for the same
// four element types.
REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
// Forward CUDA kernels of reduce_sum: ReduceKernel with SumFunctor for
// float, double, int and int64_t.
REGISTER_OP_CUDA_KERNEL(reduce_sum,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::SumFunctor>);
// Backward CUDA kernels of reduce_sum: ReduceGradKernel with SumGradFunctor
// for the same four element types.
REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
// Forward functor for reduce_sum.
struct SumFunctor {
// Assigns to *y, evaluated on device `place`, the sum of *x taken over the
// axes in `dim`. x and y are pointers to Eigen-style tensor expressions.
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim);
}
};
// Backward functor for reduce_sum.
struct SumGradFunctor {
// Assigns to *dx, evaluated on device `place`, the gradient *dy broadcast by
// the per-axis factors in `dim`. The parameters x, y and size are not used
// by this functor; they are part of the signature the caller passes.
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
}
};
} // namespace operators
} // namespace paddle
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <ostream> #include <ostream>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
...@@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase { ...@@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override { const platform::Place& place) const override {
auto ins = Inputs("X"); auto ins = Inputs("X");
auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
std::vector<std::string> endpoints =
Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode"); std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_mode");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
...@@ -55,32 +51,14 @@ class SendOp : public framework::OperatorBase { ...@@ -55,32 +51,14 @@ class SendOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
} }
} }
rpc_client->Wait(); if (sync_send) {
if (sync_mode) {
for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
rpc_client->Wait();
}
if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) {
VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
}
rpc_client->Wait();
// tell pservers that current trainer have called fetch
for (auto& ep : endpoints) {
VLOG(2) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep);
}
rpc_client->Wait(); rpc_client->Wait();
} }
} }
...@@ -89,26 +67,22 @@ class SendOp : public framework::OperatorBase { ...@@ -89,26 +67,22 @@ class SendOp : public framework::OperatorBase {
class SendOpMaker : public framework::OpProtoAndCheckerMaker { class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() { void Make() {
AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
AddOutput("Out", "(Tensor) Output tensor to be received from server")
.AsDuplicable(); .AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
This operator will send tensor to recv_op at the parameter server. This operator will send variables to listen_and_serve op at the parameter server.
)DOC"); )DOC");
// TODO(typhoonzero): remove this attr generate de-duplicated vector from AddAttr<int>("sync_mode",
// epmap when initializing. "(int, default 0)"
AddAttr<std::vector<std::string>>("endpoints", "sync send or async send.")
"(string vector, default 127.0.0.1:6164)" .SetDefault(0);
"Server endpoints to send variables to.")
.SetDefault({});
AddAttr<std::vector<std::string>>("epmap", AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input " "Server endpoints in the order of input "
"variables for mapping") "variables for mapping")
.SetDefault({}); .SetDefault({"127.0.0.1:6164"});
AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
// Operator that issues one asynchronous send per input variable, using the
// endpoint found at the same index in the "epmap" attribute.
class SendVarsOp : public framework::OperatorBase {
 public:
  SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
             const framework::VariableNameMap& outputs,
             const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  // Sends every input in "X" for which NeedSend() returns true to the
  // endpoint at the matching position of "epmap"; other inputs are only
  // logged. When the "sync_send" attribute is non-zero the call waits on the
  // RPC client before returning, otherwise it returns right after queuing
  // the sends.
  //
  // Throws std::out_of_range if an input that has to be sent has no
  // corresponding entry in "epmap".
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    auto ins = Inputs("X");
    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
    int sync_send = Attr<int>("sync_send");

    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);

    // For profiling
    platform::RecordEvent record_event(Type(), &ctx);

    detail::RPCClient* rpc_client =
        detail::RPCClient::GetInstance<detail::GRPCClient>();

    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        // "epmap" defaults to a single endpoint while "X" is duplicable, so
        // the two lists can differ in length. Bounds-checked access turns a
        // too-short "epmap" into an exception instead of an out-of-range
        // read.
        const std::string& endpoint = epmap.at(i);
        VLOG(3) << "sending " << ins[i] << " to " << endpoint;
        // TODO(Yancey1989): we need to use an IO threadpool which has
        // a larger number of threads than the computing threadpool.
        rpc_client->AsyncSendVar(endpoint, ctx, scope, ins[i]);
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }
    }

    // Only block on the outstanding requests when a synchronous send was
    // requested.
    if (sync_send) {
      rpc_client->Wait();
    }
  }
};
// Declares the interface of the operator: one duplicable input "X" and the
// attributes "sync_send" and "epmap".
class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
// "X" may hold several variables (AsDuplicable).
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
// Integer flag, default 0. The operator treats any non-zero value as a
// request to wait for the sends to finish.
// NOTE(review): the two adjacent string literals are concatenated without a
// separator, so the description reads "(int, default 0)sync send ...".
AddAttr<int>("sync_send",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
// Endpoint list indexed in parallel with the inputs; defaults to a single
// entry, "127.0.0.1:6164".
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({"127.0.0.1:6164"});
}
};
// Shape inference hook for the operator. The call operator has an empty
// body, so no shapes are checked or set.
class SendVarsOpShapeInference : public framework::InferShapeBase {
public:
// Intentionally a no-op; `ctx` is not used.
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration below.
namespace ops = paddle::operators;
// Registers the operator under the name "send_vars", wiring together the
// operator class, its proto maker and its (empty) shape inference.
// NOTE(review): EmptyGradOpMaker presumably means no gradient operator is
// generated for send_vars -- confirm in the framework headers.
REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
ops::SendVarsOpShapeInference);
...@@ -11,6 +11,7 @@ limitations under the License. */ ...@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -100,6 +101,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -100,6 +101,7 @@ class CUDADeviceContext : public DeviceContext {
template <typename Callback> template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) { void RecordEvent(cudaEvent_t ev, Callback callback) {
std::lock_guard<std::mutex> guard(mtx_);
callback(); callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
} }
...@@ -116,6 +118,8 @@ class CUDADeviceContext : public DeviceContext { ...@@ -116,6 +118,8 @@ class CUDADeviceContext : public DeviceContext {
int compute_capability; int compute_capability;
int multi_process; int multi_process;
int max_threads_per_mp; int max_threads_per_mp;
std::mutex mtx_;
}; };
template <> template <>
......
...@@ -30,7 +30,7 @@ int main(int argc, char** argv) { ...@@ -30,7 +30,7 @@ int main(int argc, char** argv) {
new_argv.push_back( new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
#else #else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
#endif #endif
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
......
...@@ -26,6 +26,7 @@ from trainer import BeginEpochEvent ...@@ -26,6 +26,7 @@ from trainer import BeginEpochEvent
from trainer import EndEpochEvent from trainer import EndEpochEvent
from trainer import BeginStepEvent from trainer import BeginStepEvent
from trainer import EndStepEvent from trainer import EndStepEvent
from trainer import CheckpointConfig
import inferencer import inferencer
from inferencer import Inferencer from inferencer import Inferencer
...@@ -116,7 +117,7 @@ def __bootstrap__(): ...@@ -116,7 +117,7 @@ def __bootstrap__():
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope' 'eager_delete_scope', 'use_mkldnn'
] ]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
......
...@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list): ...@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
return var.desc.name() return var.desc.name()
elif isinstance(var, str): elif isinstance(var, str):
return var return var
elif isinstance(var, basestring):
return str(var)
else: else:
raise TypeError(str(var) + " should be Variable or str") raise TypeError(str(var) + " should be Variable or str")
......
...@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype): ...@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
return core.VarDesc.VarType.INT64 return core.VarDesc.VarType.INT64
elif dtype == np.bool: elif dtype == np.bool:
return core.VarDesc.VarType.BOOL return core.VarDesc.VarType.BOOL
elif dtype == np.uint16:
return core.VarDesc.VarType.INT16
elif dtype == np.uint8: elif dtype == np.uint8:
return core.VarDesc.VarType.UINT8 return core.VarDesc.VarType.UINT8
else: else:
...@@ -361,6 +363,13 @@ class OpProtoHolder(object): ...@@ -361,6 +363,13 @@ class OpProtoHolder(object):
raise ValueError("Operator \"%s\" has not been registered." % type) raise ValueError("Operator \"%s\" has not been registered." % type)
return self.op_proto_map[type] return self.op_proto_map[type]
@staticmethod
def generated_op_attr_names():
return {
core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName()
}
class Operator(object): class Operator(object):
""" """
...@@ -368,6 +377,13 @@ class Operator(object): ...@@ -368,6 +377,13 @@ class Operator(object):
Block. Users can use the build in instructions to describe their neural Block. Users can use the build in instructions to describe their neural
network. network.
""" """
OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
'ncclInit', 'channel_create', 'channel_close', 'channel_send',
'channel_recv', 'select'
}
def __init__(self, def __init__(self,
block, block,
...@@ -504,17 +520,13 @@ class Operator(object): ...@@ -504,17 +520,13 @@ class Operator(object):
else: else:
self.desc.set_attr(attr_name, self.attrs[attr_name]) self.desc.set_attr(attr_name, self.attrs[attr_name])
self.desc.check_attrs() self.desc.check_attrs()
no_kernel_op_set = { if self.has_kernel(type):
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine', 'ncclInit', 'channel_create', 'channel_close',
'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
}
if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc) self.desc.infer_shape(self.block.desc)
def has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET
def to_string(self, throw_on_error): def to_string(self, throw_on_error):
""" """
To debug string. To debug string.
...@@ -742,7 +754,9 @@ class Block(object): ...@@ -742,7 +754,9 @@ class Block(object):
def var(self, name): def var(self, name):
if not isinstance(name, basestring): if not isinstance(name, basestring):
raise TypeError() raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
v = self.vars.get(name, None) v = self.vars.get(name, None)
if v is None: if v is None:
raise ValueError("var %s not in this block" % name) raise ValueError("var %s not in this block" % name)
......
...@@ -56,6 +56,8 @@ class Inferencer(object): ...@@ -56,6 +56,8 @@ class Inferencer(object):
else: else:
self.exe = executor.Executor(self.place) self.exe = executor.Executor(self.place)
self.inference_program = self.inference_program.clone(for_test=True)
def infer(self, inputs, return_numpy=True): def infer(self, inputs, return_numpy=True):
""" """
:param inputs: a map of {"input_name": input_var} that will be feed into the inference program :param inputs: a map of {"input_name": input_var} that will be feed into the inference program
......
...@@ -24,7 +24,8 @@ __all__ = [ ...@@ -24,7 +24,8 @@ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
'load_persistables', 'save_inference_model', 'load_inference_model', 'load_persistables', 'save_inference_model', 'load_inference_model',
'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'get_inference_program', 'save_checkpoint', 'load_checkpoint',
'clean_checkpoint' 'clean_checkpoint', 'load_persist_vars_without_grad',
'save_persist_vars_without_grad', 'get_latest_checkpoint_serial'
] ]
...@@ -457,95 +458,161 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -457,95 +458,161 @@ def get_parameter_value_by_name(name, executor, program=None):
SUCCESS_MARK_FILENAME = "_SUCCESS" SUCCESS_MARK_FILENAME = "_SUCCESS"
CHECKPOINT_PREFIX = "checkpoint" CHECKPOINT_PREFIX = "checkpoint"
MODEL_DIR = "__model__"
TRAINER_PREFIX = "trainer"
CHECKPOINT_SEPARATOR = "_" CHECKPOINT_SEPARATOR = "_"
def save_checkpoint(executor, def save_checkpoint(executor,
checkpoint_dir=None, checkpoint_dir,
max_num_checkpoints=3, trainer_id,
save_interval_secs=600, trainer_args=None,
main_program=None): main_program=None,
max_num_checkpoints=3):
""" """
Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most,
The interval between two saved checkpoints must greater than save_interval_secs. The interval between two saved checkpoints must greater than save_interval_secs.
:param executor :param executor executor for save the value
:param checkpoint_dir :param checkpoint_dir the checkpoint directory
:param max_num_checkpoints :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief
:param save_interval_secs :param main_program will save all variables in program
:param main_program :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
checkpoint_dir = os.getcwd() raise ValueError("'checkpoint_dir' should not be None")
if trainer_args:
assert isinstance(trainer_args, dict)
if not os.path.isdir(checkpoint_dir): if not os.path.isdir(checkpoint_dir):
os.makedirs(checkpoint_dir) os.makedirs(checkpoint_dir)
serial = _get_lastest_checkpoint_dir(checkpoint_dir) serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
if serial >= 0 and not _interval_secs_exceed( cur_dir = _get_serial_dir(checkpoint_dir, serial)
_get_serial_dir(serial, checkpoint_dir), save_interval_secs):
return
serial += 1 save_trainer_args(cur_dir, trainer_id, trainer_args)
cur_dir = _get_serial_dir(serial, checkpoint_dir)
save_vars( if trainer_id == 0:
executor, save_persist_vars_without_grad(executor, cur_dir, main_program)
dirname=cur_dir,
main_program=main_program, _scroll_delete(checkpoint_dir, max_num_checkpoints)
vars=None,
predicate=_is_checkpoint_var,
filename=None)
_write_success(cur_dir)
_lru_delete(checkpoint_dir, max_num_checkpoints)
def load_checkpoint(executor, checkpoint_dir=None, main_program=None): def load_checkpoint(executor, checkpoint_dir, serial, main_program):
""" """
Load checkpoint from a directory by executor, Load checkpoint from a directory by executor,
it will find the most recent saved checkpoint file and load it auto. it will find the most recent saved checkpoint file and load it auto.
:param executor :param executor executor for load the value
:param checkpoint_dir :param checkpoint_dir the checkpoint directory
:param main_program :param serial the serial folder in checkpoint directory will be load
:param main_program will load all variables in program
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
checkpoint_dir = os.getcwd() raise ValueError("'checkpoint_dir' should not be None")
serial = _get_lastest_checkpoint_dir(checkpoint_dir) if serial is None or serial < 0:
raise ValueError("'serial' should not be None or <0 ")
if serial < 0: if main_program is None:
return raise ValueError('main_program should not be None.')
cur_dir = _get_serial_dir(serial, checkpoint_dir) cur_dir = _get_serial_dir(checkpoint_dir, serial)
load_persist_vars_without_grad(executor, cur_dir, main_program, True)
load_vars(
executor,
dirname=cur_dir,
main_program=main_program,
predicate=_is_checkpoint_var,
filename=None)
def clean_checkpoint(checkpoint_dir, delete_dir=False): def clean_checkpoint(checkpoint_dir, delete_dir=False):
""" """
clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before.
delete_dir only works when the directory is empty, otherwise, OSError is raised. delete_dir only works when the directory is empty, otherwise, OSError is raised.
:param checkpoint_dir
:param delete_dir
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
checkpoint_dir = os.getcwd() raise ValueError("'checkpoint_dir' should not be None")
_lru_delete(checkpoint_dir, max_num_checkpoints=0) _scroll_delete(checkpoint_dir, max_num_checkpoints=0)
if delete_dir and not os.listdir(checkpoint_dir): if delete_dir and not os.listdir(checkpoint_dir):
os.rmdir(checkpoint_dir) os.rmdir(checkpoint_dir)
def _get_serial_dir(serial, checkpoint_dir): def load_persist_vars_without_grad(executor,
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) dirname,
return os.path.join(checkpoint_dir, serial_folder) program,
has_model_dir=False):
"""
load_persist_vars_without_grad will load variables from a directory by an executor,
the variable named end with "@GRAD" will not be loaded.
:param executor executor for load the value
:param dirname the checkpoint directory
:param program will load all variables in program
:param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__
"""
if has_model_dir:
dirname = _get_model_dir(dirname)
load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var,
filename=None)
def save_persist_vars_without_grad(executor, dirname, program):
"""
save_persist_vars_without_grad will save variables to a directory by an executor,
the variable named end with "@GRAD" will not be saved.
:param executor executor for load the value
:param dirname the checkpoint directory
:param program will load all variables in program
"""
cur_dir = _get_model_dir(dirname)
save_vars(
executor,
dirname=cur_dir,
main_program=program,
vars=None,
predicate=_is_checkpoint_var,
filename=None)
_write_success(cur_dir)
def save_trainer_args(dirname, trainer_id, trainer_args):
assert isinstance(trainer_args, dict)
cur_dir = _get_trainer_dir(dirname, trainer_id)
for name, value in trainer_args.iteritems():
args_file = os.path.join(cur_dir, name)
with open(args_file, 'w') as f:
f.write(str(value))
_write_success(cur_dir)
def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
assert isinstance(trainer_args, list)
cur_dir = _get_serial_dir(checkpoint_dir, serial)
cur_dir = _get_trainer_dir(cur_dir, trainer_id)
ret_values = []
for arg in trainer_args:
cur_file = os.path.join(cur_dir, arg)
with open(cur_file, 'r') as f:
contents = f.read()
ret_values.append(contents.strip())
return ret_values
def _is_checkpoint_var(var): def _is_checkpoint_var(var):
...@@ -559,36 +626,74 @@ def _is_checkpoint_var(var): ...@@ -559,36 +626,74 @@ def _is_checkpoint_var(var):
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW: var.desc.type() == core.VarDesc.VarType.RAW:
return False return False
# @GRAD are named for gradient variables, checkpoint will not save it.
if "@GRAD" in var.name:
return False
# .trainer_ are named for distribute train variables, checkpoint will not save it.
if ".trainer_" in var.name:
return False
if var.name.endswith("@GRAD"): # .block is named for distribute train variables, checkpoint will not save it.
if ".block" in var.name:
return False return False
return var.persistable return var.persistable
def _interval_secs_exceed(dirname, save_interval_secs): def _get_dir_serial(dirname):
dir_time = os.path.getmtime(dirname) _, serial = dirname.split(CHECKPOINT_SEPARATOR)
if save_interval_secs > (time.time() - dir_time):
return False try:
return True serial_num = int(serial)
except ValueError:
serial_num = -1
return serial_num
def _get_serial_dir(dirname, serial):
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
serial_dir = os.path.join(dirname, serial_folder)
if not os.path.isdir(serial_dir):
os.makedirs(serial_dir)
return serial_dir
def _get_model_dir(dirname):
model_dir = os.path.join(dirname, MODEL_DIR)
def _lru_delete(dirname, max_num_checkpoints=3): if not os.path.isdir(model_dir):
os.makedirs(model_dir)
return model_dir
def _get_trainer_dir(dirname, trainer_id):
trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
trainer_dir = os.path.join(dirname, trainer_folder)
if not os.path.isdir(trainer_dir):
os.makedirs(trainer_dir)
return trainer_dir
def _scroll_delete(dirname, max_num_checkpoints=3):
dirs = os.listdir(dirname) dirs = os.listdir(dirname)
serials = [] serial_map = {}
for serial in dirs: for serial in dirs:
try: serial_num = _get_dir_serial(serial)
serials.append(int(serial)) serial_map[serial_num] = serial
except ValueError:
continue
if len(serials) <= max_num_checkpoints: if len(serial_map.keys()) <= max_num_checkpoints:
return return
serials = serial_map.keys()
serials.sort(reverse=True) serials.sort(reverse=True)
serials = serials[max_num_checkpoints:] serials = serials[max_num_checkpoints:]
for serial in serials: for serial in serials:
cur_dir = os.path.join(dirname, str(serial)) cur_dir = _get_serial_dir(dirname, serial)
shutil.rmtree(cur_dir) shutil.rmtree(cur_dir)
...@@ -604,33 +709,30 @@ def _write_success(dirname): ...@@ -604,33 +709,30 @@ def _write_success(dirname):
f.write(now) f.write(now)
def _get_lastest_checkpoint_dir(checkpoint_dir): def get_latest_checkpoint_serial(checkpoint_dir):
""" """
get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
:param checkpoint_dir :param checkpoint_dir
""" """
if not checkpoint_dir.strip(): if not checkpoint_dir:
return -1 return -1
def has_success(checkpoint_dir, cur_dir): def has_success(checkpoint_dir, cur_dir):
""" """
is _SUCCESS in this dir is _SUCCESS in this dir
""" """
_, serial = cur_dir.split(CHECKPOINT_SEPARATOR)
try:
int(serial)
except ValueError:
return -1
if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): serial = _get_dir_serial(cur_dir)
if serial == -1 or not os.path.isdir(
os.path.join(checkpoint_dir, cur_dir)):
return -1 return -1
success_path = os.path.join( success_path = os.path.join(
_get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
SUCCESS_MARK_FILENAME)
if os.path.isfile(success_path): if os.path.isfile(success_path):
return int(serial) return serial
if not os.path.isdir(checkpoint_dir): if not os.path.isdir(checkpoint_dir):
return -1 return -1
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
import contextlib import contextlib
from layer_function_generator import autodoc from layer_function_generator import autodoc, templatedoc
from tensor import assign, fill_constant from tensor import assign, fill_constant
from .. import core from .. import core
from ..framework import Program, Variable, Operator from ..framework import Program, Variable, Operator
...@@ -721,26 +721,22 @@ def lod_rank_table(x, level=0): ...@@ -721,26 +721,22 @@ def lod_rank_table(x, level=0):
return table return table
@templatedoc()
def max_sequence_len(rank_table): def max_sequence_len(rank_table):
"""Max Sequence Len Operator. Given a LoDRankTable object, this layer """
returns the max length of a batch of sequences. In fact, a LoDRankTable ${comment}
object contains a list of tuples(<sequence index, sequence length>) and
the list is already sorted by sequence length in descending order, so the >>> import paddle.fluid as fluid
operator just returns the sequence length of the first tuple element. >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
>>> lod_level=1)
>>> rank_table = layers.lod_rank_table(x=x, level=0)
>>> max_seq_len = layers.max_sequence_len(rank_table)
Args: Args:
rank_table (Variable): Input variable which is a LoDRankTable object. rank_table(${rank_table_type}): ${rank_table_comment}.
Returns: Returns:
Variable: The max length of sequence. ${out_comment}.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10],
dtype='float32', lod_level=1)
rank_table = layers.lod_rank_table(x=x, level=0)
max_seq_len = layers.max_sequence_len(rank_table)
""" """
helper = LayerHelper("max_seqence_len", **locals()) helper = LayerHelper("max_seqence_len", **locals())
res = helper.create_tmp_variable(dtype="int64") res = helper.create_tmp_variable(dtype="int64")
......
...@@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name ...@@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name
from control_flow import BlockGuard from control_flow import BlockGuard
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..executor import global_scope from ..executor import global_scope
from layer_function_generator import generate_layer_fn, templatedoc
__all__ = [ __all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
'random_data_generator', 'Preprocessor' 'random_data_generator', 'Preprocessor', 'load'
] ]
...@@ -434,7 +435,7 @@ def open_files(filenames, ...@@ -434,7 +435,7 @@ def open_files(filenames,
shapes, shapes,
lod_levels, lod_levels,
dtypes, dtypes,
thread_num, thread_num=1,
buffer_size=None, buffer_size=None,
pass_num=1, pass_num=1,
for_parallel=True): for_parallel=True):
...@@ -662,3 +663,29 @@ class Preprocessor(object): ...@@ -662,3 +663,29 @@ class Preprocessor(object):
"sink_var_names": self.sink_var_names "sink_var_names": self.sink_var_names
}) })
return monkey_patch_reader_methods(self.reader) return monkey_patch_reader_methods(self.reader)
@templatedoc()
def load(out, file_path, load_as_fp16=None):
"""
${comment}
>>> import paddle.fluid as fluid
>>> tmp_tensor = fluid.layers.create_tensor(dtype='float32')
>>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin")
Args:
out(${out_type}): ${out_comment}.
file_path(${file_path_type}): ${file_path_comment}.
load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}.
Returns:
None
"""
helper = LayerHelper("load", **locals())
attrs = {"file_path": file_path}
if load_as_fp16 is not None:
attrs['load_as_fp16'] = load_as_fp16
helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs)
...@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200): ...@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200):
topk_indices = helper.create_tmp_variable(dtype="int64") topk_indices = helper.create_tmp_variable(dtype="int64")
topk_out, topk_indices = nn.topk(input, k=k) topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32") auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op( helper.append_op(
type="accuracy", type="accuracy",
inputs={ inputs={
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册