提交 24649a78 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_crop_op

......@@ -19,4 +19,4 @@ ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py dataset.py models/ /workspace/
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
......@@ -44,6 +44,16 @@ Currently supported `--model` argument include:
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Prepare the RecordIO file to Achieve Better Performance
Run the following command will generate RecordIO files like "mnist.recordio" under the path
and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
at any time using `fluid.batch`.
```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```
## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
......
......@@ -38,10 +38,12 @@ def parse_args():
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
'--batch_size',
type=int,
default=32,
help='The batch size on each gpu.')
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
type=int,
......@@ -49,7 +51,10 @@ def parse_args():
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
'--iterations',
type=int,
default=80,
help='The number of minibatches, set to -1 to run all batches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
......@@ -69,6 +74,7 @@ def parse_args():
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--cpus',
type=int,
......@@ -78,7 +84,7 @@ def parse_args():
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
......@@ -108,6 +114,16 @@ def parse_args():
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--use_reader_op',
action='store_true',
help='Whether to use reader op, and must specify the data path if set this to true.'
)
parser.add_argument(
'--data_path',
type=str,
default="",
help='Directory that contains all the training recordio files.')
args = parser.parse_args()
return args
......@@ -210,26 +226,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
train_losses = []
for batch_id, data in enumerate(train_reader()):
if not args.use_reader_op:
reader_generator = train_reader()
batch_id = 0
data = None
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if iters == args.iterations:
break
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
loss = exe.run(train_prog,
feed=feeder.feed(data),
fetch_list=[avg_loss])
if args.use_reader_op:
try:
loss = exe.run(train_prog, fetch_list=[avg_loss])
except fluid.core.EnforceNotMet as ex:
break
else:
loss = exe.run(train_prog,
feed=feeder.feed(data),
fetch_list=[avg_loss])
iters += 1
num_samples += len(data)
batch_id += 1
# FIXME(wuyi): For use_reader_op, if the current
# pass is not the last, the last batch of this pass
# is also equal to args.batch_size.
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
train_losses.append(loss)
print("Pass: %d, Iter: %d, Loss: %f\n" %
(pass_id, iters, np.mean(train_losses)))
......@@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
......@@ -270,7 +314,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
"value": 1.0,
"dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening
time.sleep(30)
......@@ -287,12 +330,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_trainers=num_trainers,
trainer_id=trainer_id)
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
iters = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if not args.use_reader_op:
reader_generator = train_reader()
batch_id = 0
data = None
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if iters == args.iterations:
break
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10:
......@@ -301,19 +353,26 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if args.use_fake_data:
loss, = exe.run([avg_loss.name])
if args.use_fake_data or args.use_reader_op:
try:
loss, = exe.run([avg_loss.name])
except fluid.core.EnforceNotMet as ex:
break
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
iters += 1
if batch_id % 1 == 0:
print("Pass %d, batch %d, loss %s" %
(pass_id, batch_id, np.array(loss)))
batch_id += 1
if args.use_reader_op:
num_samples = num_samples * args.gpus
print_train_time(start_time, time.time(), num_samples)
if not args.no_test and batch_acc:
test_acc = test(startup_exe, infer_prog, test_reader, feeder,
......
......@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
def get_model(args):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512
encoder_size = 512
decoder_size = 512
......@@ -221,7 +223,7 @@ def get_model(args):
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=args.batch_size)
batch_size=args.batch_size * args.gpus)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
......
......@@ -20,6 +20,7 @@ import numpy as np
import argparse
import time
import cProfile
import os
import paddle
import paddle.fluid as fluid
......@@ -65,9 +66,24 @@ def cnn_model(data):
def get_model(args):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus)
......@@ -103,7 +119,7 @@ def get_model(args):
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
......@@ -19,6 +19,7 @@ from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
......@@ -26,6 +27,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from recordio_converter import imagenet_train, imagenet_test
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
......@@ -122,16 +124,48 @@ def get_model(args):
else:
dshape = [32, 32, 3]
model = resnet_cifar10
else:
train_reader = paddle.dataset.cifar.train10()
test_reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
train_reader = paddle.dataset.flowers.train()
test_reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
train_reader = imagenet_train(args.data_path)
test_reader = imagenet_test(args.data_path)
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + dshape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
input, label = fluid.layers.read_file(data_file)
else:
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus)
......@@ -162,15 +196,10 @@ def get_model(args):
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
train_reader = paddle.batch(
batched_train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
train_reader, buf_size=5120),
batch_size=args.batch_size * args.gpus)
batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
......@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
def get_model(args):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
......@@ -114,7 +117,7 @@ def get_model(args):
train_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
batch_size=args.batch_size)
batch_size=args.batch_size * args.gpus)
test_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
......
......@@ -22,6 +22,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools
import os
def vgg16_bn_drop(input):
......@@ -65,9 +66,24 @@ def get_model(args):
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
......@@ -95,7 +111,7 @@ def get_model(args):
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image
def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
shape_label):
num_batches = 0
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(py_reader(), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=shape_data),
fluid.layers.data(
name='label', shape=shape_label, dtype='int64'),
],
place=fluid.CPUPlace())
num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
outfilepath, reader, feeder)
return num_batches
def prepare_mnist(outpath, batch_size):
outfilepath = os.path.join(outpath, "mnist.recordio")
convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
def prepare_cifar10(outpath, batch_size):
outfilepath = os.path.join(outpath, "cifar.recordio")
convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
def prepare_flowers(outpath, batch_size):
outfilepath = os.path.join(outpath, "flowers.recordio")
convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
[1])
def default_mapper(sample):
img, label = sample
img = image.simple_transform(
img, 256, 224, True, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
def imagenet_train(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "train.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
# shuffle all, this is slow
random.shuffle(imgfilelist)
def train_reader():
for idx, imgfile in enumerate(imgfilelist):
data = image.load_image(
os.path.join(data_dir, "train", imgfile.lower()))
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, train_reader)
def imagenet_test(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "val.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
def test_reader():
for idx, imgfile in enumerate(imgfilelist):
base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
image_path = ".".join([base_path, "jpeg"])
data = image.load_image(image_path)
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, test_reader)
# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
filename,
batch_per_file,
reader_creator,
feeder,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000,
feed_order=None):
if feed_order is None:
feed_order = feeder.feed_names
f_name, f_ext = os.path.splitext(filename)
assert (f_ext == ".recordio")
lines = []
f_idx = 0
counter = 0
for idx, batch in enumerate(reader_creator()):
lines.append(batch)
if idx >= batch_per_file and idx % batch_per_file == 0:
filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
with fluid.recordio_writer.create_recordio_writer(
filename, compressor, max_num_records) as writer:
for l in lines:
res = feeder.feed(l)
for each in feed_order:
writer.append_tensor(res[each])
writer.complete_append_tensor()
counter += 1
lines = []
f_idx += 1
print("written file: ", filename)
return counter
def prepare_imagenet(inpath, outpath, batch_size):
r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[
fluid.layers.data(
name="image", shape=[3, 224, 224]), fluid.layers.data(
name="label", shape=[1], dtype='int64')
],
place=fluid.CPUPlace())
outpath = os.path.join(outpath, "imagenet.recordio")
convert_reader_to_recordio_files(outpath, 10000, r, feeder)
......@@ -156,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
/**
* Macro to register OperatorKernel.
*/
#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \
__reg_op_kernel_##op_type##_##library_type##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \
#LIBRARY_TYPE); \
int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \
#library_type); \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \
return 0; \
}
......
......@@ -693,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same.", Type());
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
data_type, tmp);
data_type = tmp;
}
}
......
......@@ -166,8 +166,6 @@ function(op_library TARGET)
# NOTE(*): activation use macro to regist the kernels, set use_op manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "reduce")
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
else()
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_REDUCE_OP(reduce_max);
REGISTER_OP_CPU_KERNEL(
reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxFunctor>);
REGISTER_OP_CPU_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_max,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MaxFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_REDUCE_OP(reduce_mean);
REGISTER_OP_CPU_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::MeanFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MeanFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct MeanFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->mean(dim);
}
};
struct MeanGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) / dx->constant(size);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct MaxFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim);
}
};
struct MinFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->minimum(dim);
}
};
struct MaxOrMinGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = (*x) == y->broadcast(dim);
auto ones = dx->constant(1);
auto zeros = dx->constant(0);
// If there are multiple minimum or maximum elements, the subgradient of
// each is the set [0, 1], and we pass gradient to all of them here.
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_REDUCE_OP(reduce_min);
REGISTER_OP_CPU_KERNEL(
reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MinFunctor>);
REGISTER_OP_CPU_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_min,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MinFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_op.h"
#include <algorithm>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
using framework::Tensor;
class ReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ReduceOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ReduceOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) {
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = 1;
}
} else {
const int kDelFlag = -2;
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (dims[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class ReduceGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
ctx->ShareLoD("X", /*->*/ x_grad_name);
}
}
};
class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 6 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"dim",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keep_dim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
%s Operator.
This operator computes the %s of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC",
GetOpType(), GetName()));
}
protected:
virtual std::string GetName() const = 0;
virtual std::string GetOpType() const = 0;
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_REDUCE_OP(op_name) \
class __##op_name##Maker__ : public ops::ReduceOpMaker { \
protected: \
virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)
REGISTER_REDUCE_OP(sum);
REGISTER_REDUCE_OP(mean);
REGISTER_REDUCE_OP(max);
REGISTER_REDUCE_OP(min);
REGISTER_REDUCE_OP(prod);
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL(reduce_type, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
double, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
int, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
int64_t, ops::functor>); \
REGISTER_OP_CPU_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t, \
ops::grad_functor>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/reduce_op.h"
namespace ops = paddle::operators;
#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::functor>); \
REGISTER_OP_CUDA_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::grad_functor>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
......@@ -14,105 +14,20 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/reduce_op_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim);
}
};
struct SumGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
}
};
struct MeanFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->mean(dim);
}
};
struct MeanGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) / dx->constant(size);
}
};
struct MaxFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim);
}
};
struct MinFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->minimum(dim);
}
};
struct MaxOrMinGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = (*x) == y->broadcast(dim);
auto ones = dx->constant(1);
auto zeros = dx->constant(0);
// If there are multiple minimum or maximum elements, the subgradient of
// each is the set [0, 1], and we pass gradient to all of them here.
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
}
};
struct ProdFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->prod(dim);
}
};
struct ProdGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
}
};
#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceCompute<NDIM, RDIM>(context); \
#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>( \
context.template device_context<DeviceContext>(), *input, output, \
dims, keep_dim); \
}
template <typename DeviceContext, typename T, typename Functor>
......@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenVector<T>::Flatten(*input);
auto out = EigenScalar<T>::From(*output);
auto& place =
......@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> {
Functor functor;
functor(place, &x, &out, reduce_dim);
} else {
int ndim = context.Input<Tensor>("X")->dims().size();
int rdim = context.Attr<std::vector<int>>("dim").size();
int ndim = input->dims().size();
int rdim = dims.size();
// comments for accelerating compiling temporarily.
// HANDLE_DIM(6, 5);
// HANDLE_DIM(6, 4);
......@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> {
HANDLE_DIM(1, 1);
}
}
private:
template <size_t D, size_t R_D>
void ReduceCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenTensor<T, D>::From(*input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto dims = context.Attr<std::vector<int>>("dim");
auto reduce_dim = Eigen::array<int, R_D>();
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduce_dim[i] = dims[i];
}
// construct the squeezed output tensor
bool keep_dim = context.Attr<bool>("keep_dim");
DDim out_dims = output->dims();
if (keep_dim && x_rank > 1) {
const int kDelFlag = -2;
auto dims_vector = vectorize(out_dims);
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
out_dims = framework::make_ddim(dims_vector);
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
}
}
};
template <typename DeviceContext, typename T, typename Functor>
......@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace());
if (reduce_all) {
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace());
auto x = EigenVector<T>::Flatten(*input0);
auto x_reduce = EigenVector<T>::From(*input1);
auto x_reduce_grad = EigenVector<T>::From(*input2);
......@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broadcast_dim[0]);
} else {
int rank = context.Input<Tensor>("X")->dims().size();
int rank = input0->dims().size();
switch (rank) {
case 1:
ReduceGradCompute<1>(context);
ReduceGradFunctor<DeviceContext, T, 1, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break;
case 2:
ReduceGradCompute<2>(context);
ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break;
case 3:
ReduceGradCompute<3>(context);
ReduceGradFunctor<DeviceContext, T, 3, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break;
case 4:
ReduceGradCompute<4>(context);
ReduceGradFunctor<DeviceContext, T, 4, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break;
case 5:
ReduceGradCompute<5>(context);
ReduceGradFunctor<DeviceContext, T, 5, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break;
case 6:
ReduceGradCompute<6>(context);
ReduceGradFunctor<DeviceContext, T, 6, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break;
}
}
}
};
private:
template <size_t D>
void ReduceGradCompute(const framework::ExecutionContext& context) const {
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
class ReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
output->mutable_data<T>(context.GetPlace());
auto x = EigenTensor<T, D>::From(*input0);
auto x_grad = EigenTensor<T, D>::From(*output);
auto x_rank = static_cast<int>(x.dimensions().size());
auto dims = context.Attr<std::vector<int>>("dim");
auto x_dims = input0->dims();
auto reduced_dims_v = vectorize(x_dims);
Eigen::array<int, D> broadcast_dim;
for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ReduceOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ReduceOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) {
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = 1;
}
} else {
const int kDelFlag = -2;
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (dims[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class ReduceGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
int broad_cats_times = 1;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduced_dims_v[dims[i]] = 1;
broadcast_dim[dims[i]] = x_dims[dims[i]];
broad_cats_times *= x_dims[dims[i]];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
ctx->ShareLoD("X", /*->*/ x_grad_name);
}
auto reduced_dims = framework::make_ddim(reduced_dims_v);
auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims);
auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);
}
};
class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 6 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"dim",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keep_dim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
%s Operator.
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
This operator computes the %s of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broad_cats_times);
)DOC",
GetOpType(), GetName()));
}
protected:
virtual std::string GetName() const = 0;
virtual std::string GetOpType() const = 0;
};
} // namespace operators
} // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(reduce_sum, SumFunctor, SumGradFunctor); \
__macro(reduce_mean, MeanFunctor, MeanGradFunctor); \
__macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
__macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \
__macro(reduce_prod, ProdFunctor, ProdGradFunctor);
namespace ops = paddle::operators;
#define REGISTER_REDUCE_OP(op_name) \
class __##op_name##Maker__ : public ops::ReduceOpMaker { \
protected: \
virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D, size_t R_D,
typename Functor>
void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* output, const std::vector<int>& dims,
bool keep_dim) {
auto x = EigenTensor<T, D>::From(input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto reduce_dim = Eigen::array<int, R_D>();
std::vector<int> dims_ref = dims;
for (size_t i = 0; i < dims_ref.size(); ++i) {
if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
reduce_dim[i] = dims_ref[i];
}
// construct the squeezed output tensor
DDim out_dims = output->dims();
if (keep_dim && x_rank > 1) {
const int kDelFlag = -2;
auto dims_vector = framework::vectorize(out_dims);
for (size_t i = 0; i < dims_ref.size(); ++i) {
dims_vector[dims_ref[i]] = kDelFlag;
}
dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
out_dims = framework::make_ddim(dims_vector);
}
auto& place = *context.eigen_device();
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
}
}
template <typename DeviceContext, typename T, size_t D, typename Functor>
void ReduceGradFunctor(const DeviceContext& context,
const framework::Tensor& input0,
const framework::Tensor& input1,
const framework::Tensor& input2,
framework::Tensor* output,
const std::vector<int>& dims) {
auto x = EigenTensor<T, D>::From(input0);
auto x_grad = EigenTensor<T, D>::From(*output);
auto x_rank = static_cast<int>(x.dimensions().size());
auto x_dims = input0.dims();
auto reduced_dims_v = framework::vectorize(x_dims);
std::vector<int> dims_ref = dims;
Eigen::array<int, D> broadcast_dim;
for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
int broad_cats_times = 1;
for (size_t i = 0; i < dims_ref.size(); ++i) {
if (dims_ref[i] < 0) {
dims_ref[i] = x_rank + dims_ref[i];
}
reduced_dims_v[dims_ref[i]] = 1;
broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
broad_cats_times *= x_dims[dims_ref[i]];
}
auto reduced_dims = framework::make_ddim(reduced_dims_v);
auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
auto& place = *context.eigen_device();
Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broad_cats_times);
}
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_REDUCE_OP(reduce_prod);
REGISTER_OP_CPU_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::ProdFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct ProdFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->prod(dim);
}
};
struct ProdGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_REDUCE_OP(reduce_sum);
REGISTER_OP_CPU_KERNEL(
reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_sum,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::SumFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim);
}
};
struct SumGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
}
};
} // namespace operators
} // namespace paddle
......@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
return var.desc.name()
elif isinstance(var, str):
return var
elif isinstance(var, basestring):
return str(var)
else:
raise TypeError(str(var) + " should be Variable or str")
......
......@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
return core.VarDesc.VarType.INT64
elif dtype == np.bool:
return core.VarDesc.VarType.BOOL
elif dtype == np.uint16:
return core.VarDesc.VarType.INT16
elif dtype == np.uint8:
return core.VarDesc.VarType.UINT8
else:
......@@ -368,6 +370,13 @@ class Operator(object):
Block. Users can use the build in instructions to describe their neural
network.
"""
OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
'ncclInit', 'channel_create', 'channel_close', 'channel_send',
'channel_recv', 'select'
}
def __init__(self,
block,
......@@ -504,17 +513,13 @@ class Operator(object):
else:
self.desc.set_attr(attr_name, self.attrs[attr_name])
self.desc.check_attrs()
no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine', 'ncclInit', 'channel_create', 'channel_close',
'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
}
if type not in no_kernel_op_set:
if self.has_kernel(type):
self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc)
def has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET
def to_string(self, throw_on_error):
"""
To debug string.
......@@ -742,7 +747,9 @@ class Block(object):
def var(self, name):
if not isinstance(name, basestring):
raise TypeError()
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
v = self.vars.get(name, None)
if v is None:
raise ValueError("var %s not in this block" % name)
......
......@@ -434,7 +434,7 @@ def open_files(filenames,
shapes,
lod_levels,
dtypes,
thread_num,
thread_num=1,
buffer_size=None,
pass_num=1,
for_parallel=True):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import unittest
import time
import itertools
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
class BenchmarkSuite(OpTest):
def timeit_function(self, callback, iters, *args, **kwargs):
assert iters != 0, "Iters should >= 1"
start = time.time()
for i in range(iters):
callback(*args, **kwargs)
elapse = time.time() - start
return elapse / iters
def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
fetch_list):
# the cpu version is baseline, expect gpu version keep same with cpu version.
expect = item_cpu_out
expect_t = np.array(item_cpu_out)
actual = item_gpu_out
actual_t = np.array(item_gpu_out)
var_name = variable if isinstance(variable,
basestring) else variable.name
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol),
"Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
str(expect_t))
self.assertListEqual(actual.lod(),
expect.lod(),
"Output (" + var_name + ") has different lod")
def _get_input_names(self):
inputs = []
for name, value in self.inputs.iteritems():
if isinstance(value, list):
inputs.extend([sub_name for sub_name, _ in value])
inputs.append(name)
return inputs
def _get_output_names(self):
outputs = []
for var_name, var in self.outputs.iteritems():
if isinstance(var, list):
for sub_var_name, sub_var in var:
outputs.append(sub_var_name)
else:
outputs.append(var_name)
if len(outputs) == 0:
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
outputs.append(str(out_name))
return outputs
def check_output_stability(self, atol=1e-8):
places = self._get_places()
if len(places) < 2:
return
cpu_outs, fetch_list = self._calc_output(places[0])
gpu_outs, _ = self._calc_output(places[1])
self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
def timeit_output_with_place(self, place, iters):
return self.timeit_function(self.calc_output, iters, place)
def timeit_output(self, iters=100):
places = self._get_places()
elapses = []
for place in places:
elapses.append(self.timeit_output_with_place(place, iters))
for place, elapse in zip(places, elapses):
print("One pass of ({2}_op) at {0} cost {1}".format(
str(place), elapse, self.op_type))
def timeit_grad_with_place(self, place, iters=100):
inputs_to_check = self._get_input_names()
output_names = self._get_output_names()
return self.timeit_function(
self._get_gradient,
iters,
inputs_to_check,
place,
output_names,
no_grad_set=None)
def timeit_grad(self, iters=100):
places = self._get_places()
elapses = []
for place in places:
elapses.append(self.timeit_grad_with_place(place, iters))
for place, elapse in zip(places, elapses):
print("One pass of ({2}_grad_op) at {0} cost {1}".format(
str(place), elapse, self.op_type))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
from benchmark import BenchmarkSuite
from op_test import OpTest
# This is a demo op test case for operator benchmarking and high resolution number stability alignment.
class TestSumOp(BenchmarkSuite):
def setUp(self):
self.op_type = "sum"
self.customize_testcase()
self.customize_fetch_list()
def customize_fetch_list(self):
"""
customize fetch list, configure the wanted variables.
>>> self.fetch_list = ["Out"]
"""
self.fetch_list = ["Out"]
# pass
def customize_testcase(self):
# a test case
x0 = np.random.random((300, 400)).astype('float32')
x1 = np.random.random((300, 400)).astype('float32')
x2 = np.random.random((300, 400)).astype('float32')
# NOTE: if the output is empty, then it will autofilled by benchmarkSuite.
# only the output dtype is used, the shape, lod and data is computed from input.
self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
self.outputs = {"Out": x0 + x1 + x2}
def test_check_output(self):
"""
compare the output with customized output. In this case,
you should set the correct output by hands.
>>> self.outputs = {"Out": x0 + x1 + x2}
"""
self.check_output(atol=1e-8)
def test_output_stability(self):
# compare the cpu gpu output in high resolution.
self.check_output_stability()
def test_timeit_output(self):
"""
perf the op, time cost will be averged in iters.
output example
>>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
>>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
"""
self.timeit_output(iters=100)
def test_timeit_grad(self):
"""
perf the op gradient, time cost will be averged in iters.
output example
>>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
>>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
"""
self.timeit_grad(iters=100)
if __name__ == "__main__":
unittest.main()
......@@ -15,13 +15,17 @@
import unittest
import numpy as np
import random
import time
import itertools
import paddle.fluid.core as core
import collections
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.backward import append_backward
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, OpProtoHolder
from paddle.fluid.framework import Program, OpProtoHolder, Variable
from testsuite import create_op, set_input, append_input_output, append_loss_ops
def randomize_probability(batch_size, class_num, dtype='float32'):
......@@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
return prob
def create_op(scope, op_type, inputs, outputs, attrs):
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
op_role_attr_name = op_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
def __create_var__(name, var_name):
scope.var(var_name).get_tensor()
kwargs[name].append(var_name)
for in_name, in_dup in Operator.get_op_inputs(op_type):
if in_name in inputs:
kwargs[in_name] = []
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, _ = item[0], item[1]
__create_var__(in_name, sub_in_name)
else:
__create_var__(in_name, in_name)
for out_name, out_dup in Operator.get_op_outputs(op_type):
if out_name in outputs:
kwargs[out_name] = []
if out_dup:
sub_out = outputs[out_name]
for item in sub_out:
sub_out_name, _ = item[0], item[1]
__create_var__(out_name, sub_out_name)
else:
__create_var__(out_name, out_name)
for attr_name in Operator.get_op_attr_names(op_type):
if attr_name in attrs:
kwargs[attr_name] = attrs[attr_name]
return Operator(op_type, **kwargs)
def set_input(scope, op, inputs, place):
def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_lod(var[1])
var = var[0]
tensor.set_dims(var.shape)
tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs:
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, sub_in_val = item[0], item[1]
__set_input__(sub_in_name, sub_in_val)
else:
__set_input__(in_name, inputs[in_name])
def get_numeric_gradient(place,
scope,
op,
......@@ -173,54 +110,15 @@ def get_numeric_gradient(place,
return gradient_flat.reshape(tensor_to_check.get_dims())
def append_input_output(block, op_proto, np_list, is_input):
'''Insert VarDesc and generate Python variable instance'''
proto_list = op_proto.inputs if is_input else op_proto.outputs
def create_var(block, name, np_list, var_proto):
if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name)
shape = None
lod_level = None
else:
np_value = np_list[name]
if isinstance(np_value, tuple):
shape = list(np_value[0].shape)
lod_level = len(np_value[1])
else:
shape = list(np_value.shape)
lod_level = 0
return block.create_var(
dtype="float32", shape=shape, lod_level=lod_level, name=name)
var_dict = {}
for var_proto in proto_list:
var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable:
continue
assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name)
if var_proto.duplicable:
assert isinstance(np_list[var_name], list), \
"Duplicable {} should be set as list".format(var_name)
var_list = []
for (name, np_value) in np_list[var_name]:
var_list.append(
create_var(block, name, {name: np_value}, var_proto))
var_dict[var_name] = var_list
else:
var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
return var_dict
class OpTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
'''Fix random seeds to remove randomness from tests'''
cls._np_rand_state = np.random.get_state()
cls._py_rand_state = random.getstate()
cls.call_once = False
cls.dtype = "float32"
cls.outputs = {}
np.random.seed(123)
random.seed(124)
......@@ -231,6 +129,31 @@ class OpTest(unittest.TestCase):
np.random.set_state(cls._np_rand_state)
random.setstate(cls._py_rand_state)
def try_call_once(self, data_type):
if not self.call_once:
self.call_once = True
self.dtype = data_type
def infer_dtype_from_inputs_outputs(self, inputs, outputs):
def infer_dtype(numpy_dict):
assert isinstance(
numpy_dict,
dict), "self.inputs, self.outputs must be numpy_dict"
for var_name, var_value in numpy_dict.iteritems():
if isinstance(var_value, (np.ndarray, np.generic)):
self.try_call_once(var_value.dtype)
elif isinstance(var_value, (list, tuple)):
# the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
if len(var_value) > 1 and isinstance(var_value[1], (
np.ndarray, np.generic)):
instance = var_value[1]
self.try_call_once(instance[1].dtype)
else:
self.try_call_once("float32")
infer_dtype(inputs)
infer_dtype(outputs)
def feed_var(self, input_vars, place):
feed_map = {}
for var_name in input_vars:
......@@ -254,18 +177,14 @@ class OpTest(unittest.TestCase):
return feed_map
def calc_output(self, place):
outs, _ = self._calc_output(place)
return outs
def _calc_output(self, place):
def _append_ops(self, block):
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
program = Program()
block = program.global_block()
inputs = append_input_output(block, op_proto, self.inputs, True)
outputs = append_input_output(block, op_proto, self.outputs, False)
"infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
inputs = append_input_output(block, op_proto, self.inputs, True,
self.dtype)
outputs = append_input_output(block, op_proto, self.outputs, False,
self.dtype)
op = block.append_op(
type=self.op_type,
inputs=inputs,
......@@ -275,22 +194,68 @@ class OpTest(unittest.TestCase):
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
fetch_list = []
for var_name, var in outputs.iteritems():
if var_name in self.outputs:
def _get_io_vars(self, block, numpy_inputs):
inputs = {}
for name, value in numpy_inputs.iteritems():
if isinstance(value, list):
var_list = [
block.var(sub_name) for sub_name, sub_value in value
]
inputs[name] = var_list
else:
inputs[name] = block.var(name)
return inputs
def _get_inputs(self, block):
return self._get_io_vars(block, self.inputs)
def _get_outputs(self, block):
return self._get_io_vars(block, self.outputs)
def calc_output(self, place):
outs, _ = self._calc_output(place)
return outs
def _calc_output(self, place, parallel=False):
program = Program()
block = program.global_block()
self._append_ops(block)
inputs = self._get_inputs(block)
outputs = self._get_outputs(block)
feed_map = self.feed_var(inputs, place)
if parallel:
use_cuda = False
if isinstance(place, fluid.CUDAPlace(0)):
use_cuda = True
executor = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=loss.name, main_program=program)
else:
executor = Executor(place)
fetch_list = getattr(self, "fetch_list", [])
# if the fetch_list is customized by user, we use it directly.
# if not, fill the fetch_list by the user configured outputs in test.
if len(fetch_list) == 0:
for var_name, var in outputs.iteritems():
if isinstance(var, list):
for v in var:
fetch_list.append(v)
else:
fetch_list.append(var)
feed_map = self.feed_var(inputs, place)
exe = Executor(place)
outs = exe.run(program,
feed=feed_map,
fetch_list=fetch_list,
return_numpy=False)
# if the fetch_list still empty, fill the fetch_list by the operator output.
if len(fetch_list) == 0:
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
fetch_list.append(str(out_name))
# fetch_list = map(block.var, fetch_list)
if not isinstance(fetch_list[0], Variable):
fetch_list = map(block.var, fetch_list)
outs = executor.run(program,
feed=feed_map,
fetch_list=fetch_list,
return_numpy=False)
return outs, fetch_list
def check_output_with_place(self, place, atol):
......@@ -346,17 +311,19 @@ class OpTest(unittest.TestCase):
"Output (" + out_name +
") has different lod at " + str(place))
def check_output(self, atol=1e-5):
places = [core.CPUPlace()]
def _get_places(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0))
return places
def check_output(self, atol=1e-5):
places = self._get_places()
for place in places:
self.check_output_with_place(place, atol)
def check_output_customized(self, checker):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0))
places = self._get_places()
for place in places:
outs = self.calc_output(place)
outs = [np.array(out) for out in outs]
......@@ -389,9 +356,7 @@ class OpTest(unittest.TestCase):
in_place=False,
max_relative_error=0.005,
user_defined_grads=None):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0))
places = self._get_places()
for place in places:
self.check_grad_with_place(place, inputs_to_check, output_names,
no_grad_set, numeric_grad_delta,
......@@ -438,35 +403,6 @@ class OpTest(unittest.TestCase):
max_relative_error,
"Gradient Check On %s" % str(place))
@staticmethod
def _create_var_descs_(block, var_dict):
# FIXME: Try unify with `append_input_output`
for param_name in var_dict:
var = var_dict[param_name]
if not isinstance(var, list) and not isinstance(var, tuple):
var = [(param_name, var, None)]
if not isinstance(var[0], list) and not isinstance(var[0], tuple):
var = [(param_name, var[0], var[1])]
for i, item in enumerate(var):
if not isinstance(item[0], basestring):
item = [[param_name] + list(item)]
if len(item) == 2:
if isinstance(item[1], tuple):
var[i] = [item[0], item[1][0], item[1][1]]
else:
# only set var name and value, set lod to None
var[i] = list(item) + [None]
var_descs = [(block.create_var(
name=name, shape=each.shape, dtype=each.dtype), each, lod)
for name, each, lod in var]
yield param_name, var_descs
@staticmethod
def _merge_list(iterable):
return reduce(lambda a, b: list(a) + list(b), iterable, [])
@staticmethod
def _numpy_to_lod_tensor(np_value, lod, place):
tensor = core.LoDTensor()
......@@ -497,83 +433,31 @@ class OpTest(unittest.TestCase):
input.dtype = np.uint16
return input
def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
def _get_gradient(self,
input_to_check,
place,
output_names,
no_grad_set,
parallel=False):
prog = Program()
block = prog.global_block()
inputs_with_np = {
key: value
for (key, value) in OpTest._create_var_descs_(
block, getattr(self, 'inputs', {}))
}
outputs_with_np = {
key: val
for (key, val) in OpTest._create_var_descs_(
block, getattr(self, 'outputs', {}))
}
inputs = {
k: [item[0] for item in inputs_with_np[k]]
for k in inputs_with_np
}
outputs = {
k: [item[0] for item in outputs_with_np[k]]
for k in outputs_with_np
}
op = block.append_op(
type=self.op_type,
inputs=inputs,
outputs=outputs,
attrs=getattr(self, 'attrs', {}))
# infer variable type and infer shape in compile-time
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
mean_inputs = map(block.var, output_names)
if len(mean_inputs) == 1:
loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
op = block.append_op(
inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
else:
avg_sum = []
for cur_loss in mean_inputs:
cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
op = block.append_op(
inputs={"X": [cur_loss]},
outputs={"Out": [cur_avg_loss]},
type="mean")
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
avg_sum.append(cur_avg_loss)
loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
op_sum = block.append_op(
inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
op_sum.desc.infer_var_type(block.desc)
op_sum.desc.infer_shape(block.desc)
loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
op_loss = block.append_op(
inputs={"X": loss_sum},
outputs={"Out": loss},
type='scale',
attrs={'scale': 1.0 / float(len(avg_sum))})
op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc)
self._append_ops(block)
loss = append_loss_ops(block, output_names)
param_grad_list = append_backward(
loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
feed_dict = {
item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
for p_name in inputs_with_np for item in inputs_with_np[p_name]
}
inputs = self._get_inputs(block)
feed_dict = self.feed_var(inputs, place)
fetch_list = [g for p, g in param_grad_list]
executor = Executor(place)
if parallel:
use_cuda = False
if isinstance(place, fluid.CUDAPlace(0)):
use_cuda = True
executor = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=loss.name, main_program=program)
else:
executor = Executor(place)
return map(np.array,
executor.run(prog, feed_dict, fetch_list,
return_numpy=False))
......@@ -194,107 +194,104 @@ class TestLstmOp(OpTest):
['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
class TestLstmOpHasInitial(TestLstmOp):
def set_argument(self):
self.lod = [[0, 2, 5, 7]]
self.D = 16
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.has_initial_state = True
self.is_reverse = True
self.use_peepholes = True
def test_check_grad(self):
# TODO(qingqing) remove folowing lines after the check_grad is refined.
N = len(self.lod[0]) - 1
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
max_relative_error=5e-4)
def test_check_grad_ingore_bias(self):
N = len(self.lod[0]) - 1
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight'], ['Hidden'],
max_relative_error=5e-4,
no_grad_set=set('Bias'))
def test_check_grad_ingore_weight(self):
N = len(self.lod[0]) - 1
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Bias'], ['Hidden'],
max_relative_error=5e-4,
no_grad_set=set('Weight'))
def test_check_grad_ingore_input(self):
N = len(self.lod[0]) - 1
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Weight', 'Bias'], ['Hidden'],
max_relative_error=5e-4,
no_grad_set=set('Input'))
def test_check_grad_ingore_h0(self):
N = len(self.lod[0]) - 1
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
max_relative_error=5e-4,
no_grad_set=set('H0'))
def test_check_grad_ingore_c0(self):
N = len(self.lod[0]) - 1
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
max_relative_error=5e-4,
no_grad_set=set('C0'))
class TestLstmOpRerverse(TestLstmOp):
def set_argument(self):
self.lod = [[0, 2, 5, 7]]
self.D = 16
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.has_initial_state = False
self.is_reverse = True
self.use_peepholes = True
class TestLstmOpNotUsePeepholes(TestLstmOp):
def set_argument(self):
self.lod = [[0, 2, 5, 7]]
self.D = 16
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.has_initial_state = False
self.is_reverse = True
self.use_peepholes = False
# class TestLstmOpHasInitial(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = True
# self.is_reverse = True
# self.use_peepholes = True
# def test_check_grad(self):
# # TODO(qingqing) remove folowing lines after the check_grad is refined.
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
# max_relative_error=5e-4)
# def test_check_grad_ingore_bias(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Bias'))
# def test_check_grad_ingore_weight(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Bias'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Weight'))
# def test_check_grad_ingore_input(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Weight', 'Bias'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Input'))
# def test_check_grad_ingore_h0(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('H0'))
# def test_check_grad_ingore_c0(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('C0'))
# class TestLstmOpRerverse(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = False
# self.is_reverse = True
# self.use_peepholes = True
# class TestLstmOpNotUsePeepholes(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = False
# self.is_reverse = True
# self.use_peepholes = False
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
def as_lodtensor(np_array, lod, place):
tensor = core.LoDTensor()
tensor.set(np_value, place)
if lod is not None:
tensor.set_lod(lod)
return tensor
def create_op(scope, op_type, inputs, outputs, attrs):
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
op_role_attr_name = op_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
def __create_var__(name, var_name):
scope.var(var_name).get_tensor()
kwargs[name].append(var_name)
for in_name, in_dup in Operator.get_op_inputs(op_type):
if in_name in inputs:
kwargs[in_name] = []
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, _ = item[0], item[1]
__create_var__(in_name, sub_in_name)
else:
__create_var__(in_name, in_name)
for out_name, out_dup in Operator.get_op_outputs(op_type):
if out_name in outputs:
kwargs[out_name] = []
if out_dup:
sub_out = outputs[out_name]
for item in sub_out:
sub_out_name, _ = item[0], item[1]
__create_var__(out_name, sub_out_name)
else:
__create_var__(out_name, out_name)
for attr_name in Operator.get_op_attr_names(op_type):
if attr_name in attrs:
kwargs[attr_name] = attrs[attr_name]
return Operator(op_type, **kwargs)
def set_input(scope, op, inputs, place):
def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_lod(var[1])
var = var[0]
tensor.set_dims(var.shape)
tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs:
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, sub_in_val = item[0], item[1]
__set_input__(sub_in_name, sub_in_val)
else:
__set_input__(in_name, inputs[in_name])
def append_input_output(block, op_proto, np_list, is_input, dtype):
'''Insert VarDesc and generate Python variable instance'''
proto_list = op_proto.inputs if is_input else op_proto.outputs
def create_var(block, name, np_list, var_proto):
dtype = None
shape = None
lod_level = None
if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name)
else:
np_value = np_list[name]
if isinstance(np_value, tuple):
dtype = np_value[0].dtype
# output shape, lod should be infered from input.
if is_input:
shape = list(np_value[0].shape)
lod_level = len(np_value[1])
else:
dtype = np_value.dtype
if is_input:
shape = list(np_value.shape)
lod_level = 0
return block.create_var(
dtype=dtype, shape=shape, lod_level=lod_level, name=name)
var_dict = {}
for var_proto in proto_list:
var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable:
continue
assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name)
if var_proto.duplicable:
assert isinstance(np_list[var_name], list), \
"Duplicable {} should be set as list".format(var_name)
var_list = []
for (name, np_value) in np_list[var_name]:
var_list.append(
create_var(block, name, {name: np_value}, var_proto))
var_dict[var_name] = var_list
else:
var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
return var_dict
def append_loss_ops(block, output_names):
mean_inputs = map(block.var, output_names)
# for item in mean_inputs:
# print(item)
# print("Item", item.dtype)
if len(mean_inputs) == 1:
loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
op = block.append_op(
inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
else:
avg_sum = []
for cur_loss in mean_inputs:
cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
op = block.append_op(
inputs={"X": [cur_loss]},
outputs={"Out": [cur_avg_loss]},
type="mean")
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
avg_sum.append(cur_avg_loss)
loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
op_sum = block.append_op(
inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
op_sum.desc.infer_var_type(block.desc)
op_sum.desc.infer_shape(block.desc)
loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
op_loss = block.append_op(
inputs={"X": loss_sum},
outputs={"Out": loss},
type='scale',
attrs={'scale': 1.0 / float(len(avg_sum))})
op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc)
return loss
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册