提交 a169067e 编写于 作者: L Luo Tao

Merge branch 'develop' into make_clean

#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Performance for Distributed vgg16
## Test Result
### Hardware Infomation
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Single Node Single Thread
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples/ sec
| PServer Count | 3 | 6 |10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
*The performance gap between Fuild and v2 comes from the network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbos Logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.profiler as profiler
import argparse
import functools
import os
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument('--device_id', type=int, default=0, help="The device id.")
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data order, now only support NCHW.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--local',
type=str2bool,
default=True,
help='Whether to run as local mode.')
args = parser.parse_args()
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def main():
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = accuracy.metrics + accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
# Initialize executor
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
args.device_id)
exe = fluid.Executor(place)
# test
def test(exe):
accuracy.reset(exe)
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data})
return accuracy.eval(exe)
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
accuracy.reset(exe)
with profiler.profiler("CPU", 'total') as prof:
for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics)
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
% (pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = accuracy.eval(exe)
pass_test_acc = test(exe)
print(
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
% (pass_id, num_samples / pass_elapsed, pass_train_acc,
pass_test_acc))
if args.local:
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, "6174"]))
pserver_endpoints = ",".join(eplist)
print("pserver endpoints: ", pserver_endpoints)
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
current_endpoint = os.getenv(
"POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print("starting server side startup")
exe.run(pserver_startup)
print("starting parameter server...")
exe.run(pserver_prog)
elif training_role == "TRAINER":
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
# TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
exe.run(fluid.default_startup_program())
train_loop(exe, trainer_prog)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
print_arguments()
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the channel of input feature is 3
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
......@@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool
:noindex:
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
Norm Layer
==========
......@@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory
:noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Recurrent Layer Group
=====================
......@@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
......@@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Sampling Layers
===============
......@@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
.. _api_v2.layer_costs:
Cost Layers
......@@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
Check Layer
============
......@@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos
:noindex:
Miscs
=====
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
Activation with learnable parameter
===================================
Activation
==========
prelu
--------
.. autoclass:: paddle.v2.layer.prelu
:noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
......@@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14
:members:
:noindex:
wmt16
+++++
.. automodule:: paddle.v2.dataset.wmt16
:members:
:noindex:
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
1. It runs the first matched case, or the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
The above program should print and print only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
......@@ -23,8 +23,8 @@ namespace framework {
template <typename T>
class Channel {
public:
virtual void Send(T*) = 0;
virtual void Receive(T*) = 0;
virtual bool Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
......
......@@ -48,12 +48,12 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
ch->Send(&i); // should not block
EXPECT_EQ(ch->Send(&i), true); // should not block
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
ch->Receive(&out); // should not block
EXPECT_EQ(ch->Receive(&out), true); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
......@@ -67,7 +67,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
ch->Send(&i); // should not block
if (i < buffer_size)
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
else
EXPECT_EQ(ch->Send(&i), false);
sum += i;
}
});
......@@ -84,13 +87,13 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
ch->Send(&i);
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
......@@ -100,6 +103,102 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
// Explicitly close the thread
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Verify that only 1 send was successful
ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
......@@ -114,7 +213,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
t[i] = std::thread(
[&](bool *p) {
int data;
ch->Receive(&data);
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
......@@ -155,7 +254,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
t[i] = std::thread(
[&](bool *p) {
int data = 10;
ch->Send(&data);
EXPECT_EQ(ch->Send(&data), false);
*p = true;
},
&thread_ended[i]);
......@@ -207,3 +306,37 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
t.join();
delete ch;
}
TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
unsigned sum_receive = 0;
// The receiver should block after 5
// iterations, since there are only 5 senders.
std::thread t([&]() {
for (int i = 0; i < 8; i++) {
int recv;
ch->Receive(&recv); // should block after the fifth iteration.
EXPECT_EQ(recv, i);
sum_receive += i;
}
});
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
for (int i = 5; i < 8; i++) {
ch->Send(&i);
sum_send += i;
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 28U);
EXPECT_EQ(sum_receive, 28U);
delete ch;
}
......@@ -30,8 +30,8 @@ class Buffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual void Send(T*);
virtual void Receive(T*);
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
......@@ -48,39 +48,43 @@ class Buffered : public paddle::framework::Channel<T> {
PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllSenders(std::unique_lock<std::mutex>*);
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
template <typename T>
void Buffered<T>::Send(T* item) {
bool Buffered<T>::Send(T* item) {
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
bool ret = false;
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
ret = true;
}
return ret;
}
template <typename T>
void Buffered<T>::Receive(T* item) {
bool Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
bool ret = false;
if (!closed_) {
*item = std::move(channel_.front());
channel_.pop_front();
NotifyAllSenders(&lock);
} else {
item = nullptr;
full_cond_var_.notify_one();
ret = true;
}
return ret;
}
template <typename T>
void Buffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllSenders(&lock);
NotifyAllParticipants(&lock);
}
template <typename T>
......@@ -88,13 +92,14 @@ Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
NotifyAllSenders(&lock);
NotifyAllParticipants(&lock);
}
template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_all();
empty_cond_var_.notify_all();
}
} // namespace details
......
......@@ -29,8 +29,8 @@ class UnBuffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual void Send(T*);
virtual void Receive(T*);
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
......@@ -57,7 +57,7 @@ class UnBuffered : public paddle::framework::Channel<T> {
// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
void UnBuffered<T>::Send(T* data) {
bool UnBuffered<T>::Send(T* data) {
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
......@@ -66,6 +66,7 @@ void UnBuffered<T>::Send(T* data) {
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
......@@ -74,14 +75,16 @@ void UnBuffered<T>::Send(T* data) {
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
ret = true;
}
writer_found_ = false;
return ret;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
void UnBuffered<T>::Receive(T* data) {
bool UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
......@@ -90,6 +93,7 @@ void UnBuffered<T>::Receive(T* data) {
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
......@@ -98,10 +102,12 @@ void UnBuffered<T>::Receive(T* data) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
ret = true;
}
cv_channel_.notify_one();
}
reader_found_ = false;
return ret;
}
// This function implements the sequence of events
......
......@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasOutputs(const std::string &name) const override;
DDim GetInputDim(const std::string &name) const override;
void SetOutputDim(const std::string &name, const DDim &dim) override;
AttrReader Attrs() const override;
const std::vector<std::string> &Inputs(
......@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
return true;
}
DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
std::vector<DDim> ddims = GetInputsDim(name);
auto length = ddims.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have 1 value, "
"but it has %d now",
name, length);
return ddims[0];
}
void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
const DDim &dim) {
SetOutputsDim(name, {dim});
}
AttrReader CompileTimeInferShapeContext::Attrs() const {
return AttrReader(op_.GetAttrMap());
}
......
......@@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
return true;
}
DDim GetInputDim(const std::string& name) const override {
return GetDim(op_.Input(name));
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
SetDim(op_.Output(name), dim);
}
AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector<std::string>& Inputs(
......
......@@ -18,10 +18,18 @@ limitations under the License. */
namespace paddle {
namespace framework {
std::vector<framework::DDim> InferShapeContext::GetInputsDim(
DDim InferShapeContext::GetInputDim(const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, arg_names.size());
return this->GetDim(arg_names[0]);
}
std::vector<DDim> InferShapeContext::GetInputsDim(
const std::string &name) const {
const std::vector<std::string> &names = Inputs(name);
return GetDims(names);
const std::vector<std::string> &arg_names = Inputs(name);
return GetDims(arg_names);
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
......@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
return this->GetDim(names[idx]);
}
void InferShapeContext::SetOutputsDim(
const std::string &name, const std::vector<framework::DDim> &dims) {
void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
auto &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, arg_names.size());
SetDim(arg_names[0], dim);
}
void InferShapeContext::SetOutputsDim(const std::string &name,
const std::vector<DDim> &dims) {
auto &names = Outputs(name);
SetDims(names, dims);
}
std::vector<framework::DDim> InferShapeContext::GetDims(
std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const {
std::vector<framework::DDim> ret;
std::vector<DDim> ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
void InferShapeContext::SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims) {
const std::vector<DDim> &dims) {
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
......
......@@ -35,14 +35,13 @@ class InferShapeContext {
virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0;
virtual framework::DDim GetInputDim(const std::string &name) const = 0;
DDim GetInputDim(const std::string &name) const;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
std::vector<DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
void SetOutputsDim(const std::string &name,
const std::vector<framework::DDim> &dims);
void SetOutputDim(const std::string &name, const DDim &dim);
void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs(
......@@ -57,15 +56,13 @@ class InferShapeContext {
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
const std::vector<DDim> &dims);
protected:
virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const;
virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const DDim &dim) = 0;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
......
......@@ -158,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
# Regist multiple Kernel to pybind
if (WITH_GPU)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
vol2col depthwise_conv)
op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchIndices"),
"Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchDist"),
"Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
auto dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDis", dims);
ctx->SetOutputDim("ColToRowMatchDist", dims);
}
};
......@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input<LoDTensor>("DistMat");
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
......@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise, it means B[j] is matched to row "
"ColToRowMatchIndices[i][j] in i-th instance. The row number of "
"i-th instance is saved in ColToRowMatchIndices[i][j].");
AddOutput("ColToRowMatchDis",
AddOutput("ColToRowMatchDist",
"(Tensor) A 2-D Tensor with shape [N, M] in float type. "
"N is batch size. If ColToRowMatchIndices[i][j] is -1, "
"ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchIndices[i][j] = d, and the row offsets of each "
"instance are called LoD. Then "
"ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
"ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
AddComment(R"DOC(
This operator is a greedy bipartite matching algorithm, which is used to
obtain the matching with the maximum distance based on the input
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
namespace paddle {
namespace operators {
class BoxCoderOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
"Input(PriorBox) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
"Input(PriorBoxVar) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
"Input(TargetBox) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
"Output(OutputBox) of BoxCoderOp should not be null.");
auto prior_box_dims = ctx->GetInputDim("PriorBox");
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
auto target_box_dims = ctx->GetInputDim("TargetBox");
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
"The rank of Input of PriorBoxVar must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
ctx->SetOutputDim(
"OutputBox",
framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
}
};
class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"PriorBox",
"(Tensor, default Tensor<float>) "
"Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
"each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the anchor box, "
"if the input is image feature map, they are close to the origin "
"of the coordinate system. [xmax, ymax] is the right bottom "
"coordinate of the anchor box.");
AddInput("PriorBoxVar",
"(Tensor, default Tensor<float>) "
"PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
"of variance.");
AddInput(
"TargetBox",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the box if the input "
"is image feature map, they are close to the origin of the coordinate "
"system. [xmax, ymax] is the right bottom coordinate of the box. "
"This tensor can contain LoD information to represent a batch "
"of inputs. One instance of this batch can contain different "
"numbers of entities.");
AddAttr<std::string>("code_type",
"(string, default encode_center_size) "
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
AddOutput(
"OutputBox",
"(LoDTensor or Tensor) "
"(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
"representing the result of N target boxes encoded/decoded with "
"M Prior boxes and variances.");
AddComment(R"DOC(
Bounding Box Coder Operator.
Encode/Decode the target bounding box with the priorbox information.
The Encoding schema described below:
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = log(abs(tw / pw)) / pwv
oh = log(abs(th / ph)) / phv
The Decoding schema described below:
ox = (pw * pxv * tx * + px) - tw / 2
oy = (ph * pyv * ty * + py) - th / 2
ow = exp(pwv * tw) * pw + tw / 2
oh = exp(phv * th) * ph + th / 2
where tx, ty, tw, th denote the target box's center coordinates, width and
height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
width and height.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
ops::BoxCoderKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
prior_box_data[col_idx * len + 1]) /
2;
T target_box_center_x =
(target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
2;
T target_box_center_y = (target_box_data[row_idx * len + 3] +
target_box_data[row_idx * len + 1]) /
2;
T target_box_width =
target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
T target_box_height =
target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
output[idx * len] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[col_idx * len];
output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
prior_box_height /
prior_box_var_data[col_idx * len + 1];
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
prior_box_var_data[col_idx * len + 2];
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
prior_box_var_data[col_idx * len + 3];
}
}
template <typename T>
__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
prior_box_data[col_idx * len + 1]) /
2;
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
target_box_data[row_idx * len + 2]) *
prior_box_width;
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
target_box_data[row_idx * len + 3]) *
prior_box_height;
T target_box_center_x = prior_box_var_data[col_idx * len] *
target_box_data[row_idx * len] *
prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
target_box_data[row_idx * len + 1] *
prior_box_height +
prior_box_center_y;
output[idx * len] = target_box_center_x - target_box_width / 2;
output[idx * len + 1] = target_box_center_y - target_box_height / 2;
output[idx * len + 2] = target_box_center_x + target_box_width / 2;
output[idx * len + 3] = target_box_center_y + target_box_height / 2;
}
}
template <typename T>
class BoxCoderCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
"Only support 1 level of LoD.");
}
auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
auto len = prior_box->dims()[1];
int block = 512;
int grid = (row * col + block - 1) / block;
auto& device_ctx = context.cuda_device_context();
const T* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = prior_box_var->data<T>();
const T* target_box_data = target_box->data<T>();
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>();
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
ops::BoxCoderCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) {
if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") {
return BoxCodeType::kDecodeCenterSize;
}
PADDLE_THROW("Not support type %s.", type);
}
template <typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
public:
void EncodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
auto* target_box_data = target_box.data<T>();
auto* prior_box_data = prior_box.data<T>();
auto* prior_box_var_data = prior_box_var.data<T>();
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x =
(target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
T target_box_center_y =
(target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
T target_box_width =
target_box_data[i * len + 2] - target_box_data[i * len];
T target_box_height =
target_box_data[i * len + 3] - target_box_data[i * len + 1];
size_t offset = i * col * len + j * len;
output[offset] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[j * len];
output[offset + 1] = (target_box_center_y - prior_box_center_y) /
prior_box_height / prior_box_var_data[j * len + 1];
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width)) /
prior_box_var_data[j * len + 2];
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)) /
prior_box_var_data[j * len + 3];
}
}
}
void DecodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
auto* target_box_data = target_box.data<T>();
auto* prior_box_data = prior_box.data<T>();
auto* prior_box_var_data = prior_box_var.data<T>();
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x = prior_box_var_data[j * len] *
target_box_data[i * len] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[j * len + 1] *
target_box_data[i * len + 1] *
prior_box_height +
prior_box_center_y;
T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
target_box_data[i * len + 2]) *
prior_box_width;
T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
target_box_data[i * len + 3]) *
prior_box_height;
size_t offset = i * col * len + j * len;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
output[offset + 3] = target_box_center_y + target_box_height / 2;
}
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
"Only support 1 level of LoD.");
}
auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
auto len = prior_box->dims()[1];
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
T* output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -54,7 +54,15 @@ class CompareOpKernel
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEM_TYPE;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
using Tensor = framework::Tensor;
auto* x = context.Input<Tensor>("X");
auto* y = context.Input<Tensor>("Y");
auto* z = context.Output<Tensor>("Out");
z->mutable_data<T>(context.GetPlace());
int axis = context.Attr<int>("axis");
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
z);
}
};
......
......@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad);
// depthwise convolution op
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -16,6 +16,16 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/depthwise_conv.h"
#include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h"
......@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
}
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_EQ(
output->dims()[1] % input->dims()[1], 0,
"The output channels must be a multiple of the input channels");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, input_grad);
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
filter_grad);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T>
class ElementwiseAddGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
ElementwiseAddBroadCastGradFunctor<T>,
ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
ElementwiseAddBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
}
};
......
......@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseDivKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T>
class ElementwiseDivGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
ElementwiseDivBroadCastGradFunctor<T>,
ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
ElementwiseDivBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
}
};
......
......@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
ElementwiseMaxBroadCastGradFunctor<T>,
ElementwiseMaxBroadCast2GradFunctor<T>>(ctx);
ElementwiseMaxBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
}
};
......
......@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMinKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMinGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
ElementwiseMinBroadCastGradFunctor<T>,
ElementwiseMinBroadCast2GradFunctor<T>>(ctx);
ElementwiseMinBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
}
};
......
......@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMulKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
ElementwiseMulBroadCastGradFunctor<T>,
ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
ElementwiseMulBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
}
};
......
......@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor>
void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, int axis,
framework::Tensor* dx, framework::Tensor* dy) {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto x_dims = x->dims();
auto y_dims = y->dims();
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
}
......@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
x_dims = framework::make_ddim(extended_dims);
}
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
int pre, n, post;
......@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
template <typename Functor, typename DeviceContext, typename T,
typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<OutType>(ctx.GetPlace());
void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y, int axis,
framework::Tensor* z) {
TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), Functor());
......@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
x_dims = framework::make_ddim(extended_dims);
}
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
......
......@@ -29,7 +29,14 @@ template <typename DeviceContext, typename T>
class ElementwisePowKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......
......@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
class ElementwiseSubKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
......@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T>
class ElementwiseSubGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
ElementwiseSubBroadCastGradFunctor<T>,
ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
ElementwiseSubBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
}
};
......
......@@ -8,6 +8,7 @@ if(WITH_GPU)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
......
/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/depthwise_conv.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
// A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format.
template <typename T>
__global__ void KernelDepthwiseConv(
const int nthreads, const T* const input_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, T* const output_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / output_channels / output_height / output_width;
const int c_out = (index / output_height / output_width) % output_channels;
const int h_out = (index / output_width) % output_height;
const int w_out = index % output_width;
const int c_in = c_out / filter_multiplier;
const T* weight = filter_data + c_out * filter_height * filter_width;
T value = 0;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
value +=
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
input_data[offset];
}
}
output_data[index] = value;
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T>
__global__ void KernelDepthwiseConvInputGrad(
const int nthreads, const T* const output_grad_data,
const T* const filter_data, const int batch_size, const int output_channels,
const int output_height, const int output_width, const int input_channels,
const int input_height, const int input_width, const int filter_multiplier,
const int filter_height, const int filter_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* const input_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / input_channels / input_height / input_width;
const int c_in = (index / input_height / input_width) % input_channels;
const int h_in = (index / input_width) % input_height;
const int w_in = index % input_width;
const int c_out_start = c_in * filter_multiplier;
int h_out_start =
(h_in - filter_height + padding_height + stride_height) / stride_height;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
int h_out_end = (h_in + padding_height) / stride_height;
h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
int w_out_start =
(w_in - filter_width + padding_width + stride_width) / stride_width;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
int w_out_end = (w_in + padding_width) / stride_width;
w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
T value = 0;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
c_out++) {
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
const int filter_h = h_in + padding_height - h_out * stride_height;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
const int filter_w = w_in + padding_width - w_out * stride_width;
const int filter_offset = c_out * filter_height * filter_width +
filter_h * filter_width + filter_w;
const int output_grad_offset =
((batch * output_channels + c_out) * output_height + h_out) *
output_width +
w_out;
value +=
output_grad_data[output_grad_offset] * filter_data[filter_offset];
}
}
}
input_grad_data[index] += value;
}
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T>
__global__ void KernelDepthwiseConvFilterGrad(
const int nthreads, const T* const output_grad_data,
const T* const input_data, const int num, const int output_channels,
const int output_height, const int output_width, const int input_channels,
const int input_height, const int input_width, const int filter_multiplier,
const int filter_height, const int filter_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* const filter_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int w_out = index % output_width;
const int h_out = (index / output_width) % output_height;
const int c_out = (index / output_width / output_height) % output_channels;
const int batch = (index / output_width / output_height / output_channels);
const int c_in = c_out / filter_multiplier;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end =
-padding_height + h_out * stride_height + filter_height;
const int w_in_end = -padding_width + w_out * stride_width + filter_width;
const int in_offset =
(batch * input_channels + c_in) * input_height * input_width;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + (h_in - h_in_start) * filter_width +
(w_in - w_in_start);
paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
}
}
}
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
template <class T>
class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = filter.dims()[2];
const int ksize_width = filter.dims()[3];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
output_data);
}
};
template <typename T>
class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output_grad.dims()[1];
const int output_height = output_grad.dims()[2];
const int output_width = output_grad.dims()[3];
const int ksize_height = filter.dims()[2];
const int ksize_width = filter.dims()[3];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
input_grad_data);
}
};
template <typename T>
class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* filter_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output_grad.dims()[1];
const int output_height = output_grad.dims()[2];
const int output_width = output_grad.dims()[3];
const int ksize_height = filter_grad->dims()[2];
const int ksize_width = filter_grad->dims()[3];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, input_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
filter_grad_data);
}
};
template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
float>;
template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
double>;
template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
float>;
template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext, typename T>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad);
};
template <typename DeviceContext, typename T>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* filter_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
enum MiningType { kNone = 0, kMaxNegative, kHardExample };
template <typename T>
bool SortScoreDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}
inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
const float match_dist,
const float neg_dist_threshold) {
if (mining_type == MiningType::kMaxNegative) {
return match_idx == -1 && match_dist < neg_dist_threshold;
} else if (mining_type == MiningType::kHardExample) {
return true;
} else {
return false;
}
}
inline MiningType GetMiningType(std::string str) {
if (str == "max_negative") {
return MiningType::kMaxNegative;
} else if (str == "hard_example") {
return MiningType::kHardExample;
} else {
return MiningType::kNone;
}
}
template <typename DeviceContext, typename T>
class MineHardExamplesKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
T neg_dist_threshold =
static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
int sample_size = ctx.Attr<int>("sample_size");
MiningType mining_type =
GetMiningType(ctx.Attr<std::string>("mining_type"));
auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
auto out_match_indices =
ctx.Output<framework::Tensor>("UpdatedMatchIndices");
framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
int batch_size = in_matched_indices->dims()[0];
int prior_num = in_matched_indices->dims()[1];
auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
auto match_indices_et =
framework::EigenMatrix<int>::From(*out_match_indices);
auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
const T* cls_loss = in_cls_loss->data<T>();
const T* loc_loss = nullptr;
if (in_loc_loss) {
loc_loss = in_loc_loss->data<T>();
}
std::vector<std::vector<int>> all_neg_indices;
std::vector<size_t> batch_starts = {0};
for (int n = 0; n < batch_size; ++n) {
std::vector<std::pair<T, size_t>> loss_idx;
int neg_sel = 0;
for (int m = 0; m < prior_num; ++m) {
if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
neg_dist_threshold)) {
T loss = cls_loss[n * prior_num + m];
if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
}
loss_idx.push_back(std::make_pair(loss, m));
++neg_sel;
}
}
if (mining_type == MiningType::kMaxNegative) {
int num_pos = 0;
for (int m = 0; m < prior_num; ++m) {
if (match_indices(n, m) != -1) ++num_pos;
}
neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
} else if (mining_type == MiningType::kHardExample) {
neg_sel = std::min(sample_size, neg_sel);
}
std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
std::set<int> sel_indices;
std::vector<int> neg_indices;
std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
std::inserter(sel_indices, sel_indices.begin()),
[](std::pair<T, size_t>& l) -> int {
return static_cast<int>(l.second);
});
if (mining_type == MiningType::kHardExample) {
for (int m = 0; m < prior_num; ++m) {
if (match_indices(n, m) > -1) {
if (sel_indices.find(m) == sel_indices.end()) {
match_indices_et(n, m) = -1;
}
} else {
if (sel_indices.find(m) != sel_indices.end()) {
neg_indices.push_back(m);
}
}
}
} else {
neg_indices.resize(sel_indices.size());
std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
}
all_neg_indices.push_back(neg_indices);
batch_starts.push_back(batch_starts.back() + neg_indices.size());
}
framework::LoD out_neg_indices_lod;
out_neg_indices_lod.emplace_back(batch_starts);
int neg_offset = 0;
auto neg_data = out_neg_indices->mutable_data<int>(
framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
ctx.GetPlace());
for (auto neg_indices : all_neg_indices) {
std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
neg_offset += neg_indices.size();
}
out_neg_indices->set_lod(out_neg_indices_lod);
return;
}
};
class MineHardExamplesOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
"Input(ClsLoss) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("MatchIndices"),
"Input(MatchIndices) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("MatchDist"),
"Input(MatchDist) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("NegIndices"),
"Output(NegIndices) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
"Output(UpdatedMatchIndices) of MineHardExamplesOp should "
"not be null.");
auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
auto idx_dims = ctx->GetInputDim("MatchIndices");
auto dis_dims = ctx->GetInputDim("MatchDist");
PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
"The shape of ClsLoss is [N, Np].");
PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
"The shape of MatchIndices is [N, Np].");
PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
"The shape of MatchDist is [N, Np].");
if (ctx->HasInput("LocLoss")) {
auto loc_loss_dims = ctx->GetInputDim("LocLoss");
PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
"The shape of LocLoss is [N, Np].");
PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
"Batch size of ClsLoss and LocLoss must be the same.");
PADDLE_ENFORCE_EQ(
cls_loss_dims[1], loc_loss_dims[1],
"Prior box number of ClsLoss and LocLoss must be the same.");
}
PADDLE_ENFORCE_EQ(
cls_loss_dims[0], idx_dims[0],
"Batch size of ClsLoss and MatchIndices must be the same.");
PADDLE_ENFORCE_EQ(
cls_loss_dims[1], idx_dims[1],
"Prior box number of ClsLoss and MatchIndices must be the same.");
PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
"Batch size of ClsLoss and MatchDist must be the same.");
PADDLE_ENFORCE_EQ(
cls_loss_dims[1], idx_dims[1],
"Prior box number of ClsLoss and MatchDist must be the same.");
auto mining_type =
GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
"mining_type must be hard_example or max_negative");
if (mining_type == MiningType::kMaxNegative) {
auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
PADDLE_ENFORCE_GT(
neg_pos_ratio, 0.0f,
"neg_pos_ratio must greater than zero in max_negative mode");
PADDLE_ENFORCE_GT(
neg_dist_threshold, 0.0f,
"neg_dist_threshold must greater than zero in max_negative mode");
} else if (mining_type == MiningType::kHardExample) {
auto sample_size = ctx->Attrs().Get<int>("sample_size");
PADDLE_ENFORCE_GT(
sample_size, 0,
"sample_size must greater than zero in hard_example mode");
}
ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
ctx.device_context());
}
};
class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"ClsLoss",
"(Tensor, default Tensor<float>), The classification loss with shape "
"[N, Np], N is the batch size and Np is the number of prior box.");
AddInput("LocLoss",
"(Tensor, optional, default Tensor<float>), The localization loss "
"with shape [N, Np], N is the batch size and Np is the number of "
"prior box.")
.AsDispensable();
AddInput("MatchIndices",
"(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
"the batch size and Np is the number of prior box. "
"MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
"instance does not match any entity, otherwise means it is "
"matched to row.");
AddInput("MatchDist",
"(Tensor, default Tensor<float>) Matched indices with shape [N, "
"Np], N is the batch size and Np is the number of prior box.");
AddAttr<float>("neg_pos_ratio",
"(float) The ratio of the negative box to the positive "
"box. Use only when mining_type is max_negative.")
.SetDefault(1.0);
AddAttr<float>("neg_dist_threshold",
"(float) The negative overlap upper bound for the unmatched "
"predictions. Use only when mining_type is max_negative.")
.SetDefault(0.5);
AddAttr<int>("sample_size",
"(float) The max sample size of negative box. Use only when "
"mining_type is hard_example.")
.SetDefault(0);
AddAttr<std::string>("mining_type",
"(float) The mining algorithm name, the value is "
"hard_example or max_negative.")
.SetDefault("max_negative")
.InEnum({"hard_example", "max_negative"});
AddOutput(
"NegIndices",
"(LoDTensor<int>) The output of negative example indices. a LoDTensor "
"with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
"and each element is the prior box index. "
"For example, the batch size is 2, the lod is [[0, 1, 2]], "
"the sample 0's box 1(MatchIndices[0][1]) is selected, "
"and sample 1's box 0 is selected. The output NegIndices is "
"[[1], [0]].");
AddOutput("UpdatedMatchIndices",
"(Tensor<int>) The output of updated MatchIndices, a tensor with "
"shape [N, Np]. Only update when mining_type is "
"hard_example. The input MatchIndices elements will be update to "
"-1 when it is not in the candidate high loss list of negative "
"examples.");
AddComment(R"DOC(
Mine hard examples Operator.
This operator implements hard example mining to select a subset of negative box indices.
For each image, selects the box with highest losses. subject to the condition that the
box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative.
The selected number is min(sample_size, max_negative_box_number) when mining_type is
hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number)
when mining_type is max_negative, where the max_negative_box_number is the count of
MatchIndices elements with value -1.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
ops::MineHardExamplesOpMaker);
REGISTER_OP_CPU_KERNEL(
mine_hard_examples,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
constexpr int64_t kOutputDim = 6;
constexpr int64_t kBBoxSize = 4;
class MultiClassNMSOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
"Input(BBoxes) of MultiClassNMS should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scores"),
"Input(Scores) of MultiClassNMS should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MultiClassNMS should not be null.");
auto box_dims = ctx->GetInputDim("BBoxes");
auto score_dims = ctx->GetInputDim("Scores");
PADDLE_ENFORCE_EQ(box_dims.size(), 2,
"The rank of Input(BBoxes) must be 2.");
PADDLE_ENFORCE_EQ(score_dims.size(), 3,
"The rank of Input(Scores) must be 3.");
PADDLE_ENFORCE_EQ(box_dims[1], 4,
"The 2nd dimension of Input(BBoxes) must be 4, "
"represents the layout of coordinate "
"[xmin, ymin, xmax, ymax]");
PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
"The 1st dimensiong of Input(BBoxes) must be equal to "
"3rd dimension of Input(Scores), which represents the "
"predicted bboxes.");
// Here the box_dims[0] is not the real dimension of output.
// It will be rewritten in the computing kernel.
ctx->SetOutputDim("Out", {box_dims[0], 6});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("Scores")->type()),
ctx.device_context());
}
};
template <class T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}
template <class T>
static inline void GetMaxScoreIndex(
const std::vector<T>& scores, const T threshold, int top_k,
std::vector<std::pair<T, int>>* sorted_indices) {
for (size_t i = 0; i < scores.size(); ++i) {
if (scores[i] > threshold) {
sorted_indices->push_back(std::make_pair(scores[i], i));
}
}
// Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
SortScorePairDescend<int>);
// Keep top_k scores if needed.
if (top_k > -1 && top_k < sorted_indices->size()) {
sorted_indices->resize(top_k);
}
}
template <class T>
static inline T BBoxArea(const T* box, const bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static inline T JaccardOverlap(const T* box1, const T* box2,
const bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <typename T>
class MultiClassNMSKernel : public framework::OpKernel<T> {
public:
void NMSFast(const Tensor& bbox, const Tensor& scores,
const T score_threshold, const T nms_threshold, const T eta,
const int64_t top_k, std::vector<int>* selected_indices) const {
// The total boxes for each instance.
int64_t num_boxes = bbox.dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox.dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices;
GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
selected_indices->clear();
T adaptive_threshold = nms_threshold;
const T* bbox_data = bbox.data<T>();
while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second;
bool keep = true;
for (int k = 0; k < selected_indices->size(); ++k) {
if (keep) {
const int kept_idx = (*selected_indices)[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, true);
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
selected_indices->push_back(idx);
}
sorted_indices.erase(sorted_indices.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
void MultiClassNMS(const framework::ExecutionContext& ctx,
const Tensor& scores, const Tensor& bboxes,
std::map<int, std::vector<int>>& indices,
int& num_nmsed_out) const {
int64_t background_label = ctx.Attr<int>("background_label");
int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
int64_t class_num = scores.dims()[0];
int64_t predict_dim = scores.dims()[1];
int num_det = 0;
for (int64_t c = 0; c < class_num; ++c) {
if (c == background_label) continue;
Tensor score = scores.Slice(c, c + 1);
NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
&(indices[c]));
num_det += indices[c].size();
}
num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (const auto& it : indices) {
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& label_indices = it.second;
for (int j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j];
PADDLE_ENFORCE_LT(idx, predict_dim);
score_index_pairs.push_back(
std::make_pair(sdata[idx], std::make_pair(label, idx)));
}
}
// Keep top k results per image.
std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
SortScorePairDescend<std::pair<int, int>>);
score_index_pairs.resize(keep_top_k);
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
for (int j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx);
}
new_indices.swap(indices);
num_nmsed_out = keep_top_k;
}
}
void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
std::map<int, std::vector<int>>& selected_indices,
Tensor* outs) const {
int predict_dim = scores.dims()[1];
auto* scores_data = scores.data<T>();
auto* bboxes_data = bboxes.data<T>();
auto* odata = outs->data<T>();
int count = 0;
for (const auto& it : selected_indices) {
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& indices = it.second;
for (int j = 0; j < indices.size(); ++j) {
int idx = indices[j];
const T* bdata = bboxes_data + idx * kBBoxSize;
odata[count * kOutputDim] = label; // label
odata[count * kOutputDim + 1] = sdata[idx]; // score
// xmin, ymin, xmax, ymax
std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
count++;
}
}
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* boxes = ctx.Input<Tensor>("BBoxes");
auto* scores = ctx.Input<Tensor>("Scores");
auto* outs = ctx.Output<LoDTensor>("Out");
auto score_dims = scores->dims();
int64_t batch_size = score_dims[0];
int64_t class_num = score_dims[1];
int64_t predict_dim = score_dims[2];
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0};
for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
std::map<int, std::vector<int>> indices;
int num_nmsed_out = 0;
MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
all_indices.push_back(indices);
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
int num_kept = batch_starts.back();
if (num_kept == 0) {
T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
od[0] = -1;
} else {
outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
int64_t s = batch_starts[i];
int64_t e = batch_starts[i + 1];
if (e > s) {
Tensor out = outs->Slice(s, e);
MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
}
}
}
framework::LoD lod;
lod.emplace_back(batch_starts);
outs->set_lod(lod);
}
};
class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("BBoxes",
"(Tensor) A 2-D Tensor with shape [M, 4] represents the "
"predicted locations of M bounding bboxes. Each bounding box "
"has four coordinate values and the layout is "
"[xmin, ymin, xmax, ymax].");
AddInput("Scores",
"(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
"predicted confidence predictions. N is the batch size, C is the "
"class number, M is number of bounding boxes. For each category "
"there are total M scores which corresponding M bounding boxes. "
" Please note, M is equal to the 1st dimension of BBoxes. ");
AddAttr<int>(
"background_label",
"(int64_t, defalut: 0) "
"The index of background label, the background label will be ignored. "
"If set to -1, then all categories will be considered.")
.SetDefault(0);
AddAttr<float>("score_threshold",
"(float) "
"Threshold to filter out bounding boxes with low "
"confidence score. If not provided, consider all boxes.");
AddAttr<int>("nms_top_k",
"(int64_t) "
"Maximum number of detections to be kept according to the "
"confidences aftern the filtering detections based on "
"score_threshold");
AddAttr<float>("nms_threshold",
"(float, defalut: 0.3) "
"The threshold to be used in NMS.")
.SetDefault(0.3);
AddAttr<float>("nms_eta",
"(float) "
"The parameter for adaptive NMS.")
.SetDefault(1.0);
AddAttr<int>("keep_top_k",
"(int64_t) "
"Number of total bboxes to be kept per image after NMS "
"step. -1 means keeping all bboxes after NMS step.");
AddOutput("Out",
"(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
"detections. Each row has 6 values: "
"[label, confidence, xmin, ymin, xmax, ymax], No is the total "
"number of detections in this mini-batch. For each instance, "
"the offsets in first dimension are called LoD, the number of "
"offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
"no detected bbox.");
AddComment(R"DOC(
This operator is to do multi-class non maximum suppression (NMS) on a batched
of boxes and scores.
In the NMS step, this operator greedily selects a subset of detection bounding
boxes that have high scores larger than score_threshold, if providing this
threshold, then selects the largest nms_top_k confidences scores if nms_top_k
is larger than -1. Then this operator pruns away boxes that have high IOU
(intersection over union) overlap with already selected boxes by adaptive
threshold NMS based on parameters of nms_threshold and nms_eta.
Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1.
This operator support multi-class and batched inputs. It applying NMS
independently for each class. The outputs is a 2-D LoDTenosr, for each
image, the offsets in first dimension of LoDTensor are called LoD, the number
of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
means there is no detected bbox for this image. If there is no detected boxes
for all images, all the elements in LoD are 0, and the Out only contains one
value which is -1.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
ops::MultiClassNMSOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
ops::MultiClassNMSKernel<double>);
......@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
};
break;
default:
sorted_domain = "event end time";
sorted_domain = "event first end time";
}
std::vector<std::vector<EventItem>> events_table;
......
......@@ -12,10 +12,202 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
_vartype2str_ = [
"UNK",
"LoDTensor",
"SelectedRows",
"FeedMinibatch",
"FetchList",
"StepScopes",
"LodRankTable",
"LoDTensorArray",
"PlaceList",
]
_dtype2str_ = [
"bool",
"int16",
"int32",
"int64",
"float16",
"float32",
"float64",
]
def repr_data_type(type):
return _dtype2str_[type]
def repr_tensor(proto):
return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
str(proto.dims))
reprtpl = "{ttype} {name} ({reprs})"
def repr_lodtensor(proto):
if not proto.lod_tensor: return
level = proto.lod_tensor.lod_level
reprs = repr_tensor(proto.lod_tensor.tensor)
return reprtpl.format(
ttype="LoDTensor" if level > 0 else "Tensor",
name=proto.name,
reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
def repr_selected_rows(proto):
if not proto.selected_rows: return
return reprtpl.format(
ttype="SelectedRows",
name=proto.name,
reprs=repr_tensor(proto.selected_rows))
def repr_tensor_array(proto):
if not proto.tensor_array: return
return reprtpl.format(
ttype="TensorArray",
name=proto.name,
reprs="level=%d, %s" % (proto.tensor_array.lod_level,
repr_tensor(proto.lod_tensor)))
type_handlers = [
repr_lodtensor,
repr_selected_rows,
repr_tensor_array,
]
def repr_var(vardesc):
for handler in type_handlers:
res = handler(vardesc)
if res:
return res
def pprint_program_codes(program_desc):
reprs = []
for block_idx in range(program_desc.num_blocks()):
block_desc = program_desc.block(block_idx)
block_repr = pprint_block_codes(block_desc)
reprs.append(block_repr)
return '\n'.join(reprs)
def pprint_block_codes(block_desc, show_backward=False):
def is_op_backward(op_desc):
if op_desc.type.endswith('_grad'): return True
def is_var_backward(var):
if "@GRAD" in var.parameter: return True
for arg in var.arguments:
if "@GRAD" in arg: return True
for var in op_desc.inputs:
if is_var_backward(var): return True
for var in op_desc.outputs:
if is_var_backward(var): return True
return False
def is_var_backward(var_desc):
return "@GRAD" in var_desc.name
if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string())
var_reprs = []
op_reprs = []
for var in block_desc.vars:
if not show_backward and is_var_backward(var):
continue
var_reprs.append(repr_var(var))
for op in block_desc.ops:
if not show_backward and is_op_backward(op): continue
op_reprs.append(repr_op(op))
tpl = "// block-{idx} parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
return tpl.format(
idx=block_desc.idx,
pidx=block_desc.parent_idx,
vars='\n'.join(var_reprs),
ops='\n'.join(op_reprs), )
def repr_attr(desc):
tpl = "{key}={value}"
valgetter = [
lambda attr: attr.i,
lambda attr: attr.f,
lambda attr: attr.s,
lambda attr: attr.ints,
lambda attr: attr.floats,
lambda attr: attr.strings,
lambda attr: attr.b,
lambda attr: attr.bools,
lambda attr: attr.block_idx,
lambda attr: attr.l,
]
key = desc.name
value = valgetter[desc.type](desc)
if key == "dtype":
value = repr_data_type(value)
return tpl.format(key=key, value=str(value)), (key, value)
def _repr_op_fill_constant(optype, inputs, outputs, attrs):
if optype == "fill_constant":
return "{output} = {data} [shape={shape}]".format(
output=','.join(outputs),
data=attrs['value'],
shape=str(attrs['shape']))
op_repr_handlers = [_repr_op_fill_constant, ]
def repr_op(opdesc):
optype = None
attrs = []
attr_dict = {}
is_target = None
inputs = []
outputs = []
tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
for var in opdesc.inputs:
key = var.parameter
value = args2value(var.arguments)
inputs.append("%s=%s" % (key, value))
for var in opdesc.outputs:
value = args2value(var.arguments)
outputs.append(value)
for attr in opdesc.attrs:
attr_repr, attr_pair = repr_attr(attr)
attrs.append(attr_repr)
attr_dict[attr_pair[0]] = attr_pair[1]
is_target = opdesc.is_target
for handler in op_repr_handlers:
res = handler(opdesc.type, inputs, outputs, attr_dict)
if res: return res
return tpl.format(
outputs=', '.join(outputs),
optype=opdesc.type,
inputs=', '.join(inputs),
attrs="{%s}" % ','.join(attrs),
is_target=", is_target" if is_target else "")
def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
'''
......
......@@ -1231,10 +1231,17 @@ def conv2d(input,
"""
if stride is None:
stride = [1, 1]
helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype()
num_channels = input.shape[1]
l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv2d'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
if groups is None:
num_filter_channels = num_channels
else:
......@@ -1267,7 +1274,7 @@ def conv2d(input,
pre_bias = helper.create_tmp_variable(dtype)
helper.append_op(
type='conv2d',
type=l_type,
inputs={
'Input': input,
'Filter': filter_param,
......@@ -1478,7 +1485,9 @@ def batch_norm(input,
param_attr=None,
bias_attr=None,
data_layout='NCHW',
name=None):
name=None,
moving_mean_name=None,
moving_variance_name=None):
"""
This function helps create an operator to implement
the BatchNorm layer using the configurations from the input parameters.
......@@ -1508,6 +1517,7 @@ def batch_norm(input,
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_global_variable(
name=moving_mean_name,
dtype=input.dtype,
shape=param_shape,
persistable=True,
......@@ -1515,6 +1525,7 @@ def batch_norm(input,
helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
variance = helper.create_global_variable(
name=moving_variance_name,
dtype=input.dtype,
shape=param_shape,
persistable=True,
......
......@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
core.enable_profiler(prof_state)
yield
if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The state must be in 'calls', 'total', "
"'max', 'min', 'ave'")
sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', "
"'max', 'min' and 'ave'")
key_map = {
'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls,
......
......@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod):
return match_indices, match_dist
class TestBipartiteMatchOpForWithLoD(OpTest):
class TestBipartiteMatchOpWithLoD(OpTest):
def setUp(self):
self.op_type = 'bipartite_match'
lod = [[0, 5, 11, 23]]
......@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest):
self.inputs = {'DistMat': (dist, lod)}
self.outputs = {
'ColToRowMatchIndices': (match_indices),
'ColToRowMatchDis': (match_dist),
'ColToRowMatchDist': (match_dist),
}
def test_check_output(self):
......@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
self.inputs = {'DistMat': dist}
self.outputs = {
'ColToRowMatchIndices': match_indices,
'ColToRowMatchDis': match_dist,
'ColToRowMatchDist': match_dist,
}
def test_check_output(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
prior_box_x = (
(prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
prior_box_y = (
(prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
prior_box_width = (
(prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
prior_box_height = (
(prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
prior_box_var.shape[1])
if (code_type == "EncodeCenterSize"):
target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
target_box.shape[0], 1)
target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
target_box.shape[0], 1)
target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
target_box.shape[0], 1)
target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
target_box.shape[0], 1)
output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
prior_box_var[:,:,0]
output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
prior_box_var[:,:,1]
output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
prior_box_var[:,:,2]
output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
prior_box_var[:,:,3]
elif (code_type == "DecodeCenterSize"):
target_box = target_box.reshape(target_box.shape[0], 1,
target_box.shape[1])
target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
prior_box_width + prior_box_x
target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
prior_box_height + prior_box_y
target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
prior_box_width
target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
prior_box_height
output_box[:, :, 0] = target_box_x - target_box_width / 2
output_box[:, :, 1] = target_box_y - target_box_height / 2
output_box[:, :, 2] = target_box_x + target_box_width / 2
output_box[:, :, 3] = target_box_y + target_box_height / 2
def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
n = target_box.shape[0]
m = prior_box.shape[0]
output_box = np.zeros((n, m, 4), dtype=np.float32)
for i in range(len(lod) - 1):
box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var,
output_box[lod[i]:lod[i + 1], :, :], code_type)
return output_box
class TestBoxCoderOp(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "box_coder"
lod = [[0, 20]]
prior_box = np.random.random((10, 4)).astype('float32')
prior_box_var = np.random.random((10, 4)).astype('float32')
target_box = np.random.random((20, 4)).astype('float32')
code_type = "DecodeCenterSize"
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': target_box,
}
self.attrs = {'code_type': 'decode_center_size'}
self.outputs = {'OutputBox': output_box}
class TestBoxCoderOpWithLoD(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "box_coder"
lod = [[0, 4, 12, 20]]
prior_box = np.random.random((10, 4)).astype('float32')
prior_box_var = np.random.random((10, 4)).astype('float32')
target_box = np.random.random((20, 4)).astype('float32')
code_type = "EncodeCenterSize"
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': (target_box, lod),
}
self.attrs = {'code_type': 'encode_center_size'}
self.outputs = {'OutputBox': output_box}
if __name__ == '__main__':
unittest.main()
......@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1):
self.op_type = "conv2d"
class TestDepthwiseConv(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
# cudnn v5 does not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation):
# def init_op_type(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
class TestMineHardExamplesOp(OpTest):
def set_data(self):
self.init_test_data()
self.inputs = {
'ClsLoss': self.cls_loss,
'LocLoss': self.loc_loss,
'MatchIndices': self.match_indices,
'MatchDist': self.match_dis
}
self.attrs = {
'neg_pos_ratio': self.neg_pos_ratio,
'neg_overlap': self.neg_overlap,
'sample_size': self.sample_size,
'mining_type': self.mining_type
}
self.outputs = {
'NegIndices': (self.neg_indices, self.neg_indices_lod),
'UpdatedMatchIndices': self.updated_match_indices
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
return
def setUp(self):
self.op_type = "mine_hard_examples"
self.set_data()
def init_test_data(self):
self.neg_pos_ratio = 1.0
self.neg_overlap = 0.5
self.sample_size = 0
self.mining_type = "max_negative"
self.cls_loss = np.array([[0.1, 0.1, 0.3],
[0.3, 0.1, 0.1]]).astype('float32')
self.loc_loss = np.array([[0.1, 0.2, 0.3],
[0.3, 0.4, 0.1]]).astype('float32')
self.match_dis = np.array([[0.2, 0.4, 0.8],
[0.1, 0.9, 0.3]]).astype('float32')
self.match_indices = np.array([[0, -1, -1],
[-1, 0, -1]]).astype('int32')
self.updated_match_indices = self.match_indices
self.neg_indices_lod = [[0, 1, 2]]
self.neg_indices = np.array([[1], [0]]).astype('int32')
class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
def init_test_data(self):
super(TestMineHardExamplesOpHardExample, self).init_test_data()
self.mining_type = "hard_example"
self.sample_size = 2
self.cls_loss = np.array([[0.5, 0.1, 0.3],
[0.3, 0.1, 0.1]]).astype('float32')
self.loc_loss = np.array([[0.2, 0.2, 0.3],
[0.3, 0.1, 0.2]]).astype('float32')
self.match_indices = np.array([[0, -1, -1],
[-1, 0, -1]]).astype('int32')
self.updated_match_indices = np.array([[0, -1, -1],
[-1, -1, -1]]).astype('int32')
self.neg_indices_lod = [[0, 1, 3]]
self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import unittest
import numpy as np
import copy
from op_test import OpTest
def iou(box_a, box_b):
"""Apply intersection-over-union overlap between box_a and box_b
"""
xmin_a = min(box_a[0], box_a[2])
ymin_a = min(box_a[1], box_a[3])
xmax_a = max(box_a[0], box_a[2])
ymax_a = max(box_a[1], box_a[3])
xmin_b = min(box_b[0], box_b[2])
ymin_b = min(box_b[1], box_b[3])
xmax_b = max(box_b[0], box_b[2])
ymax_b = max(box_b[1], box_b[3])
area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
if area_a <= 0 and area_b <= 0:
return 0.0
xa = max(xmin_a, xmin_b)
ya = max(ymin_a, ymin_b)
xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
iou_ratio = inter_area / (area_a + area_b - inter_area)
return iou_ratio
def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
"""Apply non-maximum suppression at test time to avoid detecting too many
overlapping bounding boxes for a given object.
Args:
boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
scores: (tensor) The class predscores for the img, Shape:[num_priors].
score_threshold: (float) The confidence thresh for filtering low
confidence boxes.
nms_threshold: (float) The overlap thresh for suppressing unnecessary
boxes.
top_k: (int) The maximum number of box preds to consider.
eta: (float) The parameter for adaptive NMS.
Return:
The indices of the kept boxes with respect to num_priors.
"""
all_scores = copy.deepcopy(scores)
all_scores = all_scores.flatten()
selected_indices = np.argwhere(all_scores > score_threshold)
selected_indices = selected_indices.flatten()
all_scores = all_scores[selected_indices]
sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
sorted_scores = all_scores[sorted_indices]
if top_k > -1 and top_k < sorted_indices.shape[0]:
sorted_indices = sorted_indices[:top_k]
sorted_scores = sorted_scores[:top_k]
selected_indices = []
adaptive_threshold = nms_threshold
for i in range(sorted_scores.shape[0]):
idx = sorted_indices[i]
keep = True
for k in range(len(selected_indices)):
if keep:
kept_idx = selected_indices[k]
overlap = iou(boxes[idx], boxes[kept_idx])
keep = True if overlap <= adaptive_threshold else False
else:
break
if keep:
selected_indices.append(idx)
if keep and eta < 1 and adaptive_threshold > 0.5:
adaptive_threshold *= eta
return selected_indices
def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
nms_top_k, keep_top_k):
class_num = scores.shape[0]
priorbox_num = scores.shape[1]
selected_indices = {}
num_det = 0
for c in range(class_num):
if c == background: continue
indices = nms(boxes, scores[c], score_threshold, nms_threshold,
nms_top_k)
selected_indices[c] = indices
num_det += len(indices)
if keep_top_k > -1 and num_det > keep_top_k:
score_index = []
for c, indices in selected_indices.iteritems():
for idx in indices:
score_index.append((scores[c][idx], c, idx))
sorted_score_index = sorted(
score_index, key=lambda tup: tup[0], reverse=True)
sorted_score_index = sorted_score_index[:keep_top_k]
selected_indices = {}
for _, c, _ in sorted_score_index:
selected_indices[c] = []
for s, c, idx in sorted_score_index:
selected_indices[c].append(idx)
num_det = keep_top_k
return selected_indices, num_det
def batched_multiclass_nms(boxes, scores, background, score_threshold,
nms_threshold, nms_top_k, keep_top_k):
batch_size = scores.shape[0]
det_outs = []
lod = [0]
for n in range(batch_size):
nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
score_threshold, nms_threshold,
nms_top_k, keep_top_k)
lod.append(lod[-1] + nmsed_num)
if nmsed_num == 0: continue
for c, indices in nmsed_outs.iteritems():
for idx in indices:
xmin, ymin, xmax, ymax = boxes[idx][:]
det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
return det_outs, lod
class TestMulticlassNMSOp(OpTest):
def set_argument(self):
self.score_threshold = 0.01
def setUp(self):
self.set_argument()
N = 7
M = 1200
C = 21
BOX_SIZE = 4
background = 0
nms_threshold = 0.3
nms_top_k = 400
keep_top_k = 200
score_threshold = self.score_threshold
scores = np.random.random((N * M, C)).astype('float32')
def softmax(x):
shiftx = x - np.max(x).clip(-64.)
exps = np.exp(shiftx)
return exps / np.sum(exps)
scores = np.apply_along_axis(softmax, 1, scores)
scores = np.reshape(scores, (N, M, C))
scores = np.transpose(scores, (0, 2, 1))
boxes = np.random.random((M, BOX_SIZE)).astype('float32')
boxes[:, 0:2] = boxes[:, 0:2] * 0.5
boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
score_threshold, nms_threshold,
nms_top_k, keep_top_k)
nmsed_outs = [-1] if not nmsed_outs else nmsed_outs
nmsed_outs = np.array(nmsed_outs).astype('float32')
self.op_type = 'multiclass_nms'
self.inputs = {'BBoxes': boxes, 'Scores': scores}
self.outputs = {'Out': (nmsed_outs, [lod])}
self.attrs = {
'background_label': 0,
'nms_threshold': nms_threshold,
'nms_top_k': nms_top_k,
'keep_top_k': keep_top_k,
'score_threshold': score_threshold,
'nms_eta': 1.0,
}
def test_check_output(self):
self.check_output()
class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
def set_argument(self):
# Here set 2.0 to test the case there is no outputs.
# In practical use, 0.0 < score_threshold < 1.0
self.score_threshold = 2.0
class TestIOU(unittest.TestCase):
def test_iou(self):
box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
expt_output = np.array([2.0 / 16.0]).astype('float32')
calc_output = np.array([iou(box1, box2)]).astype('float32')
self.assertTrue(np.allclose(calc_output, expt_output))
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册