提交 3074ae7b 编写于 作者: X xzl

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_depthwiseConv_op_gpu

#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04

# The CUDA runtime base image is plain Ubuntu 16.04: it ships neither python
# nor pip, so both must be installed before any `pip install` can run.
# `apt-get update` and `install` share one layer so a cached, stale package
# index is never reused, and the index is removed in the same layer.
RUN apt-get update && apt-get install -y \
        iputils-ping \
        libgtk2.0-dev \
        python \
        python-pip \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir -U kubernetes opencv-python

# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
#       so we must build one with distribute support to install in this image.
# The public wheel is installed only long enough to download the cifar
# dataset into the image; it is removed in the same layer so it does not
# linger in the image history.
RUN pip install --no-cache-dir paddlepaddle \
    && python -c 'import paddle.v2 as paddle; paddle.dataset.cifar.train10()' \
    && pip uninstall -y paddlepaddle

# below lines may change a lot for debugging
# ADD is kept here because the sources are remote URLs; the trailing slash
# makes the destination unambiguously a directory (file name taken from URL).
# NOTE(review): these track the moving `develop` branch; consider pinning a
# commit and adding a checksum for reproducible builds.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin/
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root/

# Locally built wheel with distributed support (see the README steps).
COPY *.whl /
RUN pip install --no-cache-dir /*.whl && rm -f /*.whl && \
    chmod +x /usr/bin/paddle_k8s

ENV LD_LIBRARY_PATH=/usr/local/lib
COPY vgg16_fluid.py vgg16_v2.py /workspace/
# Performance for Distributed vgg16
## Test Result
### Hardware Information
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Single Node Single Thread
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec
| PServer Count | 3 | 6 |10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
*The performance gap between Fluid and v2 comes from network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so that Kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbose Logs
Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
# Parameter servers for the Fluid VGG16 benchmark job.
# NOTE(review): ReplicaSet under extensions/v1beta1 is no longer served by
# newer Kubernetes releases; confirm the target cluster version before reuse.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16job-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16job
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: MKL_NUM_THREADS
          value: "1"
        - name: TRAINING_ROLE
          value: "PSERVER"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        # vgg16_fluid.py builds this pserver's own endpoint from POD_IP.
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        command: ["paddle_k8s", "start_fluid"]
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
# Trainers for the Fluid VGG16 benchmark job.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-trainer
spec:
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_fluid"]
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: TRAINING_ROLE
          value: "TRAINER"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        # vgg16_fluid.py reads POD_IP in distributed mode for every role.
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
# Parameter servers for the PaddlePaddle v2 VGG16 benchmark job.
# NOTE(review): ReplicaSet under extensions/v1beta1 is no longer served by
# newer Kubernetes releases; confirm the target cluster version before reuse.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16v2job-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16v2job
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "python train.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        command: ["paddle_k8s", "start_pserver"]
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
# Trainers for the PaddlePaddle v2 VGG16 benchmark job.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16v2job-trainer
spec:
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16v2job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_trainer", "v2"]
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        # vgg16_v2.py reads BATCH_SIZE and TRAINERS from the environment.
        - name: BATCH_SIZE
          value: "256"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "2"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.profiler as profiler
import argparse
import functools
import os
def str2bool(v):
    """Convert a command-line style truth value to ``bool``.

    Intended as an ``argparse`` ``type=`` callable.

    Args:
        v: a string such as 'yes', 'true', 't', 'y', '1' (true) or
            'no', 'false', 'f', 'n', '0' (false). Matching ignores case and
            surrounding whitespace. A ``bool`` is returned unchanged so the
            function can also be applied to an already-typed default.

    Returns:
        The corresponding boolean.

    Raises:
        argparse.ArgumentTypeError: if ``v`` is not a recognised value.
    """
    if isinstance(v, bool):
        return v
    # Normalise once instead of lowercasing per comparison.
    text = str(v).strip().lower()
    if text in ('yes', 'true', 't', 'y', '1'):
        return True
    if text in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)

# (flag, add_argument options) pairs. They are registered in this order so
# the generated --help output lists the options in the same sequence.
_ARGUMENT_SPECS = [
    ('--batch_size',
     dict(type=int, default=128, help="Batch size for training.")),
    ('--learning_rate',
     dict(type=float, default=1e-3, help="Learning rate for training.")),
    ('--num_passes', dict(type=int, default=50, help="No. of passes.")),
    ('--device',
     dict(type=str,
          default='CPU',
          choices=['CPU', 'GPU'],
          help="The device type.")),
    ('--device_id', dict(type=int, default=0, help="The device id.")),
    ('--data_format',
     dict(type=str,
          default='NCHW',
          choices=['NCHW', 'NHWC'],
          help='The data order, now only support NCHW.')),
    ('--data_set',
     dict(type=str,
          default='cifar10',
          choices=['cifar10', 'flowers'],
          help='Optional dataset for benchmark.')),
    ('--local',
     dict(type=str2bool, default=True, help='Whether to run as local mode.')),
]
for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)

# Parsed once at import time; the rest of the script reads this module global.
args = parser.parse_args()
def vgg16_bn_drop(input):
    """Stack the VGG16 feature extractor with batch norm and dropout.

    Five ``img_conv_group`` blocks are followed by dropout, a 512-unit fc
    layer, batch norm with relu, dropout, and a second 512-unit fc layer.

    Args:
        input: the image variable the first convolution block consumes.

    Returns:
        The output variable of the second fully connected layer.
    """

    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    # (filters, convolutions in the group, per-convolution dropout rates)
    block_specs = [
        (64, 2, [0.3, 0]),
        (128, 2, [0.4, 0]),
        (256, 3, [0.4, 0.4, 0]),
        (512, 3, [0.4, 0.4, 0]),
        (512, 3, [0.4, 0.4, 0]),
    ]
    features = input
    for num_filter, groups, dropouts in block_specs:
        features = conv_block(features, num_filter, groups, dropouts)

    dropped = fluid.layers.dropout(x=features, dropout_prob=0.5)
    hidden = fluid.layers.fc(input=dropped, size=512, act=None)
    normed = fluid.layers.batch_norm(input=hidden, act='relu')
    dropped_again = fluid.layers.dropout(x=normed, dropout_prob=0.5)
    return fluid.layers.fc(input=dropped_again, size=512, act=None)
def main():
    """Build the VGG16 benchmark program and run it locally or distributed.

    With ``--local`` true the model is trained in this process. Otherwise the
    program is transpiled with ``fluid.DistributeTranspiler`` and this process
    acts as a parameter server or a trainer, chosen by ``TRAINING_ROLE``.

    Environment variables read in distributed mode:
        PADDLE_INIT_PSERVERS: comma-separated parameter server IPs.
        TRAINERS: total trainer count.
        POD_IP: IP of this process, used to build its own endpoint.
        TRAINING_ROLE: "PSERVER" or "TRAINER" (defaults to "TRAINER").

    Exits with status 1 when a required variable is missing or the role is
    not recognised.
    """
    # Port appended to every parameter server IP to form its endpoint.
    pserver_port = "6174"

    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        test_target = accuracy.metrics + accuracy.states
        inference_program = fluid.io.get_inference_program(test_target)

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
        args.device_id)
    exe = fluid.Executor(place)

    def make_readers():
        """Return (train_reader, test_reader) batched for the chosen dataset."""
        train_batches = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
                else paddle.dataset.flowers.train(),
                buf_size=5120),
            batch_size=args.batch_size)
        test_batches = paddle.batch(
            paddle.dataset.cifar.test10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
            batch_size=args.batch_size)
        return train_batches, test_batches

    # test
    def test(exe):
        """Run the inference program over the test set; return the accuracy."""
        accuracy.reset(exe)
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            exe.run(inference_program,
                    feed={"pixel": img_data,
                          "label": y_data})

        return accuracy.eval(exe)

    def train_loop(exe, trainer_prog):
        """Train for ``args.num_passes`` passes, printing per-batch and
        per-pass statistics."""
        iters = 0
        ts = time.time()
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
            accuracy.reset(exe)
            with profiler.profiler("CPU", 'total') as prof:
                for batch_id, data in enumerate(train_reader()):
                    ts = time.time()
                    img_data = np.array(
                        map(lambda x: x[0].reshape(data_shape), data)).astype(
                            "float32")
                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                    y_data = y_data.reshape([-1, 1])

                    loss, acc = exe.run(
                        trainer_prog,
                        feed={"pixel": img_data,
                              "label": y_data},
                        fetch_list=[avg_cost] + accuracy.metrics)
                    iters += 1
                    num_samples += len(data)
                    print(
                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
                        % (pass_id, iters, loss, acc, time.time() - ts)
                    )  # The accuracy is the accumulation of batches, but not the current batch.

            pass_elapsed = time.time() - start_time
            pass_train_acc = accuracy.eval(exe)
            pass_test_acc = test(exe)
            print(
                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
                   pass_test_acc))

    if args.local:
        # Parameter initialization
        exe.run(fluid.default_startup_program())

        # data reader
        train_reader, test_reader = make_readers()
        train_loop(exe, fluid.default_main_program())
    else:
        # Fail early with a clear message instead of a TypeError or
        # AttributeError when one of the required variables is not set.
        required_envs = ("PADDLE_INIT_PSERVERS", "TRAINERS", "POD_IP")
        missing_envs = [name for name in required_envs if not os.getenv(name)]
        if missing_envs:
            print("need env " + ", ".join(missing_envs))
            sys.exit(1)

        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, pserver_port]))
        pserver_endpoints = ",".join(eplist)
        print("pserver endpoints: ", pserver_endpoints)
        trainers = int(os.getenv("TRAINERS"))  # total trainer count
        print("trainers total: ", trainers)
        current_endpoint = os.getenv(
            "POD_IP") + ":" + pserver_port  # current pserver endpoint
        training_role = os.getenv(
            "TRAINING_ROLE",
            "TRAINER")  # get the training role: trainer/pserver
        t = fluid.DistributeTranspiler()
        t.transpile(
            optimize_ops,
            params_grads,
            pservers=pserver_endpoints,
            trainers=trainers)

        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            print("starting server side startup")
            exe.run(pserver_startup)
            print("starting parameter server...")
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            # Parameter initialization
            exe.run(fluid.default_startup_program())

            # data reader
            train_reader, test_reader = make_readers()

            trainer_prog = t.get_trainer_program()
            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
            # NOTE(review): the startup program already ran above; this second
            # run looks redundant but is kept to preserve existing behaviour.
            exe.run(fluid.default_startup_program())
            train_loop(exe, trainer_prog)
        else:
            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
            # A misconfigured role must not look like a successful run.
            sys.exit(1)
def print_arguments():
    """Print every parsed command-line argument, sorted by argument name."""
    print('----------- Configuration Arguments -----------')
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
# Script entry point: echo the parsed configuration, then run the benchmark.
if __name__ == "__main__":
    print_arguments()
    main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
# Flattened size of one 3-channel 32x32 image.
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10

# Mini-batch size, overridable through the BATCH_SIZE environment variable;
# an unset or empty variable falls back to 128.
BATCH_SIZE = int(os.getenv("BATCH_SIZE") or 128)
# Single-argument parenthesised print emits the same text on Python 2 and 3.
print("batch_size %d" % BATCH_SIZE)

# Total trainer count; main() divides the learning rate by it.
_trainers = os.getenv("TRAINERS")
if not _trainers:
    raise RuntimeError(
        "environment variable TRAINERS (total trainer count) must be set")
NODE_COUNT = int(_trainers)

# Start time of the current iteration, updated by the event handler.
ts = 0
def vgg(input, nums, class_dim):
    """Build a VGG-style classifier with the v2 layer API.

    Args:
        input: the image data layer.
        nums: five integers, the number of convolutions in each block.
        class_dim: size of the final softmax layer.

    Returns:
        The softmax output layer.
    """

    def conv_block(input, num_filter, groups, num_channels=None):
        return paddle.networks.img_conv_group(
            input=input,
            num_channels=num_channels,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act=paddle.activation.Relu(),
            pool_type=paddle.pooling.Max())

    assert len(nums) == 5
    block_filters = [64, 128, 256, 512, 512]
    # the channel of input feature is 3; later blocks leave it unspecified
    block_channels = [3, None, None, None, None]

    features = input
    for num_filter, groups, num_channels in zip(block_filters, nums,
                                                block_channels):
        features = conv_block(features, num_filter, groups, num_channels)

    fc_dim = 512
    hidden1 = paddle.layer.fc(input=features,
                              size=fc_dim,
                              act=paddle.activation.Relu(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5))
    hidden2 = paddle.layer.fc(input=hidden1,
                              size=fc_dim,
                              act=paddle.activation.Relu(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5))
    return paddle.layer.fc(input=hidden2,
                           size=class_dim,
                           act=paddle.activation.Softmax())
def vgg13(input, class_dim):
    """VGG with two convolutions in each of the five blocks."""
    return vgg(input, [2, 2, 2, 2, 2], class_dim)
def vgg16(input, class_dim):
    """VGG with 2, 2, 3, 3 and 3 convolutions in its five blocks."""
    return vgg(input, [2, 2, 3, 3, 3], class_dim)
def vgg19(input, class_dim):
    """VGG with 2, 2, 4, 4 and 4 convolutions in its five blocks."""
    return vgg(input, [2, 2, 4, 4, 4], class_dim)
def main():
    """Train VGG16 on cifar10 with the v2 API in non-local mode.

    Builds the network and a Momentum optimizer, then runs an SGD trainer
    with ``is_local=False`` for 200 passes, printing per-batch timings and
    testing at the end of every pass.
    """
    global ts
    paddle.init(use_gpu=False)

    image = paddle.layer.data(
        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
    lbl = paddle.layer.data(
        name="label", type=paddle.data_type.integer_value(CLASS_DIM))

    extra_layers = None
    # NOTE: for v2 distributed training need averaging updates.
    learning_rate = 1e-3 / NODE_COUNT
    out = vgg16(image, class_dim=CLASS_DIM)
    cost = paddle.layer.classification_cost(input=out, label=lbl)

    # Create parameters
    parameters = paddle.parameters.create(cost)

    # Create optimizer
    l2_regularization = paddle.optimizer.L2Regularization(rate=0.0005 *
                                                          BATCH_SIZE)
    optimizer = paddle.optimizer.Momentum(
        momentum=0.9,
        regularization=l2_regularization,
        learning_rate=learning_rate / BATCH_SIZE,
        learning_rate_decay_a=0.1,
        learning_rate_decay_b=128000 * 35,
        learning_rate_schedule="discexp")

    # To use other data, replace cifar.train10() with:
    #   reader.train_reader('train.list')
    shuffled_train = paddle.reader.shuffle(cifar.train10(), buf_size=1000)
    train_reader = paddle.batch(shuffled_train, batch_size=BATCH_SIZE)

    # To use other data, replace cifar.test10() with:
    #   reader.test_reader('val.list')
    test_reader = paddle.batch(cifar.test10(), batch_size=BATCH_SIZE)

    # Create trainer
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 extra_layers=extra_layers,
                                 is_local=False)

    # End batch and end pass event handler
    def event_handler(event):
        global ts, ts_pass
        if isinstance(event, paddle.event.BeginPass):
            ts_pass = time.time()
        elif isinstance(event, paddle.event.BeginIteration):
            ts = time.time()
        elif isinstance(event, paddle.event.EndIteration):
            # Modulus of 1 reports every batch.
            if event.batch_id % 1 == 0:
                print("\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    time.time() - ts))
        elif isinstance(event, paddle.event.EndPass):
            print("Pass %d end, spent: %f" % (event.pass_id,
                                              time.time() - ts_pass))
            result = trainer.test(reader=test_reader)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))

    trainer.train(
        reader=train_reader, num_passes=200, event_handler=event_handler)
# Script entry point: run the v2 VGG16 benchmark.
if __name__ == '__main__':
    main()
...@@ -87,6 +87,11 @@ roi_pool ...@@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool .. autoclass:: paddle.v2.layer.roi_pool
:noindex: :noindex:
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
Norm Layer Norm Layer
========== ==========
...@@ -133,6 +138,11 @@ grumemory ...@@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory .. autoclass:: paddle.v2.layer.grumemory
:noindex: :noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Recurrent Layer Group Recurrent Layer Group
===================== =====================
...@@ -340,6 +350,11 @@ bilinear_interp ...@@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp
:noindex: :noindex:
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
dot_prod dot_prod
--------- ---------
.. autoclass:: paddle.v2.layer.dot_prod .. autoclass:: paddle.v2.layer.dot_prod
...@@ -402,6 +417,11 @@ scale_shift ...@@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift .. autoclass:: paddle.v2.layer.scale_shift
:noindex: :noindex:
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Sampling Layers Sampling Layers
=============== ===============
...@@ -420,22 +440,6 @@ multiplex ...@@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex .. autoclass:: paddle.v2.layer.multiplex
:noindex: :noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
.. _api_v2.layer_costs: .. _api_v2.layer_costs:
Cost Layers Cost Layers
...@@ -526,6 +530,11 @@ multibox_loss ...@@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss .. autoclass:: paddle.v2.layer.multibox_loss
:noindex: :noindex:
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
Check Layer Check Layer
============ ============
...@@ -534,31 +543,10 @@ eos ...@@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos .. autoclass:: paddle.v2.layer.eos
:noindex: :noindex:
Miscs Activation
===== ==========
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
Activation with learnable parameter
===================================
prelu prelu
-------- --------
.. autoclass:: paddle.v2.layer.prelu .. autoclass:: paddle.v2.layer.prelu
:noindex: :noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
...@@ -73,3 +73,10 @@ wmt14 ...@@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14 .. automodule:: paddle.v2.dataset.wmt14
:members: :members:
:noindex: :noindex:
wmt16
+++++
.. automodule:: paddle.v2.dataset.wmt16
:members:
:noindex:
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
    with switch.case(fluid.less_equal(a, 10)):
        fluid.print("Case 1")
    with switch.case(fluid.larger(a, 0)):
        fluid.print("Case 2")
    with switch.default():
        fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a scalar boolean value. This differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
1. It runs the first matched case; if no case matches, it runs the default case, if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
The above program should print and print only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than the backward pass of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
...@@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) { ...@@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) {
{ {
// MakeChannel should return a buffered channel is buffer_size > 0. // MakeChannel should return a buffered channel is buffer_size > 0.
auto ch = MakeChannel<int>(10); auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr); EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr); EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch); CloseChannel(ch);
delete ch; delete ch;
} }
{ {
// MakeChannel should return an un-buffered channel is buffer_size = 0. // MakeChannel should return an un-buffered channel is buffer_size = 0.
auto ch = MakeChannel<int>(0); auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr); EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr); EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch); CloseChannel(ch);
delete ch; delete ch;
} }
...@@ -78,3 +78,132 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -78,3 +78,132 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
t.join(); t.join();
delete ch; delete ch;
} }
// Sends 0..4 through an unbuffered channel from a helper thread and checks
// that the main thread receives the values in the same order.
TEST(Channel, SimpleUnbufferedChannelTest) {
  auto ch = MakeChannel<int>(0);
  unsigned sum_send = 0;
  std::thread sender([&]() {
    for (int value = 0; value < 5; value++) {
      ch->Send(&value);
      sum_send += value;
    }
  });
  for (int expected = 0; expected < 5; expected++) {
    int received;
    ch->Receive(&received);
    EXPECT_EQ(received, expected);
  }

  CloseChannel(ch);
  sender.join();
  // 0 + 1 + 2 + 3 + 4: every send completed.
  EXPECT_EQ(sum_send, 10U);
  delete ch;
}
// This tests that closing an unbuffered channel also
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
  auto ch = MakeChannel<int>(0);
  // Must be a compile-time constant: it sizes the arrays below, and
  // variable-length arrays are a compiler extension, not standard C++.
  const size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];

  // Launches threads that try to read and are blocked because of no writers
  for (size_t i = 0; i < num_threads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
          int data;
          ch->Receive(&data);
          *p = true;
        },
        &thread_ended[i]);
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec

  // Verify that all the threads are blocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }

  // Explicitly close the channel
  // This should unblock all receivers
  CloseChannel(ch);

  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec

  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

  for (size_t i = 0; i < num_threads; i++) t[i].join();
  delete ch;
}
// This tests that closing an unbuffered channel also
// unblocks any senders waiting for receivers
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
  auto ch = MakeChannel<int>(0);
  // Must be a compile-time constant: it sizes the arrays below, and
  // variable-length arrays are a compiler extension, not standard C++.
  const size_t num_threads = 5;
  std::thread t[num_threads];
  bool thread_ended[num_threads];

  // Launches threads that try to write and are blocked because of no readers
  for (size_t i = 0; i < num_threads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
          int data = 10;
          ch->Send(&data);
          *p = true;
        },
        &thread_ended[i]);
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec

  // Verify that all the threads are blocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }

  // Explicitly close the channel
  // This should unblock all senders
  CloseChannel(ch);

  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec

  // Verify that all threads got unblocked
  for (size_t i = 0; i < num_threads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

  for (size_t i = 0; i < num_threads; i++) t[i].join();
  delete ch;
}
// Checks that a sender which attempts more sends than there are receives
// is still inside its last Send when the receiver stops, and that closing
// the channel lets the sender thread finish.
TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
  auto ch = MakeChannel<int>(0);
  unsigned sum_send = 0;
  // Send should block after three iterations
  // since we only have three receivers.
  std::thread t([&]() {
    // Try to send more number of times
    // than receivers
    for (int i = 0; i < 4; i++) {
      ch->Send(&i);
      sum_send += i;
    }
  });
  for (int i = 0; i < 3; i++) {
    int recv;
    ch->Receive(&recv);
    EXPECT_EQ(recv, i);
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
  // Only the first three sends (0 + 1 + 2) have completed at this point.
  EXPECT_EQ(sum_send, 3U);

  // Closing releases the sender from its fourth Send so it can be joined.
  CloseChannel(ch);
  t.join();
  delete ch;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <atomic>
#include <condition_variable> #include <condition_variable>
#include <deque>
#include <mutex> #include <mutex>
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
...@@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel<T> { ...@@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel<T> {
virtual ~UnBuffered(); virtual ~UnBuffered();
private: private:
UnBuffered() {} std::mutex mu_ch_;
// Mutex for readers and writers who are waiting for other reader
// and writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_;
T* item{nullptr};
std::atomic<bool> closed_{false};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
}; };
// This function implements the concept of how data should
// be sent from a writer to a reader.
//
// The writer waits until a reader has announced itself (or the channel is
// closed), publishes the pointer `data` through `item`, and then waits until
// `item` is nullptr again or the channel is closed. If the channel is closed
// before a reader arrives, nothing is handed off. The function returns void,
// so the caller cannot tell from it whether the value was delivered.
template <typename T>
void UnBuffered<T>::Send(T* data) {
  // Prevent other writers from entering
  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
  writer_found_ = true;
  // Second lock on the same recursive mutex, used only as the lock argument
  // of the wait below. Waiting releases this lock object's hold; writer_lock
  // above still holds mu_write_, so other writers stay excluded meanwhile.
  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
  // If writer comes first, it should wait till a reader arrives
  // NOTE(review): Receive sets reader_found_ and notifies cv_writer_ without
  // holding mu_write_, so a notification arriving between this predicate
  // check and the actual block looks like it could be missed; confirm.
  cv_writer_.wait(cv_lock,
                  [this]() { return reader_found_ == true || closed_; });
  // Wake a reader that may be waiting for writer_found_ to become true.
  cv_reader_.notify_one();
  if (!closed_) {
    std::unique_lock<std::mutex> channel_lock(mu_ch_);
    // Publish the pointer; the reader moves the value out of *item.
    item = data;
    // Release mu_ch_ before notifying so the woken reader can acquire it.
    channel_lock.unlock();
    cv_channel_.notify_one();
    channel_lock.lock();
    // Block until the reader has reset item to nullptr, or until Close().
    cv_channel_.wait(channel_lock,
                     [this]() { return item == nullptr || closed_; });
  }
  writer_found_ = false;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T> template <typename T>
void UnBuffered<T>::Send(T* channel_element) {} void UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
}
cv_channel_.notify_one();
}
reader_found_ = false;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T> template <typename T>
void UnBuffered<T>::Receive(T*) {} void UnBuffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T> template <typename T>
void UnBuffered<T>::Close() {} UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T> template <typename T>
UnBuffered<T>::~UnBuffered() {} void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasOutputs(const std::string &name) const override; bool HasOutputs(const std::string &name) const override;
DDim GetInputDim(const std::string &name) const override;
void SetOutputDim(const std::string &name, const DDim &dim) override;
AttrReader Attrs() const override; AttrReader Attrs() const override;
const std::vector<std::string> &Inputs( const std::vector<std::string> &Inputs(
...@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { ...@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
return true; return true;
} }
DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
std::vector<DDim> ddims = GetInputsDim(name);
auto length = ddims.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have 1 value, "
"but it has %d now",
name, length);
return ddims[0];
}
void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
const DDim &dim) {
SetOutputsDim(name, {dim});
}
AttrReader CompileTimeInferShapeContext::Attrs() const { AttrReader CompileTimeInferShapeContext::Attrs() const {
return AttrReader(op_.GetAttrMap()); return AttrReader(op_.GetAttrMap());
} }
......
...@@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
return true; return true;
} }
DDim GetInputDim(const std::string& name) const override {
return GetDim(op_.Input(name));
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
SetDim(op_.Output(name), dim);
}
AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector<std::string>& Inputs( const std::vector<std::string>& Inputs(
......
...@@ -18,10 +18,18 @@ limitations under the License. */ ...@@ -18,10 +18,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::vector<framework::DDim> InferShapeContext::GetInputsDim( DDim InferShapeContext::GetInputDim(const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, arg_names.size());
return this->GetDim(arg_names[0]);
}
std::vector<DDim> InferShapeContext::GetInputsDim(
const std::string &name) const { const std::string &name) const {
const std::vector<std::string> &names = Inputs(name); const std::vector<std::string> &arg_names = Inputs(name);
return GetDims(names); return GetDims(arg_names);
} }
DDim InferShapeContext::GetInputsElementDim(const std::string &name, DDim InferShapeContext::GetInputsElementDim(const std::string &name,
...@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name, ...@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
return this->GetDim(names[idx]); return this->GetDim(names[idx]);
} }
void InferShapeContext::SetOutputsDim( void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
const std::string &name, const std::vector<framework::DDim> &dims) { auto &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, arg_names.size());
SetDim(arg_names[0], dim);
}
void InferShapeContext::SetOutputsDim(const std::string &name,
const std::vector<DDim> &dims) {
auto &names = Outputs(name); auto &names = Outputs(name);
SetDims(names, dims); SetDims(names, dims);
} }
std::vector<framework::DDim> InferShapeContext::GetDims( std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const { const std::vector<std::string> &names) const {
std::vector<framework::DDim> ret; std::vector<DDim> ret;
ret.reserve(names.size()); ret.reserve(names.size());
std::transform( std::transform(
names.begin(), names.end(), std::back_inserter(ret), names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); }); [this](const std::string &name) { return this->GetDim(name); });
return ret; return ret;
} }
void InferShapeContext::SetDims(const std::vector<std::string> &names, void InferShapeContext::SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims) { const std::vector<DDim> &dims) {
size_t length = names.size(); size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size()); PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
......
...@@ -35,14 +35,13 @@ class InferShapeContext { ...@@ -35,14 +35,13 @@ class InferShapeContext {
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
virtual framework::DDim GetInputDim(const std::string &name) const = 0; DDim GetInputDim(const std::string &name) const;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const; std::vector<DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const; DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputDim(const std::string &name, const DDim &dim);
void SetOutputsDim(const std::string &name, void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
const std::vector<framework::DDim> &dims);
virtual AttrReader Attrs() const = 0; virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs( virtual const std::vector<std::string> &Inputs(
...@@ -57,15 +56,13 @@ class InferShapeContext { ...@@ -57,15 +56,13 @@ class InferShapeContext {
// Note: In while op, we need this to be public // Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names, void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims); const std::vector<DDim> &dims);
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const DDim &dim) = 0;
std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarDesc::VarType> GetVarTypes( std::vector<proto::VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
......
...@@ -178,19 +178,22 @@ public: ...@@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data<real>(); real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>(); real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>(); real* outputData = outputs[0].data<real>();
real* colData = NULL;
bool needIm2col = isNeedIm2col(filter); bool needIm2col = isNeedIm2col(filter);
TensorShape imShape = TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth}); TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape; TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; // Max col matrix width 4096, Max col matrix size 4M.
size_t colWidth = outputHeight * outputWidth; size_t outputHeightSteps =
// Max col matrix height 256, Max col matrix width 1024 std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256)); size_t maxColWidth = outputHeightSteps * outputWidth;
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048)); size_t channelSteps =
std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
(size_t)1),
inputChannels / groups_);
size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) { if (needIm2col) {
colShape = TensorShape({inputChannels / groups_, colShape = TensorShape({inputChannels / groups_,
...@@ -199,7 +202,7 @@ public: ...@@ -199,7 +202,7 @@ public:
outputHeight, outputHeight,
outputWidth}); outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real)); resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf()); colData = reinterpret_cast<real*>(memory_->getBuf());
} }
...@@ -209,20 +212,24 @@ public: ...@@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_; size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth; int nStride = outputHeight * outputWidth;
int kStride = colHeight; int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) { for (size_t i = 0; i < batchSize; i++) {
filterData = inputs[1].data<real>();
for (size_t g = 0; g < groups_; g++) { for (size_t g = 0; g < groups_; g++) {
if (needIm2col) { if (needIm2col) {
real beta_ = beta; real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight; for (size_t ic = 0; ic < inputChannels / groups_;
colHeightStart += stepColHeight) { ic += channelSteps) {
for (size_t colWidthStart = 0; colWidthStart < colWidth; int channels = std::min(inputChannels / groups_ - ic, channelSteps);
colWidthStart += stepColWidth) { for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
int N = std::min(colWidth - colWidthStart, stepColWidth); int height = std::min(outputHeight - oh, outputHeightSteps);
int K = std::min(colHeight - colHeightStart, stepColHeight);
int M = outputChannels / groups_;
int N = height * outputWidth;
int K = channels * filterHeight * filterWidth;
// im2col // im2col
im2col(inputData + g * inputOffset, im2col(inputData,
imShape, imShape,
colData, colData,
colShape, colShape,
...@@ -232,13 +239,12 @@ public: ...@@ -232,13 +239,12 @@ public:
paddingW(), paddingW(),
dilationH(), dilationH(),
dilationW(), dilationW(),
colHeightStart, channels,
K, oh,
colWidthStart, height,
N); N);
// gemm // gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute( BlasGemm<Device, real>::compute(
false, false,
false, false,
...@@ -246,12 +252,12 @@ public: ...@@ -246,12 +252,12 @@ public:
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset + colHeightStart, filterData + ic * filterHeight * filterWidth,
kStride, kStride,
colData, colData,
N, N,
beta_, beta_,
outputData + g * outputOffset + colWidthStart, outputData + oh * outputWidth,
nStride); nStride);
} }
beta_ = 1.0; beta_ = 1.0;
...@@ -266,17 +272,18 @@ public: ...@@ -266,17 +272,18 @@ public:
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset, filterData,
K, K,
inputData + g * inputOffset, inputData,
N, N,
beta, beta,
outputData + g * outputOffset, outputData,
N); N);
} }
inputData += inputOffset;
outputData += outputOffset;
filterData += filterOffset;
} }
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
} }
memory_.reset(); memory_.reset();
......
...@@ -111,39 +111,42 @@ public: ...@@ -111,39 +111,42 @@ public:
int paddingWidth, int paddingWidth,
int dilationHeight, int dilationHeight,
int dilationWidth, int dilationWidth,
int colHeightStart, int inputChannels,
int colHeightSize, int colOffset,
int colWidthStart, int colOutputHeight,
int colWidthSize) { int colWidth) {
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
int filterHeight = colShape[1]; int filterHeight = colShape[1];
int filterWidth = colShape[2]; int filterWidth = colShape[2];
int outputWidth = colShape[4]; int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) { for (int ic = 0; ic < inputChannels; ic++) {
int wOffset = (colHeightStart + colh) % filterWidth; for (int oh = 0; oh < colOutputHeight; oh++) {
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; T* dstData = colData + oh * outputWidth;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight; for (int fh = 0; fh < filterHeight; fh++) {
for (int fw = 0; fw < filterWidth; fw++) {
for (int colw = 0; colw < colWidthSize; colw++) { int imRowIdx = (oh + colOffset) * strideHeight +
int h = (colWidthStart + colw) / outputWidth; fh * dilationHeight - paddingHeight;
int w = (colWidthStart + colw) % outputWidth; if (imRowIdx < 0 || imRowIdx >= inputHeight) {
memset(dstData, 0, outputWidth * sizeof(T));
int imRowIdx = h * strideHeight + hOffset * dilationHeight; } else {
int imColIdx = w * strideWidth + wOffset * dilationWidth; for (int ow = 0; ow < outputWidth; ow++) {
if ((imRowIdx - paddingHeight) < 0 || int imColIdx =
(imRowIdx - paddingHeight) >= inputHeight || ow * strideWidth + fw * dilationWidth - paddingWidth;
(imColIdx - paddingWidth) < 0 || if (imColIdx < 0 || imColIdx >= inputWidth) {
(imColIdx - paddingWidth) >= inputWidth) { dstData[ow] = T(0);
colData[colh * colWidthSize + colw] = static_cast<T>(0); } else {
} else { dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
imRowIdx += c_im * inputHeight - paddingHeight; }
imColIdx -= paddingWidth; }
colData[colh * colWidthSize + colw] = }
imData[imRowIdx * inputWidth + imColIdx]; dstData += colWidth;
}
} }
} }
colData += filterHeight * filterWidth * colWidth;
imData += inputHeight * inputWidth;
} }
} }
}; };
......
...@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() { ...@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding, padding,
dilation, dilation,
dilation, dilation,
channels,
0, 0,
height, outputHeight,
0, outputHeight * outputWidth);
width);
autotest::TensorCheckEqual(*output1, *output2); autotest::TensorCheckEqual(*output1, *output2);
} }
......
...@@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp ...@@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model) ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits_mlp_cpu) PROPERTIES DEPENDS test_recognize_digits)
...@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, maskMatP->getWidth()); CHECK_EQ(channels * outLength, maskMatP->getWidth());
} }
/* initialize the data_ */
for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) {
outData[i * outStride + j] = -(real)FLT_MAX;
}
}
/* pool max one by one */ /* pool max one by one */
for (size_t n = 0; n < num; ++n) { // frame by frame for (size_t n = 0; n < num; ++n) { // frame by frame
if (!isContiguous()) { if (!isContiguous()) {
...@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t c = 0; c < channels; ++c) { // channel by channel
for (size_t ph = 0; ph < outputH; ++ph) { for (size_t ph = 0; ph < outputH; ++ph) {
int hstart = ph * strideH - paddingH; int hstart = ph * strideH - paddingH;
int hend = std::min(hstart + sizeY, imgSizeH); int hend = hstart + sizeY;
hstart = std::max(hstart, 0); hstart = hstart < 0 ? 0 : hstart;
hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
for (size_t pw = 0; pw < outputW; ++pw) { for (size_t pw = 0; pw < outputW; ++pw) {
int wstart = pw * strideW - paddingW; int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW); int wend = wstart + sizeX;
wstart = std::max(wstart, 0); wstart = wstart < 0 ? 0 : wstart;
wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) { if (maskData == NULL) {
real tmp = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max( tmp = tmp < inputData[h * imgSizeW + w]
outData[ph * outputW + pw], inputData[h * imgSizeW + w]); ? inputData[h * imgSizeW + w]
: tmp;
} }
} }
outData[ph * outputW + pw] = tmp;
} else { } else {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { ...@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"), PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of BipartiteMatch should not be null."); "Input(DistMat) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchIndices"),
"Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchDist"),
"Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
auto dims = ctx->GetInputDim("DistMat"); auto dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("ColToRowMatchIndices", dims); ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDis", dims); ctx->SetOutputDim("ColToRowMatchDist", dims);
} }
}; };
...@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input<LoDTensor>("DistMat"); auto* dist_mat = context.Input<LoDTensor>("DistMat");
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices"); auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDis"); auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>(); auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
...@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise, it means B[j] is matched to row " "Otherwise, it means B[j] is matched to row "
"ColToRowMatchIndices[i][j] in i-th instance. The row number of " "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
"i-th instance is saved in ColToRowMatchIndices[i][j]."); "i-th instance is saved in ColToRowMatchIndices[i][j].");
AddOutput("ColToRowMatchDis", AddOutput("ColToRowMatchDist",
"(Tensor) A 2-D Tensor with shape [N, M] in float type. " "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
"N is batch size. If ColToRowMatchIndices[i][j] is -1, " "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
"ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchIndices[i][j] = d, and the row offsets of each " "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
"instance are called LoD. Then " "instance are called LoD. Then "
"ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
AddComment(R"DOC( AddComment(R"DOC(
This operator is a greedy bipartite matching algorithm, which is used to This operator is a greedy bipartite matching algorithm, which is used to
obtain the matching with the maximum distance based on the input obtain the matching with the maximum distance based on the input
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
namespace paddle {
namespace operators {
// Operator definition for box_coder: validates the inputs/outputs and
// infers the shape of OutputBox at graph-construction / run time.
class BoxCoderOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  // Checks that PriorBox, PriorBoxVar, TargetBox and OutputBox are present,
  // that PriorBox and TargetBox are rank-2 with 4 columns, that PriorBoxVar
  // has the same dims as PriorBox, and then sets
  //   OutputBox dims = [TargetBox.dims[0], PriorBox.dims[0], 4]
  // and shares TargetBox's LoD with OutputBox.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
                   "Input(PriorBox) of BoxCoderOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
                   "Input(TargetBox) of BoxCoderOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
                   "Output(OutputBox) of BoxCoderOp should not be null.");

    auto prior_box_dims = ctx->GetInputDim("PriorBox");
    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
    auto target_box_dims = ctx->GetInputDim("TargetBox");

    // The check below is on PriorBox, so the message must name PriorBox
    // (it previously named PriorBoxVar, which pointed users at the wrong
    // input).
    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                      "The rank of Input of PriorBox must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
    // PriorBoxVar must match PriorBox element for element.
    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
                      "The rank of Input of TargetBox must be 2");
    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
                      "The shape of TargetBox is [M, 4]");

    // The return value is discarded here. NOTE(review): presumably called
    // only so that an unsupported code_type is rejected early -- the helper
    // is declared in box_coder_op.h, confirm there.
    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));

    ctx->SetOutputDim(
        "OutputBox",
        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
  }
};
// Declares the inputs, outputs, attributes and user-facing documentation
// of the box_coder operator.
class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "PriorBox",
        "(Tensor, default Tensor<float>) "
        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
        "each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the anchor box, "
        "if the input is image feature map, they are close to the origin "
        "of the coordinate system. [xmax, ymax] is the right bottom "
        "coordinate of the anchor box.");
    AddInput("PriorBoxVar",
             "(Tensor, default Tensor<float>) "
             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
             "of variance.");
    AddInput(
        "TargetBox",
        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the box if the input "
        "is image feature map, they are close to the origin of the coordinate "
        "system. [xmax, ymax] is the right bottom coordinate of the box. "
        "This tensor can contain LoD information to represent a batch "
        "of inputs. One instance of this batch can contain different "
        "numbers of entities.");
    // Only the two listed code types are accepted; the default is encoding.
    AddAttr<std::string>("code_type",
                         "(string, default encode_center_size) "
                         "the code type used with the target box")
        .SetDefault("encode_center_size")
        .InEnum({"encode_center_size", "decode_center_size"});
    // The type tag used to be repeated ("(LoDTensor or Tensor) (Tensor)");
    // it is now stated once.
    AddOutput(
        "OutputBox",
        "(LoDTensor or Tensor) "
        "The output of box_coder_op, a tensor with shape [N, M, 4] "
        "representing the result of N target boxes encoded/decoded with "
        "M Prior boxes and variances.");

    // The stray '*' in "tx * + px" / "ty * + py" has been removed from the
    // decoding formulas below.
    // NOTE(review): the remaining "tw / 2" and "th / 2" terms in the decoding
    // schema should be checked against the decode kernels, which emit
    // [center - size / 2, center + size / 2] using the *decoded* width and
    // height.
    AddComment(R"DOC(
Bounding Box Coder Operator.
Encode/Decode the target bounding box with the priorbox information.
The Encoding schema described below:
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = log(abs(tw / pw)) / pwv
oh = log(abs(th / ph)) / phv
The Decoding schema described below:
ox = (pw * pxv * tx + px) - tw / 2
oy = (ph * pyv * ty + py) - th / 2
ow = exp(pwv * tw) * pw + tw / 2
oh = exp(phv * th) * ph + th / 2
where tx, ty, tw, th denote the target box's center coordinates, width and
height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
width and height.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
// Register the box_coder operator (no gradient op) together with its
// float and double CPU kernels.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
ops::BoxCoderKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
// Encodes every (target box, prior box) pair in center-size form.
//
// One thread handles one pair: thread `idx` pairs target box `idx / col`
// with prior box `idx % col` and writes `len`-strided results starting at
// output[idx * len]. Boxes are read as [xmin, ymin, xmax, ymax] from the
// first four entries of each `len`-strided record.
//
//   prior_box_data      prior boxes, indexed by column
//   prior_box_var_data  per-prior variances, same layout as prior_box_data
//   target_box_data     target boxes, indexed by row
//   row, col            number of target / prior boxes
//   len                 stride (in elements) between consecutive boxes
//   output              destination, row * col records of stride `len`
template <typename T>
__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
                                       const T* prior_box_var_data,
                                       const T* target_box_data, const int row,
                                       const int col, const int len,
                                       T* output) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  // Threads past the last (target, prior) pair have nothing to do.
  if (idx >= row * col) return;

  const int target_idx = idx / col;
  const int prior_idx = idx % col;
  const T* prior = prior_box_data + prior_idx * len;
  const T* variance = prior_box_var_data + prior_idx * len;
  const T* target = target_box_data + target_idx * len;
  T* encoded = output + idx * len;

  // Prior box in center-size form.
  const T prior_w = prior[2] - prior[0];
  const T prior_h = prior[3] - prior[1];
  const T prior_cx = (prior[2] + prior[0]) / 2;
  const T prior_cy = (prior[3] + prior[1]) / 2;

  // Target box in center-size form.
  const T target_cx = (target[2] + target[0]) / 2;
  const T target_cy = (target[3] + target[1]) / 2;
  const T target_w = target[2] - target[0];
  const T target_h = target[3] - target[1];

  // Center offsets are normalised by the prior size, sizes by a log ratio;
  // each component is then divided by its variance.
  encoded[0] = (target_cx - prior_cx) / prior_w / variance[0];
  encoded[1] = (target_cy - prior_cy) / prior_h / variance[1];
  encoded[2] = log(fabs(target_w / prior_w)) / variance[2];
  encoded[3] = log(fabs(target_h / prior_h)) / variance[3];
}
// Decodes every (target box, prior box) pair from center-size offsets back
// to corner coordinates.
//
// One thread handles one pair: thread `idx` pairs target record `idx / col`
// with prior box `idx % col` and writes [xmin, ymin, xmax, ymax] starting at
// output[idx * len]. Prior boxes are read as [xmin, ymin, xmax, ymax]; the
// target record supplies the four encoded values to be decoded.
//
//   prior_box_data      prior boxes, indexed by column
//   prior_box_var_data  per-prior variances, same layout as prior_box_data
//   target_box_data     encoded values, indexed by row
//   row, col            number of target / prior boxes
//   len                 stride (in elements) between consecutive boxes
//   output              destination, row * col records of stride `len`
template <typename T>
__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                       const T* prior_box_var_data,
                                       const T* target_box_data, const int row,
                                       const int col, const int len,
                                       T* output) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  // Threads past the last (target, prior) pair have nothing to do.
  if (idx >= row * col) return;

  const int target_idx = idx / col;
  const int prior_idx = idx % col;
  const T* prior = prior_box_data + prior_idx * len;
  const T* variance = prior_box_var_data + prior_idx * len;
  const T* target = target_box_data + target_idx * len;
  T* decoded = output + idx * len;

  // Prior box in center-size form.
  const T prior_w = prior[2] - prior[0];
  const T prior_h = prior[3] - prior[1];
  const T prior_cx = (prior[2] + prior[0]) / 2;
  const T prior_cy = (prior[3] + prior[1]) / 2;

  // Undo the encoding: sizes via exp of the scaled value, centers via the
  // scaled offset times the prior size plus the prior center.
  const T box_w = exp(variance[2] * target[2]) * prior_w;
  const T box_h = exp(variance[3] * target[3]) * prior_h;
  const T box_cx = variance[0] * target[0] * prior_w + prior_cx;
  const T box_cy = variance[1] * target[1] * prior_h + prior_cy;

  // Convert center-size back to corner form.
  decoded[0] = box_cx - box_w / 2;
  decoded[1] = box_cy - box_h / 2;
  decoded[2] = box_cx + box_w / 2;
  decoded[3] = box_cy + box_h / 2;
}
// GPU kernel wrapper for the box_coder operator.
//
// Reads PriorBox, PriorBoxVar and TargetBox, allocates OutputBox with shape
// [TargetBox.dims[0], PriorBox.dims[0], PriorBox.dims[1]] and launches the
// encode or decode CUDA kernel selected by the "code_type" attribute, one
// thread per (target, prior) pair.
template <typename T>
class BoxCoderCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* output_box = context.Output<framework::Tensor>("OutputBox");

    // LoD is optional, but when present only a single level is supported.
    // The literal is unsigned so both sides of the comparison are unsigned.
    if (target_box->lod().size()) {
      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
                        "Only support 1 level of LoD.");
    }

    auto row = target_box->dims()[0];
    auto col = prior_box->dims()[0];
    auto len = prior_box->dims()[1];

    const T* prior_box_data = prior_box->data<T>();
    const T* prior_box_var_data = prior_box_var->data<T>();
    const T* target_box_data = target_box->data<T>();

    // Allocate the output even when it is empty so downstream consumers
    // still see a tensor with the expected shape.
    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
    T* output = output_box->data<T>();

    // With no (target, prior) pairs the grid size below would be 0, which
    // is not a valid launch configuration; there is nothing to compute.
    if (row * col == 0) {
      return;
    }

    // One thread per (target, prior) pair, rounded up to whole blocks.
    int block = 512;
    int grid = (row * col + block - 1) / block;
    auto& device_ctx = context.cuda_device_context();

    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
    if (code_type == BoxCodeType::kEncodeCenterSize) {
      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
          output);
    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
          output);
    }
  }
};
} // namespace operators
} // namespace paddle
// Register the CUDA implementation of the box_coder operator, instantiated
// for float and double element types.
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
ops::BoxCoderCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Direction of the box_coder transform.
enum class BoxCodeType {
  kEncodeCenterSize = 0,  // corner-form boxes -> center-size deltas
  kDecodeCenterSize = 1   // center-size deltas -> corner-form boxes
};
// Translates the textual code type into its BoxCodeType value.
// Accepts "encode_center_size" and "decode_center_size"; any other string
// is reported through PADDLE_THROW.
inline BoxCodeType GetBoxCodeType(const std::string& type) {
  if (type == "encode_center_size") {
    return BoxCodeType::kEncodeCenterSize;
  }
  if (type == "decode_center_size") {
    return BoxCodeType::kDecodeCenterSize;
  }
  PADDLE_THROW("Not support type %s.", type);
}
// CPU kernel for the box_coder operator.
//
// For every (target box i, prior box j) pair it writes four values to the
// output at offset (i * col + j) * len, where col is the number of prior
// boxes and len is PriorBox's second dimension.
template <typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
 public:
  // Encodes each target box relative to each prior box: center offsets are
  // divided by the prior size and variance, and the log of the absolute size
  // ratio is divided by the variance.
  void EncodeCenterSize(const framework::Tensor& target_box,
                        const framework::Tensor& prior_box,
                        const framework::Tensor& prior_box_var,
                        T* output) const {
    const int64_t row = target_box.dims()[0];
    const int64_t col = prior_box.dims()[0];
    const int64_t len = prior_box.dims()[1];
    const T* target_base = target_box.data<T>();
    const T* prior_base = prior_box.data<T>();
    const T* var_base = prior_box_var.data<T>();

    for (int64_t i = 0; i < row; ++i) {
      const T* t = target_base + i * len;
      for (int64_t j = 0; j < col; ++j) {
        const T* p = prior_base + j * len;
        const T* v = var_base + j * len;
        T* out = output + static_cast<size_t>(i * col * len + j * len);

        const T prior_w = p[2] - p[0];
        const T prior_h = p[3] - p[1];
        const T prior_cx = (p[2] + p[0]) / 2;
        const T prior_cy = (p[3] + p[1]) / 2;

        const T target_cx = (t[2] + t[0]) / 2;
        const T target_cy = (t[3] + t[1]) / 2;
        const T target_w = t[2] - t[0];
        const T target_h = t[3] - t[1];

        out[0] = (target_cx - prior_cx) / prior_w / v[0];
        out[1] = (target_cy - prior_cy) / prior_h / v[1];
        out[2] = std::log(std::fabs(target_w / prior_w)) / v[2];
        out[3] = std::log(std::fabs(target_h / prior_h)) / v[3];
      }
    }
  }

  // Inverse of EncodeCenterSize: rebuilds the box center and size from the
  // encoded values and the prior box, then emits the two opposite corners.
  void DecodeCenterSize(const framework::Tensor& target_box,
                        const framework::Tensor& prior_box,
                        const framework::Tensor& prior_box_var,
                        T* output) const {
    const int64_t row = target_box.dims()[0];
    const int64_t col = prior_box.dims()[0];
    const int64_t len = prior_box.dims()[1];
    const T* target_base = target_box.data<T>();
    const T* prior_base = prior_box.data<T>();
    const T* var_base = prior_box_var.data<T>();

    for (int64_t i = 0; i < row; ++i) {
      const T* t = target_base + i * len;
      for (int64_t j = 0; j < col; ++j) {
        const T* p = prior_base + j * len;
        const T* v = var_base + j * len;
        T* out = output + static_cast<size_t>(i * col * len + j * len);

        const T prior_w = p[2] - p[0];
        const T prior_h = p[3] - p[1];
        const T prior_cx = (p[2] + p[0]) / 2;
        const T prior_cy = (p[3] + p[1]) / 2;

        const T box_cx = v[0] * t[0] * prior_w + prior_cx;
        const T box_cy = v[1] * t[1] * prior_h + prior_cy;
        const T box_w = std::exp(v[2] * t[2]) * prior_w;
        const T box_h = std::exp(v[3] * t[3]) * prior_h;

        out[0] = box_cx - box_w / 2;
        out[1] = box_cy - box_h / 2;
        out[2] = box_cx + box_w / 2;
        out[3] = box_cy + box_h / 2;
      }
    }
  }

  // Fetches the operator inputs, allocates OutputBox as {row, col, len} and
  // runs the transform named by the "code_type" attribute.
  void Compute(const framework::ExecutionContext& context) const override {
    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* output_box = context.Output<framework::Tensor>("OutputBox");

    // A LoD is optional; when present it must have exactly one level.
    if (target_box->lod().size()) {
      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
                        "Only support 1 level of LoD.");
    }

    auto row = target_box->dims()[0];
    auto col = prior_box->dims()[0];
    auto len = prior_box->dims()[1];

    // Allocate first, then resolve the attribute, then take the data pointer.
    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
    const BoxCodeType code_type =
        GetBoxCodeType(context.Attr<std::string>("code_type"));
    T* output = output_box->data<T>();

    switch (code_type) {
      case BoxCodeType::kEncodeCenterSize:
        EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
        break;
      case BoxCodeType::kDecodeCenterSize:
        DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
        break;
      default:
        break;
    }
  }
};
} // namespace operators
} // namespace paddle
...@@ -54,7 +54,15 @@ class CompareOpKernel ...@@ -54,7 +54,15 @@ class CompareOpKernel
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEM_TYPE; using T = typename Functor::ELEM_TYPE;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context); using Tensor = framework::Tensor;
auto* x = context.Input<Tensor>("X");
auto* y = context.Input<Tensor>("Y");
auto* z = context.Output<Tensor>("Out");
z->mutable_data<T>(context.GetPlace());
int axis = context.Attr<int>("axis");
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
z);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> { class ElementwiseAddKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T> ...@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T>
class ElementwiseAddGradKernel : public framework::OpKernel<T> { class ElementwiseAddGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
ElementwiseAddBroadCastGradFunctor<T>, ElementwiseAddBroadCastGradFunctor<T>,
ElementwiseAddBroadCast2GradFunctor<T>>(ctx); ElementwiseAddBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseDivKernel : public framework::OpKernel<T> { class ElementwiseDivKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T> ...@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T>
class ElementwiseDivGradKernel : public framework::OpKernel<T> { class ElementwiseDivGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
ElementwiseDivBroadCastGradFunctor<T>, ElementwiseDivBroadCastGradFunctor<T>,
ElementwiseDivBroadCast2GradFunctor<T>>(ctx); ElementwiseDivBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMaxKernel : public framework::OpKernel<T> { class ElementwiseMaxKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T> ...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMaxGradKernel : public framework::OpKernel<T> { class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
ElementwiseMaxBroadCastGradFunctor<T>, ElementwiseMaxBroadCastGradFunctor<T>,
ElementwiseMaxBroadCast2GradFunctor<T>>(ctx); ElementwiseMaxBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMinKernel : public framework::OpKernel<T> { class ElementwiseMinKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T> ...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMinGradKernel : public framework::OpKernel<T> { class ElementwiseMinGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
ElementwiseMinBroadCastGradFunctor<T>, ElementwiseMinBroadCastGradFunctor<T>,
ElementwiseMinBroadCast2GradFunctor<T>>(ctx); ElementwiseMinBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T> ...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMulKernel : public framework::OpKernel<T> { class ElementwiseMulKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T> ...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel : public framework::OpKernel<T> { class ElementwiseMulGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
ElementwiseMulBroadCastGradFunctor<T>, ElementwiseMulBroadCastGradFunctor<T>,
ElementwiseMulBroadCast2GradFunctor<T>>(ctx); ElementwiseMulBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV); ...@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
template <typename DeviceContext, typename T, typename functor, template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor> typename broadcastfunctor, typename broadcast2functor>
void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
const framework::Tensor* x,
const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, int axis,
framework::Tensor* dx, framework::Tensor* dy) {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims = y->dims();
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
} }
...@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { ...@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
x_dims = framework::make_ddim(extended_dims); x_dims = framework::make_ddim(extended_dims);
} }
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
int pre, n, post; int pre, n, post;
...@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { ...@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
template <typename Functor, typename DeviceContext, typename T, template <typename Functor, typename DeviceContext, typename T,
typename OutType = T> typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
using Tensor = framework::Tensor; const framework::Tensor* x,
const framework::Tensor* y, int axis,
auto* x = ctx.Input<Tensor>("X"); framework::Tensor* z) {
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<OutType>(ctx.GetPlace());
TransformFunctor<Functor, T, DeviceContext, OutType> functor( TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), Functor()); x, y, z, ctx.template device_context<DeviceContext>(), Functor());
...@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { ...@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
x_dims = framework::make_ddim(extended_dims); x_dims = framework::make_ddim(extended_dims);
} }
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)"); "Axis should be in range [0, x_dims)");
......
...@@ -29,7 +29,14 @@ template <typename DeviceContext, typename T> ...@@ -29,7 +29,14 @@ template <typename DeviceContext, typename T>
class ElementwisePowKernel : public framework::OpKernel<T> { class ElementwisePowKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
......
...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T> ...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
class ElementwiseSubKernel : public framework::OpKernel<T> { class ElementwiseSubKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T> ...@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T>
class ElementwiseSubGradKernel : public framework::OpKernel<T> { class ElementwiseSubGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
ElementwiseSubBroadCastGradFunctor<T>, ElementwiseSubBroadCastGradFunctor<T>,
ElementwiseSubBroadCast2GradFunctor<T>>(ctx); ElementwiseSubBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Strategy used to pick training examples in mine_hard_examples.
enum MiningType {
  kNone = 0,     // unrecognized / no mining
  kMaxNegative,  // "max_negative"
  kHardExample   // "hard_example"
};
// Comparator for (score, payload) pairs that orders by score, highest first.
// Returns true only when pair1's score is strictly greater than pair2's, so
// equal scores compare as equivalent.
template <typename T>
bool SortScoreDescend(const std::pair<float, T>& pair1,
                      const std::pair<float, T>& pair2) {
  return pair2.first < pair1.first;
}
// Decides whether a prior box is a mining candidate.
//   kMaxNegative : only unmatched boxes (match_idx == -1) whose match_dist is
//                  strictly below neg_dist_threshold
//   kHardExample : every box
//   anything else: no box
inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
                             const float match_dist,
                             const float neg_dist_threshold) {
  switch (mining_type) {
    case MiningType::kMaxNegative:
      return match_idx == -1 && match_dist < neg_dist_threshold;
    case MiningType::kHardExample:
      return true;
    default:
      return false;
  }
}
// Parses the mining_type string: "max_negative" and "hard_example" map to
// their enum values, every other string yields kNone.
inline MiningType GetMiningType(std::string str) {
  if (str == "max_negative") {
    return MiningType::kMaxNegative;
  }
  if (str == "hard_example") {
    return MiningType::kHardExample;
  }
  return MiningType::kNone;
}
// Kernel for the mine_hard_examples operator.
//
// For each sample in the batch it ranks the eligible prior boxes by loss
// (ClsLoss, plus LocLoss in hard_example mode when LocLoss is provided),
// keeps the top `neg_sel` of them, and emits:
//   - NegIndices: a LoDTensor of shape [total, 1] holding the selected
//     unmatched prior indices per sample, with a one-level LoD marking the
//     per-sample ranges;
//   - UpdatedMatchIndices: a copy of MatchIndices in which, in hard_example
//     mode, matched priors that were not selected are reset to -1.
template <typename DeviceContext, typename T>
class MineHardExamplesKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
    T neg_dist_threshold =
        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
    int sample_size = ctx.Attr<int>("sample_size");
    MiningType mining_type =
        GetMiningType(ctx.Attr<std::string>("mining_type"));

    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
    auto out_match_indices =
        ctx.Output<framework::Tensor>("UpdatedMatchIndices");

    // Start from an unmodified copy; hard_example mode edits it below.
    framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);

    int batch_size = in_matched_indices->dims()[0];
    int prior_num = in_matched_indices->dims()[1];

    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
    auto match_indices_et =
        framework::EigenMatrix<int>::From(*out_match_indices);
    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);

    const T* cls_loss = in_cls_loss->data<T>();
    // LocLoss is optional; it stays null when the input is absent.
    const T* loc_loss = nullptr;
    if (in_loc_loss) {
      loc_loss = in_loc_loss->data<T>();
    }

    std::vector<std::vector<int>> all_neg_indices;
    std::vector<size_t> batch_starts = {0};
    for (int n = 0; n < batch_size; ++n) {
      // (loss, prior index) for every eligible prior of sample n.
      std::vector<std::pair<T, size_t>> loss_idx;
      int neg_sel = 0;
      for (int m = 0; m < prior_num; ++m) {
        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
                             neg_dist_threshold)) {
          T loss = cls_loss[n * prior_num + m];
          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
          }
          loss_idx.push_back(std::make_pair(loss, m));
          ++neg_sel;
        }
      }

      // Cap the number of selected priors according to the mining mode.
      if (mining_type == MiningType::kMaxNegative) {
        int num_pos = 0;
        for (int m = 0; m < prior_num; ++m) {
          if (match_indices(n, m) != -1) ++num_pos;
        }
        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
      } else if (mining_type == MiningType::kHardExample) {
        neg_sel = std::min(sample_size, neg_sel);
      }

      // Sort by loss, highest first. The comparator works on the stored
      // pair<T, size_t> directly so scores are compared at full precision of
      // T and no converted temporaries are created per comparison.
      std::sort(loss_idx.begin(), loss_idx.end(),
                [](const std::pair<T, size_t>& lhs,
                   const std::pair<T, size_t>& rhs) {
                  return lhs.first > rhs.first;
                });

      // Indices of the top neg_sel priors, ordered by index.
      std::set<int> sel_indices;
      std::vector<int> neg_indices;
      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
                     std::inserter(sel_indices, sel_indices.begin()),
                     [](const std::pair<T, size_t>& l) -> int {
                       return static_cast<int>(l.second);
                     });

      if (mining_type == MiningType::kHardExample) {
        for (int m = 0; m < prior_num; ++m) {
          if (match_indices(n, m) > -1) {
            // Matched but not selected: drop the match.
            if (sel_indices.find(m) == sel_indices.end()) {
              match_indices_et(n, m) = -1;
            }
          } else {
            // Unmatched and selected: report as a negative example.
            if (sel_indices.find(m) != sel_indices.end()) {
              neg_indices.push_back(m);
            }
          }
        }
      } else {
        neg_indices.resize(sel_indices.size());
        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
      }

      all_neg_indices.push_back(neg_indices);
      batch_starts.push_back(batch_starts.back() + neg_indices.size());
    }

    framework::LoD out_neg_indices_lod;
    out_neg_indices_lod.emplace_back(batch_starts);
    int neg_offset = 0;
    auto neg_data = out_neg_indices->mutable_data<int>(
        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
        ctx.GetPlace());

    // Flatten the per-sample index lists into the output buffer; iterate by
    // reference to avoid copying each vector.
    for (const auto& neg_indices : all_neg_indices) {
      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
      neg_offset += neg_indices.size();
    }
    out_neg_indices->set_lod(out_neg_indices_lod);
  }
};
// Operator definition for mine_hard_examples: validates inputs, outputs and
// attributes, and sets the shape of UpdatedMatchIndices.
class MineHardExamplesOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  // Checks that the required inputs/outputs exist, that ClsLoss,
  // MatchIndices, MatchDist (and LocLoss when given) are 2-D with matching
  // dimensions, and that the attributes are valid for the chosen mining_type.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("MatchIndices"),
        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("MatchDist"),
        "Input(MatchDist) of MineHardExamplesOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("NegIndices"),
        "Output(NegIndices) of MineHardExamplesOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
                   "not be null.");

    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
    auto idx_dims = ctx->GetInputDim("MatchIndices");
    auto dis_dims = ctx->GetInputDim("MatchDist");

    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
                      "The shape of ClsLoss is [N, Np].");
    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
                      "The shape of MatchIndices is [N, Np].");
    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
                      "The shape of MatchDist is [N, Np].");

    // LocLoss is optional; validate it only when supplied.
    if (ctx->HasInput("LocLoss")) {
      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
                        "The shape of LocLoss is [N, Np].");
      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
                        "Batch size of ClsLoss and LocLoss must be the same.");
      PADDLE_ENFORCE_EQ(
          cls_loss_dims[1], loc_loss_dims[1],
          "Prior box number of ClsLoss and LocLoss must be the same.");
    }

    PADDLE_ENFORCE_EQ(
        cls_loss_dims[0], idx_dims[0],
        "Batch size of ClsLoss and MatchIndices must be the same.");
    PADDLE_ENFORCE_EQ(
        cls_loss_dims[1], idx_dims[1],
        "Prior box number of ClsLoss and MatchIndices must be the same.");

    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
                      "Batch size of ClsLoss and MatchDist must be the same.");
    // Compare against MatchDist's own second dimension; checking idx_dims
    // here would let a MatchDist with the wrong prior count slip through.
    PADDLE_ENFORCE_EQ(
        cls_loss_dims[1], dis_dims[1],
        "Prior box number of ClsLoss and MatchDist must be the same.");

    auto mining_type =
        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));

    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
                      "mining_type must be hard_example or max_negative");

    // Each mode only validates the attributes it actually uses.
    if (mining_type == MiningType::kMaxNegative) {
      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
      PADDLE_ENFORCE_GT(
          neg_pos_ratio, 0.0f,
          "neg_pos_ratio must greater than zero in max_negative mode");
      PADDLE_ENFORCE_GT(
          neg_dist_threshold, 0.0f,
          "neg_dist_threshold must greater than zero in max_negative mode");
    } else if (mining_type == MiningType::kHardExample) {
      auto sample_size = ctx->Attrs().Get<int>("sample_size");
      PADDLE_ENFORCE_GT(
          sample_size, 0,
          "sample_size must greater than zero in hard_example mode");
    }

    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
  }

 protected:
  // The kernel data type follows the element type of ClsLoss.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
        ctx.device_context());
  }
};
// Declares the inputs, attributes, outputs and user-facing documentation of
// the mine_hard_examples operator.
//
// The description strings below are emitted into the operator proto, so the
// type tags in them must agree with the AddAttr<...> template argument.
class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "ClsLoss",
        "(Tensor, default Tensor<float>), The classification loss with shape "
        "[N, Np], N is the batch size and Np is the number of prior box.");
    // LocLoss is optional: InferShape only validates it when it is present.
    AddInput("LocLoss",
             "(Tensor, optional, default Tensor<float>), The localization loss "
             "with shape [N, Np], N is the batch size and Np is the number of "
             "prior box.")
        .AsDispensable();
    AddInput("MatchIndices",
             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
             "the batch size and Np is the number of prior box. "
             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
             "instance does not match any entity, otherwise means it is "
             "matched to row.");
    AddInput("MatchDist",
             "(Tensor, default Tensor<float>) Matched distance with shape [N, "
             "Np], N is the batch size and Np is the number of prior box.");
    AddAttr<float>("neg_pos_ratio",
                   "(float) The ratio of the negative box to the positive "
                   "box. Use only when mining_type is max_negative.")
        .SetDefault(1.0);
    AddAttr<float>("neg_dist_threshold",
                   "(float) The negative overlap upper bound for the unmatched "
                   "predictions. Use only when mining_type is max_negative.")
        .SetDefault(0.5);
    // The default of 0 is rejected by InferShape in hard_example mode, so the
    // caller has to set a positive value when using that mode.
    AddAttr<int>("sample_size",
                 "(int) The max sample size of negative box. Use only when "
                 "mining_type is hard_example.")
        .SetDefault(0);
    AddAttr<std::string>("mining_type",
                         "(string) The mining algorithm name, the value is "
                         "hard_example or max_negative.")
        .SetDefault("max_negative")
        .InEnum({"hard_example", "max_negative"});
    AddOutput(
        "NegIndices",
        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
        "and each element is the prior box index. "
        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
        "and sample 1's box 0 is selected. The output NegIndices is "
        "[[1], [0]].");
    AddOutput("UpdatedMatchIndices",
              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
              "shape [N, Np]. Only update when mining_type is "
              "hard_example. The input MatchIndices elements will be update to "
              "-1 when it is not in the candidate high loss list of negative "
              "examples.");
    AddComment(R"DOC(
Mine hard examples Operator.
This operator implements hard example mining to select a subset of negative box indices.
For each image, selects the box with highest losses, subject to the condition that the
box cannot have a MatchDist > neg_dist_threshold when mining_type is max_negative.
The selected number is min(sample_size, max_negative_box_number) when mining_type is
hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number)
when mining_type is max_negative, where the max_negative_box_number is the count of
MatchIndices elements with value -1.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
// Operator registration for mine_hard_examples.
// The op is registered without a gradient operator, and CPU kernels are
// instantiated for float and double element types.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
ops::MineHardExamplesOpMaker);
REGISTER_OP_CPU_KERNEL(
mine_hard_examples,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Short aliases for the framework tensor types used throughout this file.
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
// Number of values written per detection row by MultiClassOutput:
// label, score and the four box coordinates.
constexpr int64_t kOutputDim = 6;
// Number of coordinates per bounding box: [xmin, ymin, xmax, ymax].
constexpr int64_t kBBoxSize = 4;
// Operator definition for multiclass_nms: validates the input shapes at
// graph-construction time and selects the kernel by the dtype of "Scores".
class MultiClassNMSOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that BBoxes is [M, 4], Scores is rank 3 with its last dimension
  // equal to M, and sets a provisional shape for Out.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
                   "Input(BBoxes) of MultiClassNMS should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Scores"),
                   "Input(Scores) of MultiClassNMS should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of MultiClassNMS should not be null.");

    auto box_dims = ctx->GetInputDim("BBoxes");
    auto score_dims = ctx->GetInputDim("Scores");

    PADDLE_ENFORCE_EQ(box_dims.size(), 2,
                      "The rank of Input(BBoxes) must be 2.");
    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                      "The rank of Input(Scores) must be 3.");
    // Use the shared constant so the check cannot drift from the kernel,
    // which strides through the boxes with kBBoxSize.
    PADDLE_ENFORCE_EQ(box_dims[1], kBBoxSize,
                      "The 2nd dimension of Input(BBoxes) must be 4, "
                      "represents the layout of coordinate "
                      "[xmin, ymin, xmax, ymax]");
    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
                      "The 1st dimension of Input(BBoxes) must be equal to "
                      "3rd dimension of Input(Scores), which represents the "
                      "predicted bboxes.");

    // Here the box_dims[0] is not the real dimension of output.
    // It will be rewritten in the computing kernel.
    ctx->SetOutputDim("Out", {box_dims[0], kOutputDim});
  }

 protected:
  // The kernel type follows the element type of the "Scores" input.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(
            ctx.Input<framework::LoDTensor>("Scores")->type()),
        ctx.device_context());
  }
};
// Comparator for ordering (score, payload) pairs from highest to lowest score.
// Only the score (pair.first) takes part in the comparison; the payload is
// ignored, so pairs with equal scores compare as equivalent.
template <class T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
                          const std::pair<float, T>& pair2) {
  const float lhs_score = pair1.first;
  const float rhs_score = pair2.first;
  return rhs_score < lhs_score;
}
// Collects the (score, index) pairs whose score is strictly greater than
// `threshold`, ordered by descending score.
//
// scores          Per-box scores; the position in the vector is the box index.
// threshold       Scores less than or equal to this value are dropped.
// top_k           If greater than -1, at most top_k pairs are kept.
// sorted_indices  Output; pairs are appended to whatever it already holds
//                 before the whole vector is sorted and truncated.
//
// The comparison is done in the score type T. A comparator fixed to
// std::pair<float, int> would silently round double scores to float and could
// misorder scores that differ only beyond float precision.
template <class T>
static inline void GetMaxScoreIndex(
    const std::vector<T>& scores, const T threshold, int top_k,
    std::vector<std::pair<T, int>>* sorted_indices) {
  for (size_t i = 0; i < scores.size(); ++i) {
    if (scores[i] > threshold) {
      sorted_indices->push_back(
          std::make_pair(scores[i], static_cast<int>(i)));
    }
  }
  // Stable sort keeps the original index order among equal scores.
  std::stable_sort(
      sorted_indices->begin(), sorted_indices->end(),
      [](const std::pair<T, int>& lhs, const std::pair<T, int>& rhs) {
        return lhs.first > rhs.first;
      });
  // Keep top_k scores if needed. top_k is known to be non-negative before
  // the cast, so the unsigned comparison is safe.
  if (top_k > -1 && static_cast<size_t>(top_k) < sorted_indices->size()) {
    sorted_indices->resize(top_k);
  }
}
// Area of a box laid out as [xmin, ymin, xmax, ymax].
//
// Returns 0 when the box is inverted (xmax < xmin or ymax < ymin). When
// `normalized` is false, one is added to both the width and the height before
// multiplying.
template <class T>
static inline T BBoxArea(const T* box, const bool normalized) {
  const bool inverted = box[2] < box[0] || box[3] < box[1];
  if (inverted) {
    return static_cast<T>(0.);
  }
  const T width = box[2] - box[0];
  const T height = box[3] - box[1];
  if (!normalized) {
    // Coordinates are not within range [0, 1].
    return (width + 1) * (height + 1);
  }
  return width * height;
}
// Intersection-over-union of two boxes laid out as [xmin, ymin, xmax, ymax].
//
// Returns 0 when the boxes do not overlap. `normalized` is forwarded to
// BBoxArea and also decides whether the intersection extent gets the same +1
// offset that BBoxArea applies to non-normalized boxes; without that offset
// the intersection and the two areas would be measured differently and the
// ratio would be understated for pixel coordinates.
template <class T>
static inline T JaccardOverlap(const T* box1, const T* box2,
                               const bool normalized) {
  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
      box2[3] < box1[1]) {
    return static_cast<T>(0.);
  }
  const T inter_xmin = std::max(box1[0], box2[0]);
  const T inter_ymin = std::max(box1[1], box2[1]);
  const T inter_xmax = std::min(box1[2], box2[2]);
  const T inter_ymax = std::min(box1[3], box2[3]);
  // Matches the (w + 1) * (h + 1) convention of BBoxArea; zero when the
  // coordinates are normalized, so that path is unchanged.
  const T offset = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
  const T inter_w = inter_xmax - inter_xmin + offset;
  const T inter_h = inter_ymax - inter_ymin + offset;
  const T inter_area = inter_w * inter_h;
  const T bbox1_area = BBoxArea<T>(box1, normalized);
  const T bbox2_area = BBoxArea<T>(box2, normalized);
  return inter_area / (bbox1_area + bbox2_area - inter_area);
}
// CPU kernel for multiclass_nms.
//
// For every instance in the batch it runs NMS per class (skipping the
// background label), optionally keeps only the keep_top_k best detections
// across classes, and writes rows of
// [label, score, xmin, ymin, xmax, ymax] into the LoD output.
template <typename T>
class MultiClassNMSKernel : public framework::OpKernel<T> {
 public:
  // Greedy NMS for one class.
  //
  // bbox              Boxes; dims()[0] is the box count, dims()[1] the stride.
  // scores            Scores of one class; the first dims()[0]-of-bbox values
  //                   are read.
  // score_threshold   Boxes scoring at or below this are ignored.
  // nms_threshold     Initial overlap limit for keeping a box.
  // eta               When < 1, the limit is multiplied by eta after each
  //                   kept box while the limit is still above 0.5.
  // top_k             Candidate cap forwarded to GetMaxScoreIndex.
  // selected_indices  Output; cleared, then filled with kept box indices in
  //                   descending score order.
  void NMSFast(const Tensor& bbox, const Tensor& scores,
               const T score_threshold, const T nms_threshold, const T eta,
               const int64_t top_k, std::vector<int>* selected_indices) const {
    // The total boxes for each instance.
    int64_t num_boxes = bbox.dims()[0];
    // 4: [xmin ymin xmax ymax]
    int64_t box_size = bbox.dims()[1];

    std::vector<T> scores_data(num_boxes);
    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
    std::vector<std::pair<T, int>> sorted_indices;
    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);

    selected_indices->clear();
    T adaptive_threshold = nms_threshold;
    const T* bbox_data = bbox.data<T>();

    // Walk the candidates in score order. Iterating is equivalent to
    // repeatedly popping the front of the vector, without the quadratic cost
    // of erasing the first element each time.
    for (const auto& candidate : sorted_indices) {
      const int idx = candidate.second;
      bool keep = true;
      for (size_t k = 0; keep && k < selected_indices->size(); ++k) {
        const int kept_idx = (*selected_indices)[k];
        // Overlap is always computed with normalized == true here.
        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                      bbox_data + kept_idx * box_size, true);
        keep = overlap <= adaptive_threshold;
      }
      if (keep) {
        selected_indices->push_back(idx);
        if (eta < 1 && adaptive_threshold > 0.5) {
          adaptive_threshold *= eta;
        }
      }
    }
  }

  // Runs NMSFast for every non-background class of one instance and applies
  // the keep_top_k limit across classes.
  //
  // scores         [class_num, predict_dim] scores of the instance.
  // bboxes         Boxes shared by all classes.
  // indices        Output; class label -> kept box indices.
  // num_nmsed_out  Output; number of detections kept for the instance.
  void MultiClassNMS(const framework::ExecutionContext& ctx,
                     const Tensor& scores, const Tensor& bboxes,
                     std::map<int, std::vector<int>>& indices,
                     int& num_nmsed_out) const {
    int64_t background_label = ctx.Attr<int>("background_label");
    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));

    int64_t class_num = scores.dims()[0];
    int64_t predict_dim = scores.dims()[1];
    int num_det = 0;
    for (int64_t c = 0; c < class_num; ++c) {
      if (c == background_label) continue;
      Tensor score = scores.Slice(c, c + 1);
      NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
              &(indices[c]));
      num_det += indices[c].size();
    }

    num_nmsed_out = num_det;
    const T* scores_data = scores.data<T>();
    if (keep_top_k > -1 && num_det > keep_top_k) {
      // Scores are ranked in their own type T; storing them as float would
      // round double scores before the cross-class ranking.
      using ScoredDetection = std::pair<T, std::pair<int, int>>;
      std::vector<ScoredDetection> score_index_pairs;
      for (const auto& it : indices) {
        int label = it.first;
        const T* sdata = scores_data + label * predict_dim;
        const std::vector<int>& label_indices = it.second;
        for (size_t j = 0; j < label_indices.size(); ++j) {
          int idx = label_indices[j];
          PADDLE_ENFORCE_LT(idx, predict_dim);
          score_index_pairs.push_back(
              std::make_pair(sdata[idx], std::make_pair(label, idx)));
        }
      }
      // Keep top k results per image.
      std::stable_sort(
          score_index_pairs.begin(), score_index_pairs.end(),
          [](const ScoredDetection& lhs, const ScoredDetection& rhs) {
            return lhs.first > rhs.first;
          });
      score_index_pairs.resize(keep_top_k);

      // Store the new indices.
      std::map<int, std::vector<int>> new_indices;
      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
        int label = score_index_pairs[j].second.first;
        int idx = score_index_pairs[j].second.second;
        new_indices[label].push_back(idx);
      }
      new_indices.swap(indices);
      num_nmsed_out = keep_top_k;
    }
  }

  // Writes one output row per selected detection of an instance, iterating
  // labels in map order and boxes in the order stored for each label.
  void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
                        std::map<int, std::vector<int>>& selected_indices,
                        Tensor* outs) const {
    int predict_dim = scores.dims()[1];
    auto* scores_data = scores.data<T>();
    auto* bboxes_data = bboxes.data<T>();
    auto* odata = outs->data<T>();

    int count = 0;
    for (const auto& it : selected_indices) {
      int label = it.first;
      const T* sdata = scores_data + label * predict_dim;
      const std::vector<int>& indices = it.second;
      for (size_t j = 0; j < indices.size(); ++j) {
        int idx = indices[j];
        const T* bdata = bboxes_data + idx * kBBoxSize;
        odata[count * kOutputDim] = label;           // label
        odata[count * kOutputDim + 1] = sdata[idx];  // score
        // xmin, ymin, xmax, ymax
        std::memcpy(odata + count * kOutputDim + 2, bdata,
                    kBBoxSize * sizeof(T));
        count++;
      }
    }
  }

  // Kernel entry point: selects detections for every instance, allocates the
  // output once the total count is known, fills it, and sets the LoD offsets.
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* boxes = ctx.Input<Tensor>("BBoxes");
    auto* scores = ctx.Input<Tensor>("Scores");
    auto* outs = ctx.Output<LoDTensor>("Out");

    auto score_dims = scores->dims();

    int64_t batch_size = score_dims[0];
    int64_t class_num = score_dims[1];
    int64_t predict_dim = score_dims[2];

    // First pass: run NMS per instance and accumulate the row offsets.
    std::vector<std::map<int, std::vector<int>>> all_indices;
    std::vector<size_t> batch_starts = {0};
    for (int64_t i = 0; i < batch_size; ++i) {
      Tensor ins_score = scores->Slice(i, i + 1);
      ins_score.Resize({class_num, predict_dim});
      std::map<int, std::vector<int>> indices;
      int num_nmsed_out = 0;
      MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
      all_indices.push_back(indices);
      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
    }

    int num_kept = batch_starts.back();
    if (num_kept == 0) {
      // Nothing detected in the whole batch: emit a single -1 marker.
      T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
      od[0] = -1;
    } else {
      // Second pass: copy the selected detections into the output rows.
      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
      for (int64_t i = 0; i < batch_size; ++i) {
        Tensor ins_score = scores->Slice(i, i + 1);
        ins_score.Resize({class_num, predict_dim});

        int64_t s = batch_starts[i];
        int64_t e = batch_starts[i + 1];
        if (e > s) {
          Tensor out = outs->Slice(s, e);
          MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
        }
      }
    }

    framework::LoD lod;
    lod.emplace_back(batch_starts);

    outs->set_lod(lod);
  }
};
// Declares the inputs, attributes, output and user-facing documentation of
// the multiclass_nms operator.
//
// The description strings are emitted into the operator proto; their type
// tags are kept in agreement with the AddAttr<...> template arguments.
class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("BBoxes",
             "(Tensor) A 2-D Tensor with shape [M, 4] represents the "
             "predicted locations of M bounding bboxes. Each bounding box "
             "has four coordinate values and the layout is "
             "[xmin, ymin, xmax, ymax].");
    AddInput("Scores",
             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
             "predicted confidence predictions. N is the batch size, C is the "
             "class number, M is number of bounding boxes. For each category "
             "there are total M scores which corresponding M bounding boxes. "
             " Please note, M is equal to the 1st dimension of BBoxes. ");
    AddAttr<int>(
        "background_label",
        "(int, default: 0) "
        "The index of background label, the background label will be ignored. "
        "If set to -1, then all categories will be considered.")
        .SetDefault(0);
    // No default is registered for score_threshold, nms_top_k or keep_top_k.
    AddAttr<float>("score_threshold",
                   "(float) "
                   "Threshold to filter out bounding boxes with low "
                   "confidence score. If not provided, consider all boxes.");
    AddAttr<int>("nms_top_k",
                 "(int) "
                 "Maximum number of detections to be kept according to the "
                 "confidences after the filtering detections based on "
                 "score_threshold");
    AddAttr<float>("nms_threshold",
                   "(float, default: 0.3) "
                   "The threshold to be used in NMS.")
        .SetDefault(0.3);
    AddAttr<float>("nms_eta",
                   "(float) "
                   "The parameter for adaptive NMS.")
        .SetDefault(1.0);
    AddAttr<int>("keep_top_k",
                 "(int) "
                 "Number of total bboxes to be kept per image after NMS "
                 "step. -1 means keeping all bboxes after NMS step.");
    AddOutput("Out",
              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
              "detections. Each row has 6 values: "
              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
              "number of detections in this mini-batch. For each instance, "
              "the offsets in first dimension are called LoD, the number of "
              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
              "no detected bbox.");
    AddComment(R"DOC(
This operator is to do multi-class non maximum suppression (NMS) on a batch
of boxes and scores.
In the NMS step, this operator greedily selects a subset of detection bounding
boxes that have high scores larger than score_threshold, if providing this
threshold, then selects the largest nms_top_k confidences scores if nms_top_k
is larger than -1. Then this operator prunes away boxes that have high IOU
(intersection over union) overlap with already selected boxes by adaptive
threshold NMS based on parameters of nms_threshold and nms_eta.
After NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1.
This operator supports multi-class and batched inputs. It applies NMS
independently for each class. The output is a 2-D LoDTensor, for each
image, the offsets in first dimension of LoDTensor are called LoD, the number
of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
means there is no detected bbox for this image. If there is no detected boxes
for all images, all the elements in LoD are 0, and the Out only contains one
value which is -1.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
// Operator registration for multiclass_nms.
// EmptyGradOpMaker is passed as the gradient maker, and CPU kernels are
// instantiated for float and double element types.
namespace ops = paddle::operators;
REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
ops::MultiClassNMSOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
ops::MultiClassNMSKernel<double>);
...@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase { ...@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
auto step_scopes = auto step_scopes =
scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>(); scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
"Condition of while op must in CPU memory.");
while (cond.data<bool>()[0]) { while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope(); auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope); step_scopes->push_back(&current_scope);
...@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
...@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase {
sum_op->Run(cur_scope, dev_place); sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
} }
dev_ctx.Wait();
const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
} }
} }
}; };
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
    '''
    Render a debug graph of the variables and operators of a block.

    Args:
        block(Block): the block to draw; its desc is serialized and parsed
            back as a framework_pb2.BlockDesc.
        highlights(list|None): regular-expression strings; any variable or
            operator whose name matches one of them (via re.match) is
            highlighted, as is every edge touching it.
        path(str): location of the dot file handed to the graph generator.
    '''
    graph = GraphPreviewGenerator("some graph")
    # Round-trip through the serialized form to get a protobuf description.
    serialized = block.desc.serialize_to_string()
    desc = framework_pb2.BlockDesc.FromString(str(serialized))

    def need_highlight(name):
        # Without patterns nothing is highlighted.
        if highlights is None:
            return False
        for pattern in highlights:
            assert type(pattern) is str
            if re.match(pattern, name):
                return True
        return False

    # One graph node per declared variable, keyed by variable name.
    var_nodes = {}
    for var in desc.vars:
        dims = [str(d) for d in var.lod_tensor.tensor.dims] or ['null']
        marked = need_highlight(var.name)
        if var.persistable:
            # Persistable variables are drawn as parameters.
            node = graph.add_param(var.name, var.type, dims, highlight=marked)
        else:
            node = graph.add_arg(var.name, highlight=marked)
        var_nodes[var.name] = node

    def add_op_link_var(op, var, op2var=False):
        # Connect `op` with every argument of `var`; op2var picks direction.
        for arg in var.arguments:
            if arg not in var_nodes:
                # Variables not declared in desc.vars become plain arguments.
                var_nodes[arg] = graph.add_arg(
                    arg, highlight=need_highlight(arg))
            varn = var_nodes[arg]
            highlight = need_highlight(op.description) or need_highlight(
                varn.description)
            source, target = (op, varn) if op2var else (varn, op)
            graph.add_edge(source, target, highlight=highlight)

    for op in desc.ops:
        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
        for var in op.inputs:
            add_op_link_var(opn, var, False)
        for var in op.outputs:
            add_op_link_var(opn, var, True)

    graph(path, show=True)
...@@ -451,9 +451,8 @@ class Operator(object): ...@@ -451,9 +451,8 @@ class Operator(object):
if not given == need: if not given == need:
raise ValueError(("Incorrect setting for output(s) of " raise ValueError(("Incorrect setting for output(s) of "
"operator \"%s\". Need: [%s] Given: [%s]") % "operator \"%s\". Need: [%s] Given: [%s]") %
(type, ", ".join(str(e) (type, ", ".join(str(e) for e in need),
for e in need), ", ".join( ", ".join(str(e) for e in given)))
str(e) for e in given)))
for out_proto in proto.outputs: for out_proto in proto.outputs:
out_args = outputs[out_proto.name] out_args = outputs[out_proto.name]
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
import logging
def crepr(v):
    '''
    Format a value for use on the right-hand side of a dot attribute.

    Strings are wrapped in double quotes (the content is not escaped); every
    other value is converted with str().

    `unicode` only exists on Python 2, so it is looked up defensively: on
    Python 3 the original check raised NameError for any non-str value.
    '''
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str, )
    if isinstance(v, string_types):
        return '"%s"' % v
    return str(v)
class Rank(object):
    '''
    A dot rank group: a set of nodes that share one `rank=` constraint.
    '''

    def __init__(self, kind, name, priority):
        '''
        kind: str, value written after `rank=`
        name: str, name of the group
        priority: int, used by the owning graph to order the groups
        '''
        self.kind = kind
        self.name = name
        self.priority = priority
        # Nodes attached to this group; filled in by the owning graph.
        self.nodes = []

    def __str__(self):
        # An empty group produces no dot output at all.
        if not self.nodes:
            return ''
        member_names = ','.join(node.name for node in self.nodes)
        return '{{rank={};{}}}'.format(self.kind, member_names)
class Graph(object):
    '''
    In-memory description of a dot digraph: nodes, edges, graph attributes
    and rank groups, with helpers to write the dot source and render it.
    '''
    # Class-wide counter used to give every rank group a unique name.
    rank_counter = 0

    def __init__(self, title, **attrs):
        '''
        title: str, written as the `title` attribute of the digraph
        attrs: extra graph-level attributes, written as key=value lines
        '''
        self.title = title
        self.attrs = attrs
        self.nodes = []
        self.edges = []
        self.rank_groups = {}

    def code(self):
        '''Return the dot source of the graph.'''
        return self.__str__()

    def rank_group(self, kind, priority):
        '''Create a new rank group and return its generated name.'''
        name = "rankgroup-%d" % Graph.rank_counter
        Graph.rank_counter += 1
        rank = Rank(kind, name, priority)
        self.rank_groups[name] = rank
        return name

    def node(self, label, prefix, description="", **attrs):
        '''
        Create a node, register it with the graph and, when a `rank` keyword
        names an existing rank group, attach it to that group.
        '''
        # The node receives its own copy of attrs, `rank` included.
        node = Node(label, prefix, description, **attrs)
        if 'rank' in attrs:
            self.rank_groups[attrs['rank']].nodes.append(node)
        self.nodes.append(node)
        return node

    def edge(self, source, target, **attrs):
        '''Create an edge from source to target and register it.'''
        edge = Edge(source, target, **attrs)
        self.edges.append(edge)
        return edge

    def compile(self, dot_path):
        '''
        Write the dot source to dot_path, render it to PDF with the `dot`
        executable and return the PDF path.
        '''
        # Close the file before invoking dot so the source is fully flushed
        # to disk when dot reads it.
        with open(dot_path, 'w') as dot_file:
            dot_file.write(self.__str__())
        image_path = os.path.join(
            os.path.dirname(__file__), dot_path[:-3] + "pdf")
        cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
        # Wait for dot to finish so callers (see show) receive the path of a
        # file that has actually been produced; communicate() also drains the
        # pipes so a chatty dot cannot block on a full buffer.
        subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE).communicate()
        logging.warning("write block debug graph to {}".format(image_path))
        return image_path

    def show(self, dot_path):
        '''Render the graph and launch the `open` command on the result.'''
        image = self.compile(dot_path)
        cmd = ["open", image]
        # The viewer is started but deliberately not waited for.
        subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

    def _rank_repr(self):
        '''Return the rank groups, one per line, by ascending priority.'''
        # A key function replaces the former boolean `cmp` callback, which
        # never returned a negative value (so it could not order anything)
        # and is not accepted by sorted() on Python 3.
        ranks = sorted(
            self.rank_groups.items(), key=lambda item: item[1].priority)
        lines = [str(item[1]) for item in ranks]
        return '\n'.join(lines) + '\n'

    def __str__(self):
        reprs = [
            'digraph G {',
            'title = {}'.format(crepr(self.title)),
        ]

        for attr in self.attrs:
            reprs.append("{key}={value};".format(
                key=attr, value=crepr(self.attrs[attr])))

        reprs.append(self._rank_repr())

        # NOTE(review): the node list is shuffled in place on every call;
        # presumably to vary the layout dot produces - confirm before
        # relying on a stable node order.
        random.shuffle(self.nodes)
        reprs += [str(node) for node in self.nodes]

        for x in self.edges:
            reprs.append(str(x))

        reprs.append('}')
        return '\n'.join(reprs)
class Node(object):
    '''
    A dot node with a generated unique name, a label and free-form attributes.
    '''
    # Class-wide counter appended to the prefix to make node names unique.
    counter = 1

    def __init__(self, label, prefix, description="", **attrs):
        '''
        label: text written after `label=`
        prefix: str, first part of the generated node name
        description: str, free-form text kept on the node
        attrs: extra dot attributes of the node
        '''
        self.label = label
        self.name = "%s_%d" % (prefix, Node.counter)
        self.description = description
        self.attrs = attrs
        Node.counter += 1

    def __str__(self):
        # Attributes, when present, follow the label as ",key=value" items.
        if self.attrs:
            rendered = ["%s=%s" % (key, crepr(value))
                        for key, value in self.attrs.items()]
            extra = ',' + ','.join(rendered)
        else:
            extra = ""
        return '{name} [label={label} {extra} ];'.format(
            name=self.name, label=self.label, extra=extra)
class Edge(object):
    '''
    A directed dot edge between two nodes.
    '''

    def __init__(self, source, target, **attrs):
        '''
        Link source to target.
        :param source: Node
        :param target: Node
        :param attrs: dict of extra dot attributes of the edge
        '''
        self.source = source
        self.target = target
        self.attrs = attrs

    def __str__(self):
        # Attributes, when present, are rendered as a bracketed list.
        if not self.attrs:
            extra = ""
        else:
            rendered = ["{}={}".format(key, crepr(value))
                        for key, value in self.attrs.items()]
            extra = "[" + ','.join(rendered) + "]"
        return "{source} -> {target} {extra}".format(
            source=self.source.name, target=self.target.name, extra=extra)
class GraphPreviewGenerator(object):
    '''
    Builds a styled Graph out of operator, parameter and argument nodes and
    renders it through Graph.compile / Graph.show.
    '''

    def __init__(self, title):
        # init graphviz graph
        self.graph = Graph(
            title,
            layout="dot",
            concentrate="true",
            rankdir="TB", )

        # One rank group per node category.
        self.op_rank = self.graph.rank_group('same', 2)
        self.param_rank = self.graph.rank_group('same', 1)
        self.arg_rank = self.graph.rank_group('same', 0)

    def __call__(self, path='temp.dot', show=False):
        '''Write and render the graph; with show=True also open the result.'''
        if not show:
            self.graph.compile(path)
        else:
            self.graph.show(path)

    def add_param(self, name, data_type, shape, highlight=False):
        '''
        Add a parameter node whose label is an HTML-like table holding the
        name, the data type and the shape (a list of strings joined by 'x').
        '''
        label = '\n'.join([
            '<<table cellpadding="5">',
            ' <tr>',
            ' <td bgcolor="#2b787e">',
            ' <b>',
            name,
            ' </b>',
            ' </td>',
            ' </tr>',
            ' <tr>',
            ' <td>',
            str(data_type),
            ' </td> </tr>',
            ' <tr>',
            ' <td>',
            '[%s]' % 'x'.join(shape),
            ' </td> </tr>',
            '</table>>',
        ])

        return self.graph.node(
            label,
            prefix="param",
            description=name,
            shape="none",
            style="rounded,filled,bold",
            width="1.3",
            color="#148b97" if not highlight else "orange",
            fontcolor="#ffffff",
            fontname="Arial")

    def add_op(self, opType, **kwargs):
        '''Add an operator node; only the `highlight` keyword is consulted.'''
        highlight = kwargs.pop('highlight', False)
        return self.graph.node(
            "<<B>%s</B>>" % opType,
            prefix="op",
            description=opType,
            shape="box",
            style="rounded, filled, bold",
            color="#303A3A" if not highlight else "orange",
            fontname="Arial",
            fontcolor="#ffffff",
            width="1.3",
            height="0.84", )

    def add_arg(self, name, highlight=False):
        '''Add a plain argument (non-parameter variable) node.'''
        return self.graph.node(
            crepr(name),
            prefix="arg",
            description=name,
            shape="box",
            style="rounded,filled,bold",
            fontname="Arial",
            fontcolor="#999999",
            color="#dddddd" if not highlight else "orange")

    def add_edge(self, source, target, **kwargs):
        '''
        Add an edge; `highlight` selects the colour and the remaining keyword
        arguments are forwarded as edge attributes.
        '''
        highlight = kwargs.pop('highlight', False)
        return self.graph.edge(
            source,
            target,
            # Six hex digits: the former "#00000" was not a valid RGB colour.
            color="#000000" if not highlight else "orange",
            **kwargs)
...@@ -1485,7 +1485,9 @@ def batch_norm(input, ...@@ -1485,7 +1485,9 @@ def batch_norm(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_layout='NCHW', data_layout='NCHW',
name=None): name=None,
moving_mean_name=None,
moving_variance_name=None):
""" """
This function helps create an operator to implement This function helps create an operator to implement
the BatchNorm layer using the configurations from the input parameters. the BatchNorm layer using the configurations from the input parameters.
...@@ -1515,6 +1517,7 @@ def batch_norm(input, ...@@ -1515,6 +1517,7 @@ def batch_norm(input,
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_global_variable( mean = helper.create_global_variable(
name=moving_mean_name,
dtype=input.dtype, dtype=input.dtype,
shape=param_shape, shape=param_shape,
persistable=True, persistable=True,
...@@ -1522,6 +1525,7 @@ def batch_norm(input, ...@@ -1522,6 +1525,7 @@ def batch_norm(input,
helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
variance = helper.create_global_variable( variance = helper.create_global_variable(
name=moving_variance_name,
dtype=input.dtype, dtype=input.dtype,
shape=param_shape, shape=param_shape,
persistable=True, persistable=True,
......
...@@ -59,6 +59,7 @@ __all__ = [ ...@@ -59,6 +59,7 @@ __all__ = [
'elementwise_pow', 'elementwise_pow',
'clip', 'clip',
'clip_by_norm', 'clip_by_norm',
'softmax',
'sequence_softmax', 'sequence_softmax',
] + __activations__ ] + __activations__
......
...@@ -295,7 +295,7 @@ def fill_constant_batch_size_like(input, ...@@ -295,7 +295,7 @@ def fill_constant_batch_size_like(input,
return out return out
def ones(shape, dtype): def ones(shape, dtype, force_cpu=False):
""" """
**ones** **ones**
...@@ -319,7 +319,7 @@ def ones(shape, dtype): ...@@ -319,7 +319,7 @@ def ones(shape, dtype):
return fill_constant(value=1.0, **locals()) return fill_constant(value=1.0, **locals())
def zeros(shape, dtype): def zeros(shape, dtype, force_cpu=False):
""" """
**zeros** **zeros**
......
...@@ -31,7 +31,7 @@ dtype_to_size = { ...@@ -31,7 +31,7 @@ dtype_to_size = {
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num): def __init__(self, Program, ops, forward_num, skip_opt):
self._program = Program self._program = Program
self._ops = ops self._ops = ops
self._forward_num = forward_num self._forward_num = forward_num
...@@ -41,6 +41,7 @@ class ControlFlowGraph(object): ...@@ -41,6 +41,7 @@ class ControlFlowGraph(object):
self._defs = defaultdict(set) self._defs = defaultdict(set)
self._live_in = defaultdict(set) self._live_in = defaultdict(set)
self._live_out = defaultdict(set) self._live_out = defaultdict(set)
self._skip_opt = skip_opt
def _add_connections(self, connections): def _add_connections(self, connections):
for node1, node2 in connections: for node1, node2 in connections:
...@@ -130,6 +131,10 @@ class ControlFlowGraph(object): ...@@ -130,6 +131,10 @@ class ControlFlowGraph(object):
block_desc, x, block_desc, x,
is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
return False return False
if x in self._skip_opt:
return False
if not self._find_var(block_desc, x, is_forward).shape():
return False
return True return True
self._build_graph() self._build_graph()
...@@ -140,6 +145,7 @@ class ControlFlowGraph(object): ...@@ -140,6 +145,7 @@ class ControlFlowGraph(object):
if op.type() == "while" or op.type() == "while_grad": if op.type() == "while" or op.type() == "while_grad":
continue continue
block_desc = op.block() block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num is_forward = i < self._forward_num
if self.pool: if self.pool:
defs_can_optimize = filter( defs_can_optimize = filter(
...@@ -197,28 +203,32 @@ def get_cfgs(input_program): ...@@ -197,28 +203,32 @@ def get_cfgs(input_program):
block_desc = pdesc.block(0) block_desc = pdesc.block(0)
op_size = block_desc.op_size() op_size = block_desc.op_size()
# Get global block ops # Get global block ops
ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size)) ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set()))
while_sub_block_ids = [] while_sub_block_ids = []
while_grad_sub_block_ids = [] while_grad_sub_block_ids = []
while_pair = [] while_op_output = set()
while_block_id_pair = []
for i in range(op_size): for i in range(op_size):
op = block_desc.op(i) op = block_desc.op(i)
if op.type() == "while": if op.type() == "while":
while_sub_block_ids.append(op.attr("sub_block").id) while_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
elif op.type() == "while_grad": elif op.type() == "while_grad":
while_grad_sub_block_ids.append(op.attr("sub_block").id) while_grad_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
# Find while/while_grad block pair # Find while/while_grad block pair
for grad_id in while_grad_sub_block_ids: for grad_id in while_grad_sub_block_ids:
parent_id = pdesc.block(grad_id).parent parent_id = pdesc.block(grad_id).parent
if parent_id in while_sub_block_ids: if parent_id in while_sub_block_ids:
while_pair.append((parent_id, grad_id)) while_block_id_pair.append((parent_id, grad_id))
while_sub_block_ids.remove(parent_id) while_sub_block_ids.remove(parent_id)
# Get while/while_grad block ops # Get while/while_grad block ops
for parent_id, grad_id in while_pair: for parent_id, grad_id in while_block_id_pair:
while_block_ops = [] while_block_ops = []
while_block = pdesc.block(parent_id) while_block = pdesc.block(parent_id)
while_block_op_size = while_block.op_size() while_block_op_size = while_block.op_size()
...@@ -230,7 +240,7 @@ def get_cfgs(input_program): ...@@ -230,7 +240,7 @@ def get_cfgs(input_program):
for i in range(while_grad_block_op_size): for i in range(while_grad_block_op_size):
while_block_ops.append(while_grad_block.op(i)) while_block_ops.append(while_grad_block.op(i))
ops_list.append((while_block_ops, while_block_op_size)) ops_list.append((while_block_ops, while_block_op_size, while_op_output))
# Process rest while block ops # Process rest while block ops
for parent_id in while_sub_block_ids: for parent_id in while_sub_block_ids:
...@@ -242,7 +252,7 @@ def get_cfgs(input_program): ...@@ -242,7 +252,7 @@ def get_cfgs(input_program):
ops_list.append((while_block_ops, while_block_op_size)) ops_list.append((while_block_ops, while_block_op_size))
cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list] cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
return cfgs return cfgs
......
recognize_digits_*.inference.model
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
list(REMOVE_ITEM TEST_OPS test_recognize_digits)
py_test(test_recognize_digits_mlp_cpu
SRCS test_recognize_digits.py
ARGS mlp)
py_test(test_recognize_digits_mlp_cuda
SRCS test_recognize_digits.py
ARGS mlp --use_cuda)
py_test(test_recognize_digits_conv_cpu
SRCS test_recognize_digits.py
ARGS conv)
py_test(test_recognize_digits_conv_cuda
SRCS test_recognize_digits.py
ARGS conv --use_cuda)
py_test(test_recognize_digits_mlp_cpu_parallel
SRCS test_recognize_digits.py
ARGS mlp --parallel)
py_test(test_recognize_digits_mlp_cuda_parallel
SRCS test_recognize_digits.py
ARGS mlp --use_cuda --parallel)
py_test(test_recognize_digits_conv_cpu_parallel
SRCS test_recognize_digits.py
ARGS conv --parallel)
py_test(test_recognize_digits_conv_cuda_parallel
SRCS test_recognize_digits.py
ARGS conv --use_cuda --parallel)
# default test # default test
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
......
...@@ -11,21 +11,20 @@ ...@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import contextlib
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as pd import paddle.v2.fluid.layers as pd
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import unittest
dict_size = 30000 dict_size = 30000
source_dict_dim = target_dict_dim = dict_size source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32 hidden_dim = 32
word_dim = 16 word_dim = 16
IS_SPARSE = True
batch_size = 2 batch_size = 2
max_length = 8 max_length = 8
topk_size = 50 topk_size = 50
...@@ -34,10 +33,8 @@ beam_size = 2 ...@@ -34,10 +33,8 @@ beam_size = 2
decoder_size = hidden_dim decoder_size = hidden_dim
place = core.CPUPlace()
def encoder(is_sparse):
def encoder():
# encoder # encoder
src_word_id = pd.data( src_word_id = pd.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1) name="src_word_id", shape=[1], dtype='int64', lod_level=1)
...@@ -45,7 +42,7 @@ def encoder(): ...@@ -45,7 +42,7 @@ def encoder():
input=src_word_id, input=src_word_id,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
...@@ -54,7 +51,7 @@ def encoder(): ...@@ -54,7 +51,7 @@ def encoder():
return encoder_out return encoder_out
def decoder_train(context): def decoder_train(context, is_sparse):
# decoder # decoder
trg_language_word = pd.data( trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1) name="target_language_word", shape=[1], dtype='int64', lod_level=1)
...@@ -62,7 +59,7 @@ def decoder_train(context): ...@@ -62,7 +59,7 @@ def decoder_train(context):
input=trg_language_word, input=trg_language_word,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
rnn = pd.DynamicRNN() rnn = pd.DynamicRNN()
...@@ -82,10 +79,10 @@ def decoder_train(context): ...@@ -82,10 +79,10 @@ def decoder_train(context):
return rnn() return rnn()
def decoder_decode(context): def decoder_decode(context, is_sparse):
init_state = context init_state = context
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
counter = pd.zeros(shape=[1], dtype='int64') counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
# fill the first element with init_state # fill the first element with init_state
state_array = pd.create_array('float32') state_array = pd.create_array('float32')
...@@ -117,7 +114,7 @@ def decoder_decode(context): ...@@ -117,7 +114,7 @@ def decoder_decode(context):
input=pre_ids, input=pre_ids,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE) is_sparse=is_sparse)
# use rnn unit to update rnn # use rnn unit to update rnn
current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
...@@ -150,7 +147,7 @@ def decoder_decode(context): ...@@ -150,7 +147,7 @@ def decoder_decode(context):
def set_init_lod(data, lod, place): def set_init_lod(data, lod, place):
res = core.LoDTensor() res = fluid.LoDTensor()
res.set(data, place) res.set(data, place)
res.set_lod(lod) res.set_lod(lod)
return res return res
...@@ -165,15 +162,19 @@ def to_lodtensor(data, place): ...@@ -165,15 +162,19 @@ def to_lodtensor(data, place):
lod.append(cur_len) lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1]) flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor() res = fluid.LoDTensor()
res.set(flattened_data, place) res.set(flattened_data, place)
res.set_lod([lod]) res.set_lod([lod])
return res return res
def train_main(): def train_main(use_cuda, is_sparse):
context = encoder() if use_cuda and not fluid.core.is_compiled_with_cuda():
rnn_out = decoder_train(context) return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
rnn_out = decoder_train(context, is_sparse)
label = pd.data( label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = pd.cross_entropy(input=rnn_out, label=label) cost = pd.cross_entropy(input=rnn_out, label=label)
...@@ -212,9 +213,13 @@ def train_main(): ...@@ -212,9 +213,13 @@ def train_main():
batch_id += 1 batch_id += 1
def decode_main(): def decode_main(use_cuda, is_sparse):
context = encoder() if use_cuda and not fluid.core.is_compiled_with_cuda():
translation_ids, translation_scores = decoder_decode(context) return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
translation_ids, translation_scores = decoder_decode(context, is_sparse)
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -250,6 +255,60 @@ def decode_main(): ...@@ -250,6 +255,60 @@ def decode_main():
break break
class TestMachineTranslation(unittest.TestCase):
    # Intentionally empty: the test_* methods are attached at import time by
    # inject_test_train and inject_test_decode via setattr.
    pass
@contextlib.contextmanager
def scope_prog_guard():
    """Run the ``with`` body inside a newly created scope and a newly created
    pair of main/startup programs."""
    main_program = fluid.Program()
    init_program = fluid.Program()
    fresh_scope = fluid.core.Scope()
    # The scope guard is entered first, then the program guard, and they are
    # released in the reverse order when the body finishes.
    with fluid.scope_guard(fresh_scope), \
            fluid.program_guard(main_program, init_program):
        yield
def inject_test_train(use_cuda, is_sparse):
    """Attach one training test method to TestMachineTranslation.

    The method is named ``test_<cpu|cuda>_<dense|sparse>_train`` and calls
    ``train_main(use_cuda, is_sparse)`` inside ``scope_prog_guard()``.
    """
    device_tag = 'cuda' if use_cuda else 'cpu'
    sparsity_tag = 'sparse' if is_sparse else 'dense'
    method_name = 'test_{0}_{1}_train'.format(device_tag, sparsity_tag)

    def run_case(*args):
        with scope_prog_guard():
            train_main(use_cuda, is_sparse)

    setattr(TestMachineTranslation, method_name, run_case)
def inject_test_decode(use_cuda, is_sparse, decorator=None):
    """Attach one decoding test method to TestMachineTranslation.

    The method is named ``test_<cpu|cuda>_<dense|sparse>_decode`` and calls
    ``decode_main(use_cuda, is_sparse)`` inside ``scope_prog_guard()``.
    When ``decorator`` is given, it is applied to the generated function
    before the function is attached to the class.
    """
    device_tag = 'cuda' if use_cuda else 'cpu'
    sparsity_tag = 'sparse' if is_sparse else 'dense'
    method_name = 'test_{0}_{1}_decode'.format(device_tag, sparsity_tag)

    def run_case(*args):
        with scope_prog_guard():
            decode_main(use_cuda, is_sparse)

    wrapped = run_case if decorator is None else decorator(run_case)
    setattr(TestMachineTranslation, method_name, wrapped)
# Register one train test and one decode test for every (device, sparsity)
# combination. Decode tests that target CUDA are registered as skipped.
for _use_cuda_ in (False, True):
    for _is_sparse_ in (False, True):
        inject_test_train(_use_cuda_, _is_sparse_)

        if _use_cuda_:
            _decorator_ = unittest.skip(
                reason='Beam Search does not support CUDA!')
        else:
            _decorator_ = None
        inject_test_decode(
            use_cuda=_use_cuda_,
            is_sparse=_is_sparse_,
            decorator=_decorator_)
if __name__ == '__main__': if __name__ == '__main__':
# train_main() unittest.main()
decode_main()
...@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid ...@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
import sys import sys
import numpy import numpy
import unittest
def parse_arg(): def parse_arg():
...@@ -74,18 +75,18 @@ def conv_net(img, label): ...@@ -74,18 +75,18 @@ def conv_net(img, label):
return loss_net(conv_pool_2, label) return loss_net(conv_pool_2, label)
def train(args, save_dirname=None): def train(nn_type, use_cuda, parallel, save_dirname):
print("recognize digits with args: {0}".format(" ".join(sys.argv[1:]))) if use_cuda and not fluid.core.is_compiled_with_cuda():
return
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.nn_type == 'mlp': if nn_type == 'mlp':
net_conf = mlp net_conf = mlp
else: else:
net_conf = conv_net net_conf = conv_net
if args.parallel: if parallel:
places = fluid.layers.get_places() places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places) pd = fluid.layers.ParallelDo(places)
with pd.do(): with pd.do():
...@@ -107,7 +108,7 @@ def train(args, save_dirname=None): ...@@ -107,7 +108,7 @@ def train(args, save_dirname=None):
optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -147,13 +148,14 @@ def train(args, save_dirname=None): ...@@ -147,13 +148,14 @@ def train(args, save_dirname=None):
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1, format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val))) float(avg_loss_val), float(acc_val)))
raise AssertionError("Loss of recognize digits is too large")
def infer(args, save_dirname=None): def infer(use_cuda, save_dirname=None):
if save_dirname is None: if save_dirname is None:
return return
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc, # Use fluid.io.load_inference_model to obtain the inference program desc,
...@@ -174,11 +176,48 @@ def infer(args, save_dirname=None): ...@@ -174,11 +176,48 @@ def infer(args, save_dirname=None):
print("infer results: ", results[0]) print("infer results: ", results[0])
if __name__ == '__main__': def main(use_cuda, parallel, nn_type):
args = parse_arg() if not use_cuda and not parallel:
if not args.use_cuda and not args.parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model"
save_dirname = "recognize_digits_" + args.nn_type + ".inference.model"
else: else:
save_dirname = None save_dirname = None
train(args, save_dirname)
infer(args, save_dirname) train(
nn_type=nn_type,
use_cuda=use_cuda,
parallel=parallel,
save_dirname=save_dirname)
infer(use_cuda=use_cuda, save_dirname=save_dirname)
class TestRecognizeDigits(unittest.TestCase):
    # Intentionally empty: the test_* methods are attached at import time by
    # inject_test_method via setattr.
    pass
def inject_test_method(use_cuda, parallel, nn_type):
    """Attach one test method to TestRecognizeDigits.

    The method is named ``test_<nn_type>_<cpu|cuda>_<normal|parallel>`` and
    calls ``main(use_cuda, parallel, nn_type)`` inside a newly created scope
    and a newly created pair of main/startup programs.
    """
    device_tag = 'cuda' if use_cuda else 'cpu'
    mode_tag = 'parallel' if parallel else 'normal'
    method_name = 'test_{0}_{1}_{2}'.format(nn_type, device_tag, mode_tag)

    def run_case(self):
        main_program = fluid.Program()
        init_program = fluid.Program()
        fresh_scope = fluid.core.Scope()
        with fluid.scope_guard(fresh_scope), \
                fluid.program_guard(main_program, init_program):
            main(use_cuda, parallel, nn_type)

    setattr(TestRecognizeDigits, method_name, run_case)
def inject_all_tests():
    """Register a test method for every (use_cuda, parallel, nn_type)
    combination."""
    combinations = [(cuda_flag, parallel_flag, net_name)
                    for cuda_flag in (False, True)
                    for parallel_flag in (False, True)
                    for net_name in ('mlp', 'conv')]
    for cuda_flag, parallel_flag, net_name in combinations:
        inject_test_method(cuda_flag, parallel_flag, net_name)
# Populate TestRecognizeDigits at import time, before any test runner
# inspects the class.
inject_all_tests()
# Run the generated test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod): ...@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod):
return match_indices, match_dist return match_indices, match_dist
class TestBipartiteMatchOpForWithLoD(OpTest): class TestBipartiteMatchOpWithLoD(OpTest):
def setUp(self): def setUp(self):
self.op_type = 'bipartite_match' self.op_type = 'bipartite_match'
lod = [[0, 5, 11, 23]] lod = [[0, 5, 11, 23]]
...@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest): ...@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest):
self.inputs = {'DistMat': (dist, lod)} self.inputs = {'DistMat': (dist, lod)}
self.outputs = { self.outputs = {
'ColToRowMatchIndices': (match_indices), 'ColToRowMatchIndices': (match_indices),
'ColToRowMatchDis': (match_dist), 'ColToRowMatchDist': (match_dist),
} }
def test_check_output(self): def test_check_output(self):
...@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): ...@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
self.inputs = {'DistMat': dist} self.inputs = {'DistMat': dist}
self.outputs = { self.outputs = {
'ColToRowMatchIndices': match_indices, 'ColToRowMatchIndices': match_indices,
'ColToRowMatchDis': match_dist, 'ColToRowMatchDist': match_dist,
} }
def test_check_output(self): def test_check_output(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
    """NumPy reference for box encoding/decoding; writes into ``output_box``.

    Args:
        target_box: 2-D array whose columns 0..3 are read as
            (xmin, ymin, xmax, ymax) when encoding, or as four encoded
            offsets when decoding.
        prior_box: 2-D array whose columns 0..3 are read as
            (xmin, ymin, xmax, ymax).
        prior_box_var: 2-D array, reshaped to
            (1, prior_box_var.shape[0], prior_box_var.shape[1]); columns
            0..3 scale the four encoded components.
        output_box: 3-D array, filled in place along its last axis.
        code_type: "EncodeCenterSize" or "DecodeCenterSize". Any other value
            leaves ``output_box`` untouched.

    Returns:
        None. The result is stored in ``output_box``.
    """
    row_shape = (1, prior_box.shape[0])
    # Centre and size of every prior box, laid out as a single row so they
    # broadcast against per-target columns.
    prior_cx = ((prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(row_shape)
    prior_cy = ((prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(row_shape)
    prior_w = (prior_box[:, 2] - prior_box[:, 0]).reshape(row_shape)
    prior_h = (prior_box[:, 3] - prior_box[:, 1]).reshape(row_shape)
    variance = prior_box_var.reshape(1, prior_box_var.shape[0],
                                     prior_box_var.shape[1])

    if code_type == "EncodeCenterSize":
        col_shape = (target_box.shape[0], 1)
        tgt_cx = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(col_shape)
        tgt_cy = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(col_shape)
        tgt_w = (target_box[:, 2] - target_box[:, 0]).reshape(col_shape)
        tgt_h = (target_box[:, 3] - target_box[:, 1]).reshape(col_shape)

        # Centre offsets relative to the prior size, and log size ratios,
        # each divided by the matching variance component.
        output_box[:, :, 0] = (tgt_cx - prior_cx) / prior_w / variance[:, :, 0]
        output_box[:, :, 1] = (tgt_cy - prior_cy) / prior_h / variance[:, :, 1]
        output_box[:, :, 2] = \
            np.log(np.fabs(tgt_w / prior_w)) / variance[:, :, 2]
        output_box[:, :, 3] = \
            np.log(np.fabs(tgt_h / prior_h)) / variance[:, :, 3]
    elif code_type == "DecodeCenterSize":
        offsets = target_box.reshape(target_box.shape[0], 1,
                                     target_box.shape[1])
        dec_cx = variance[:, :, 0] * offsets[:, :, 0] * prior_w + prior_cx
        dec_cy = variance[:, :, 1] * offsets[:, :, 1] * prior_h + prior_cy
        dec_w = np.exp(variance[:, :, 2] * offsets[:, :, 2]) * prior_w
        dec_h = np.exp(variance[:, :, 3] * offsets[:, :, 3]) * prior_h

        # Convert centre/size back to corner coordinates.
        output_box[:, :, 0] = dec_cx - dec_w / 2
        output_box[:, :, 1] = dec_cy - dec_h / 2
        output_box[:, :, 2] = dec_cx + dec_w / 2
        output_box[:, :, 3] = dec_cy + dec_h / 2
def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
    """Apply ``box_coder`` to each LoD segment of ``target_box``.

    Args:
        prior_box: array whose first dimension is the number of prior boxes.
        prior_box_var: passed through to ``box_coder`` unchanged.
        target_box: array whose first dimension is the number of target boxes.
        lod: sequence of offsets; consecutive pairs delimit row ranges of
            ``target_box``.
        code_type: passed through to ``box_coder`` unchanged.

    Returns:
        A float32 array of shape
        (target_box.shape[0], prior_box.shape[0], 4). Rows not covered by
        any LoD segment stay zero.
    """
    num_target = target_box.shape[0]
    num_prior = prior_box.shape[0]
    result = np.zeros((num_target, num_prior, 4), dtype=np.float32)
    for seg in range(len(lod) - 1):
        begin, end = lod[seg], lod[seg + 1]
        # The slice of ``result`` is a view, so box_coder fills it in place.
        box_coder(target_box[begin:end, :], prior_box, prior_box_var,
                  result[begin:end, :, :], code_type)
    return result
class TestBoxCoderOp(OpTest):
    """Checks the 'box_coder' op in decode mode against the NumPy reference,
    with all 20 target boxes in a single LoD segment."""

    def setUp(self):
        self.op_type = "box_coder"
        offsets = [[0, 20]]
        # Keep the three random draws in this order so the generated data
        # stays the same for a given RNG state.
        priors = np.random.random((10, 4)).astype('float32')
        prior_vars = np.random.random((10, 4)).astype('float32')
        targets = np.random.random((20, 4)).astype('float32')
        expected = batch_box_coder(priors, prior_vars, targets, offsets[0],
                                   "DecodeCenterSize")

        self.inputs = {
            'PriorBox': priors,
            'PriorBoxVar': prior_vars,
            'TargetBox': targets,
        }
        self.attrs = {'code_type': 'decode_center_size'}
        self.outputs = {'OutputBox': expected}

    def test_check_output(self):
        self.check_output()
class TestBoxCoderOpWithLoD(OpTest):
    """Checks the 'box_coder' op in encode mode against the NumPy reference,
    with the 20 target boxes split into three LoD segments."""

    def setUp(self):
        self.op_type = "box_coder"
        offsets = [[0, 4, 12, 20]]
        # Keep the three random draws in this order so the generated data
        # stays the same for a given RNG state.
        priors = np.random.random((10, 4)).astype('float32')
        prior_vars = np.random.random((10, 4)).astype('float32')
        targets = np.random.random((20, 4)).astype('float32')
        expected = batch_box_coder(priors, prior_vars, targets, offsets[0],
                                   "EncodeCenterSize")

        self.inputs = {
            'PriorBox': priors,
            'PriorBoxVar': prior_vars,
            # The LoD is passed to the op together with the tensor data.
            'TargetBox': (targets, offsets),
        }
        self.attrs = {'code_type': 'encode_center_size'}
        self.outputs = {'OutputBox': expected}

    def test_check_output(self):
        self.check_output()
# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase): ...@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.sequence_softmax(x=seq)) self.assertIsNotNone(layers.sequence_softmax(x=seq))
print(str(program)) print(str(program))
def test_softmax(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[10], dtype='float32')
hid = layers.fc(input=data, size=20)
self.assertIsNotNone(layers.softmax(x=hid))
print(str(program))
def test_get_places(self): def test_get_places(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
class TestMineHardExamplesOp(OpTest):
    """Checks the 'mine_hard_examples' op with the "max_negative" mining
    type. Subclasses override ``init_test_data`` to supply other fixtures."""

    def setUp(self):
        self.op_type = "mine_hard_examples"
        self.set_data()

    def init_test_data(self):
        # Attributes of the op.
        self.mining_type = "max_negative"
        self.neg_pos_ratio = 1.0
        self.neg_overlap = 0.5
        self.sample_size = 0

        # Input tensors: two rows, three columns each.
        self.cls_loss = np.array([[0.1, 0.1, 0.3],
                                  [0.3, 0.1, 0.1]]).astype('float32')
        self.loc_loss = np.array([[0.1, 0.2, 0.3],
                                  [0.3, 0.4, 0.1]]).astype('float32')
        self.match_dis = np.array([[0.2, 0.4, 0.8],
                                   [0.1, 0.9, 0.3]]).astype('float32')
        self.match_indices = np.array([[0, -1, -1],
                                       [-1, 0, -1]]).astype('int32')

        # Expected outputs; the updated indices are the same array object as
        # the input indices in this case.
        self.updated_match_indices = self.match_indices
        self.neg_indices_lod = [[0, 1, 2]]
        self.neg_indices = np.array([[1], [0]]).astype('int32')

    def set_data(self):
        self.init_test_data()
        self.inputs = {
            'ClsLoss': self.cls_loss,
            'LocLoss': self.loc_loss,
            'MatchIndices': self.match_indices,
            'MatchDist': self.match_dis
        }
        self.attrs = {
            'neg_pos_ratio': self.neg_pos_ratio,
            'neg_overlap': self.neg_overlap,
            'sample_size': self.sample_size,
            'mining_type': self.mining_type
        }
        self.outputs = {
            'NegIndices': (self.neg_indices, self.neg_indices_lod),
            'UpdatedMatchIndices': self.updated_match_indices
        }

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        # No gradient check is performed for this op.
        return
class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
    """Same check as the base class, with the "hard_example" mining type and
    a sample size of 2."""

    def init_test_data(self):
        # Start from the base fixture, then override what differs.
        super(TestMineHardExamplesOpHardExample, self).init_test_data()
        self.mining_type = "hard_example"
        self.sample_size = 2

        self.cls_loss = np.array([[0.5, 0.1, 0.3],
                                  [0.3, 0.1, 0.1]]).astype('float32')
        self.loc_loss = np.array([[0.2, 0.2, 0.3],
                                  [0.3, 0.1, 0.2]]).astype('float32')
        self.match_indices = np.array([[0, -1, -1],
                                       [-1, 0, -1]]).astype('int32')

        # Expected outputs: the match in the second row is expected to be
        # reset to -1.
        self.updated_match_indices = np.array([[0, -1, -1],
                                               [-1, -1, -1]]).astype('int32')
        self.neg_indices_lod = [[0, 1, 3]]
        self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import unittest
import numpy as np
import copy
from op_test import OpTest
def iou(box_a, box_b):
    """Compute the intersection-over-union overlap of two boxes.

    Args:
        box_a: indexable with four coordinates (x0, y0, x1, y1). The two
            corners may be given in either order; they are normalised with
            min/max before use.
        box_b: same layout as ``box_a``.

    Returns:
        ``intersection_area / union_area``, or 0.0 when both boxes have
        non-positive area.
    """
    xmin_a = min(box_a[0], box_a[2])
    ymin_a = min(box_a[1], box_a[3])
    xmax_a = max(box_a[0], box_a[2])
    ymax_a = max(box_a[1], box_a[3])

    xmin_b = min(box_b[0], box_b[2])
    ymin_b = min(box_b[1], box_b[3])
    xmax_b = max(box_b[0], box_b[2])
    ymax_b = max(box_b[1], box_b[3])

    area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
    area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
    # Two degenerate boxes would make the union zero; report no overlap.
    if area_a <= 0 and area_b <= 0:
        return 0.0

    # Corners of the intersection rectangle.
    xa = max(xmin_a, xmin_b)
    ya = max(ymin_a, ymin_b)
    xb = min(xmax_a, xmax_b)
    yb = min(ymax_a, ymax_b)

    # Clamp at zero so disjoint boxes give an empty intersection.
    inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)

    iou_ratio = inter_area / (area_a + area_b - inter_area)
    return iou_ratio
def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        score_threshold: (float) The confidence thresh for filtering low
            confidence boxes.
        nms_threshold: (float) The overlap thresh for suppressing unnecessary
            boxes.
        top_k: (int) The maximum number of box preds to consider. A value
            of -1 or lower disables the limit.
        eta: (float) The parameter for adaptive NMS.
    Return:
        A list with the indices of the kept boxes with respect to
        num_priors, ordered by descending score.
    """
    # flatten() always returns a copy, so the caller's array is not modified.
    all_scores = np.asarray(scores).flatten()

    # Positions in the original score vector that pass the threshold.
    candidate_indices = np.argwhere(all_scores > score_threshold).flatten()
    candidate_scores = all_scores[candidate_indices]

    # Stable descending sort of the surviving scores.
    order = np.argsort(-candidate_scores, axis=0, kind='mergesort')
    sorted_scores = candidate_scores[order]
    # `order` indexes the filtered array; map it back through
    # `candidate_indices` so that `boxes[idx]` and the returned values refer
    # to the original priors.
    sorted_indices = candidate_indices[order]
    if top_k > -1 and top_k < sorted_indices.shape[0]:
        sorted_indices = sorted_indices[:top_k]
        sorted_scores = sorted_scores[:top_k]

    selected_indices = []
    adaptive_threshold = nms_threshold
    for i in range(sorted_scores.shape[0]):
        idx = sorted_indices[i]
        keep = True
        # Drop the candidate as soon as it overlaps too much with any box
        # that has already been kept.
        for kept_idx in selected_indices:
            overlap = iou(boxes[idx], boxes[kept_idx])
            keep = overlap <= adaptive_threshold
            if not keep:
                break
        if keep:
            selected_indices.append(idx)
        # Adaptive NMS: tighten the threshold after each kept box.
        if keep and eta < 1 and adaptive_threshold > 0.5:
            adaptive_threshold *= eta
    return selected_indices
def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
                   nms_top_k, keep_top_k):
    """Run ``nms`` for every non-background class of one image and optionally
    keep only the highest-scoring detections overall.

    Args:
        boxes: box coordinates, passed to ``nms`` unchanged.
        scores: array indexed as ``scores[class][prior]``.
        background: class label that is skipped.
        score_threshold: passed to ``nms``.
        nms_threshold: passed to ``nms``.
        nms_top_k: passed to ``nms`` as its ``top_k``.
        keep_top_k: maximum number of detections kept over all classes; a
            value of -1 or lower disables the limit.

    Returns:
        A tuple ``(selected_indices, num_det)``. ``selected_indices`` maps a
        class label to the list of kept prior indices; ``num_det`` is the
        total number of kept detections.
    """
    class_num = scores.shape[0]

    selected_indices = {}
    num_det = 0
    for c in range(class_num):
        if c == background:
            continue
        indices = nms(boxes, scores[c], score_threshold, nms_threshold,
                      nms_top_k)
        selected_indices[c] = indices
        num_det += len(indices)

    if keep_top_k > -1 and num_det > keep_top_k:
        # Walk the classes in ascending label order so the result does not
        # depend on dict iteration order. items() works on Python 2 and 3.
        score_index = [(scores[c][idx], c, idx)
                       for c, indices in sorted(selected_indices.items())
                       for idx in indices]

        sorted_score_index = sorted(
            score_index, key=lambda tup: tup[0], reverse=True)
        sorted_score_index = sorted_score_index[:keep_top_k]

        # Regroup the surviving detections by class.
        selected_indices = {}
        for _, c, idx in sorted_score_index:
            selected_indices.setdefault(c, []).append(idx)
        num_det = keep_top_k

    return selected_indices, num_det
def batched_multiclass_nms(boxes, scores, background, score_threshold,
                           nms_threshold, nms_top_k, keep_top_k):
    """Run ``multiclass_nms`` on every image of a batch.

    Args:
        boxes: box coordinates shared by all images; ``boxes[idx]`` must
            unpack into (xmin, ymin, xmax, ymax).
        scores: array indexed as ``scores[image][class][prior]``.
        background, score_threshold, nms_threshold, nms_top_k, keep_top_k:
            passed to ``multiclass_nms`` unchanged.

    Returns:
        A tuple ``(det_outs, lod)``. ``det_outs`` is a list of
        ``[class, score, xmin, ymin, xmax, ymax]`` rows. ``lod`` is a list of
        cumulative detection counts starting at 0, with one entry appended
        per image.
    """
    batch_size = scores.shape[0]

    det_outs = []
    lod = [0]
    for n in range(batch_size):
        nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
                                               score_threshold, nms_threshold,
                                               nms_top_k, keep_top_k)
        lod.append(lod[-1] + nmsed_num)
        if nmsed_num == 0:
            continue
        # Emit classes in ascending label order so the rows do not depend on
        # dict iteration order. items() works on Python 2 and 3.
        # NOTE(review): presumably the op also emits detections grouped by
        # ascending class label - confirm against the op implementation.
        for c, indices in sorted(nmsed_outs.items()):
            for idx in indices:
                xmin, ymin, xmax, ymax = boxes[idx][:]
                det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
    return det_outs, lod
class TestMulticlassNMSOp(OpTest):
    # Checks the 'multiclass_nms' op against the NumPy reference
    # batched_multiclass_nms defined in this file.

    def set_argument(self):
        # Hook for subclasses to choose a different score threshold.
        self.score_threshold = 0.01

    def setUp(self):
        self.set_argument()
        # N: batch size, M: number of boxes, C: number of classes.
        N = 7
        M = 1200
        C = 21
        BOX_SIZE = 4
        background = 0
        nms_threshold = 0.3
        nms_top_k = 400
        keep_top_k = 200
        score_threshold = self.score_threshold
        scores = np.random.random((N * M, C)).astype('float32')

        def softmax(x):
            # Shift by the maximum (clipped to be at least -64) before
            # exponentiating.
            shiftx = x - np.max(x).clip(-64.)
            exps = np.exp(shiftx)
            return exps / np.sum(exps)

        # Normalise each row over the class axis, then lay the scores out as
        # (N, C, M), which is how batched_multiclass_nms indexes them.
        scores = np.apply_along_axis(softmax, 1, scores)
        scores = np.reshape(scores, (N, M, C))
        scores = np.transpose(scores, (0, 2, 1))
        boxes = np.random.random((M, BOX_SIZE)).astype('float32')
        # Scale the first two coordinates into [0, 0.5) and the last two into
        # [0.5, 1.0), so the last two are never smaller than the first two.
        boxes[:, 0:2] = boxes[:, 0:2] * 0.5
        boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
        nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
                                                 score_threshold, nms_threshold,
                                                 nms_top_k, keep_top_k)
        # Use [-1] as the expected output when there are no detections.
        nmsed_outs = [-1] if not nmsed_outs else nmsed_outs
        nmsed_outs = np.array(nmsed_outs).astype('float32')
        self.op_type = 'multiclass_nms'
        self.inputs = {'BBoxes': boxes, 'Scores': scores}
        self.outputs = {'Out': (nmsed_outs, [lod])}
        # 'background_label' is written as a literal 0 here; it has the same
        # value as the local `background` used for the reference above.
        self.attrs = {
            'background_label': 0,
            'nms_threshold': nms_threshold,
            'nms_top_k': nms_top_k,
            'keep_top_k': keep_top_k,
            'score_threshold': score_threshold,
            'nms_eta': 1.0,
        }

    def test_check_output(self):
        self.check_output()
class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
    # Variant of the base test in which no box passes the score threshold.

    def set_argument(self):
        # Here set 2.0 to test the case there is no outputs.
        # In practical use, 0.0 < score_threshold < 1.0
        self.score_threshold = 2.0
class TestIOU(unittest.TestCase):
    """Sanity check for the ``iou`` helper on one hand-computed pair."""

    def test_iou(self):
        first = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
        second = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')

        # Intersection area 2 over union area 16.
        expected = np.array([2.0 / 16.0]).astype('float32')
        actual = np.array([iou(first, second)]).astype('float32')
        self.assertTrue(np.allclose(actual, expected))
# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册