diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..98356cd7613baff7f0cd66d1462068232b2b8500 --- /dev/null +++ b/benchmark/cluster/vgg16/Dockerfile @@ -0,0 +1,18 @@ +#FROM python:2.7.14 +FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04 +RUN apt-get update && apt-get install -y python +RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev +# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, +# so we must build one with distribute support to install in this image. +RUN pip install paddlepaddle +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python' +RUN pip uninstall -y paddlepaddle + +# below lines may change a lot for debugging +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl && \ +chmod +x /usr/bin/paddle_k8s +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD vgg16_fluid.py vgg16_v2.py /workspace/ diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..11d00b8f85382aa720c169338c51333b730d44d5 --- /dev/null +++ b/benchmark/cluster/vgg16/README.md @@ -0,0 +1,76 @@ +# Performance for Distributed vgg16 + +## Test Result + +### Hardware Infomation + +- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz +- cpu MHz : 2101.000 +- cache size : 20480 KB + +### Single Node Single Thread + +- PServer Count: 10 +- Trainer Count: 20 +- Metrics: samples / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 | +| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 | +| TensorFlow | - | - | - | - | + +### Different Batch Size + +- PServer Count: 10 +- Trainer Count: 20 +- Per trainer CPU Core: 1 +- Metrics: samples / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 | +| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 | +| TensorFlow | - | - | - | - | + + +### Accelerate Rate + +- Pserver Count: 20 +- Batch Size: 128 +- Metrics: samples / sec + +| Trainer Count | 20 | 40 | 80 | 100 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) | +| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) | +| TensorFlow | - | - | - | - | + +### Different Pserver Count + +- Trainer Count: 60 +- Batch Size: 128 +- Metrics: samples/ sec + +| PServer Count | 3 | 6 |10 | 20 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | +| TensorFlow | - | - | - | - | + +*The performance gap between Fuild and v2 comes from the network interference.* + + +## Steps to Run the Performance Test + +1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. +1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory. +1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it. +1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). +1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. + +Check the logs for the distributed training progress and analyze the performance. + +## Enable Verbos Logs + +Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail. diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee8b0763b62fc011f40f6197e929a68b48a93e47 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid_pserver.yaml @@ -0,0 +1,72 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: MKL_NUM_THREADS + value: "1" + - name: TRAINING_ROLE + value: "PSERVER" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + command: ["paddle_k8s", "start_fluid"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid_trainer.yaml @@ -0,0 +1,69 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_fluid"] + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: TRAINING_ROLE + value: "TRAINER" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd1271e0cf399184134c06b3200ee1202c65cef0 --- /dev/null +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -0,0 +1,64 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16v2job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16v2job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16v2job + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "python train.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + command: ["paddle_k8s", "start_pserver"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12c8964066cbcfe8d2a44de2f51a3d12ea422fe2 --- /dev/null +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -0,0 +1,65 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16v2job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16v2job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_trainer", "v2"] + env: + - name: PADDLE_JOB_NAME + value: vgg16v2job + - name: BATCH_SIZE + value: "256" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "2" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..499e06ec42fc8f840137173628fa465e0541ba30 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -0,0 +1,277 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.profiler as profiler +import argparse +import functools +import os + + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument('--device_id', type=int, default=0, help="The device id.") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--local', + type=str2bool, + default=True, + help='Whether to run as local mode.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + test_target = accuracy.metrics + accuracy.states + inference_program = fluid.io.get_inference_program(test_target) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimize_ops, params_grads = optimizer.minimize(avg_cost) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace( + args.device_id) + exe = fluid.Executor(place) + + # test + def test(exe): + accuracy.reset(exe) + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}) + + return accuracy.eval(exe) + + def train_loop(exe, trainer_prog): + iters = 0 + ts = time.time() + for pass_id in range(args.num_passes): + # train + start_time = time.time() + num_samples = 0 + accuracy.reset(exe) + with profiler.profiler("CPU", 'total') as prof: + for batch_id, data in enumerate(train_reader()): + ts = time.time() + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc = exe.run( + trainer_prog, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost] + accuracy.metrics) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" + % (pass_id, iters, loss, acc, time.time() - ts) + ) # The accuracy is the accumulation of batches, but not the current batch. + + pass_elapsed = time.time() - start_time + pass_train_acc = accuracy.eval(exe) + pass_test_acc = test(exe) + print( + "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" + % (pass_id, num_samples / pass_elapsed, pass_train_acc, + pass_test_acc)) + + if args.local: + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + train_loop(exe, fluid.default_main_program()) + else: + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, "6174"])) + pserver_endpoints = ",".join(eplist) + print("pserver endpoints: ", pserver_endpoints) + trainers = int(os.getenv("TRAINERS")) # total trainer count + print("trainers total: ", trainers) + current_endpoint = os.getenv( + "POD_IP") + ":6174" # current pserver endpoint + training_role = os.getenv( + "TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + pservers=pserver_endpoints, + trainers=trainers) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + print("starting server side startup") + exe.run(pserver_startup) + print("starting parameter server...") + exe.run(pserver_prog) + elif training_role == "TRAINER": + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else + paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + trainer_prog = t.get_trainer_program() + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver + exe.run(fluid.default_startup_program()) + train_loop(exe, trainer_prog) + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + print_arguments() + main() diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac6b3c33252e0a1f596f539fc090c5ada118e15 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_v2.py @@ -0,0 +1,154 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import gzip + +import paddle.v2.dataset.cifar as cifar +import paddle.v2 as paddle +import time +import os + +DATA_DIM = 3 * 32 * 32 +CLASS_DIM = 10 +BATCH_SIZE = os.getenv("BATCH_SIZE") +if BATCH_SIZE: + BATCH_SIZE = int(BATCH_SIZE) +else: + BATCH_SIZE = 128 +print "batch_size", BATCH_SIZE +NODE_COUNT = int(os.getenv("TRAINERS")) +ts = 0 + + +def vgg(input, nums, class_dim): + def conv_block(input, num_filter, groups, num_channels=None): + return paddle.networks.img_conv_group( + input=input, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + pool_type=paddle.pooling.Max()) + + assert len(nums) == 5 + # the channel of input feature is 3 + conv1 = conv_block(input, 64, nums[0], 3) + conv2 = conv_block(conv1, 128, nums[1]) + conv3 = conv_block(conv2, 256, nums[2]) + conv4 = conv_block(conv3, 512, nums[3]) + conv5 = conv_block(conv4, 512, nums[4]) + + fc_dim = 512 + fc1 = paddle.layer.fc(input=conv5, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=fc1, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + out = paddle.layer.fc(input=fc2, + size=class_dim, + act=paddle.activation.Softmax()) + return out + + +def vgg13(input, class_dim): + nums = [2, 2, 2, 2, 2] + return vgg(input, nums, class_dim) + + +def vgg16(input, class_dim): + nums = [2, 2, 3, 3, 3] + return vgg(input, nums, class_dim) + + +def vgg19(input, class_dim): + nums = [2, 2, 4, 4, 4] + return vgg(input, nums, class_dim) + + +def main(): + global ts + paddle.init(use_gpu=False) + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(DATA_DIM)) + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(CLASS_DIM)) + + extra_layers = None + # NOTE: for v2 distributed training need averaging updates. + learning_rate = 1e-3 / NODE_COUNT + out = vgg16(image, class_dim=CLASS_DIM) + cost = paddle.layer.classification_cost(input=out, label=lbl) + + # Create parameters + parameters = paddle.parameters.create(cost) + + # Create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * + BATCH_SIZE), + learning_rate=learning_rate / BATCH_SIZE, + learning_rate_decay_a=0.1, + learning_rate_decay_b=128000 * 35, + learning_rate_schedule="discexp", ) + + train_reader = paddle.batch( + paddle.reader.shuffle( + cifar.train10(), + # To use other data, replace the above line with: + # reader.train_reader('train.list'), + buf_size=1000), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + cifar.test10(), + # To use other data, replace the above line with: + # reader.test_reader('val.list'), + batch_size=BATCH_SIZE) + + # Create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer, + extra_layers=extra_layers, + is_local=False) + + # End batch and end pass event handler + def event_handler(event): + global ts, ts_pass + if isinstance(event, paddle.event.BeginPass): + ts_pass = time.time() + if isinstance(event, paddle.event.BeginIteration): + ts = time.time() + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + time.time() - ts) + if isinstance(event, paddle.event.EndPass): + print "Pass %d end, spent: %f" % (event.pass_id, + time.time() - ts_pass) + result = trainer.test(reader=test_reader) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + trainer.train( + reader=train_reader, num_passes=200, event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index ddf0b055a92d80295b24255a5462d477e0d9c796..29388f5005bf779a1bfa63c0d46d35996c0c792d 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -87,6 +87,11 @@ roi_pool .. autoclass:: paddle.v2.layer.roi_pool :noindex: +pad +---- +.. autoclass:: paddle.v2.layer.pad + :noindex: + Norm Layer ========== @@ -133,6 +138,11 @@ grumemory .. autoclass:: paddle.v2.layer.grumemory :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Recurrent Layer Group ===================== @@ -340,6 +350,11 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dropout +-------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + dot_prod --------- .. autoclass:: paddle.v2.layer.dot_prod @@ -402,6 +417,11 @@ scale_shift .. autoclass:: paddle.v2.layer.scale_shift :noindex: +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: + Sampling Layers =============== @@ -420,22 +440,6 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: -Factorization Machine Layer -============================ - -factorization_machine ---------------------- -.. autoclass:: paddle.v2.layer.factorization_machine - :noindex: - -Slicing and Joining Layers -========================== - -pad ----- -.. autoclass:: paddle.v2.layer.pad - :noindex: - .. _api_v2.layer_costs: Cost Layers @@ -526,6 +530,11 @@ multibox_loss .. autoclass:: paddle.v2.layer.multibox_loss :noindex: +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: + Check Layer ============ @@ -534,31 +543,10 @@ eos .. autoclass:: paddle.v2.layer.eos :noindex: -Miscs -===== - -dropout --------- -.. autoclass:: paddle.v2.layer.dropout - :noindex: - -Activation with learnable parameter -=================================== +Activation +========== prelu -------- .. autoclass:: paddle.v2.layer.prelu :noindex: - -gated_unit ------------ -.. autoclass:: paddle.v2.layer.gated_unit - :noindex: - -Detection output Layer -====================== - -detection_output ----------------- -.. autoclass:: paddle.v2.layer.detection_output - :noindex: diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst index 6a8ecc5bb1d855e0ded3719943ab3adb810de365..02e41564b1e48c07da6ac071fc4b60089169e05a 100644 --- a/doc/api/v2/data/dataset.rst +++ b/doc/api/v2/data/dataset.rst @@ -73,3 +73,10 @@ wmt14 .. automodule:: paddle.v2.dataset.wmt14 :members: :noindex: + +wmt16 ++++++ + +.. automodule:: paddle.v2.dataset.wmt16 + :members: + :noindex: diff --git a/doc/design/switch.md b/doc/design/switch.md new file mode 100644 index 0000000000000000000000000000000000000000..9db1b2782a521c2ff4b28b8f9efcdf1492242ed4 --- /dev/null +++ b/doc/design/switch.md @@ -0,0 +1,32 @@ +### Design Doc: Switch + +### Background + +Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. + +The following example shows the usage of `fluid.switch`. + +```python +a = fluid.Var(10) +b = fluid.Var(0) + +switch = fluid.switch() +with switch.block(): + with switch.case(fluid.less_equal(a, 10)): + fluid.print("Case 1") + with switch.case(fluid.larger(a, 0)): + fluid.print("Case 2") + with switch.default(): + fluid.print("Case 3") +``` + +### The Semantics + +1. A `switch` control-flow checks cases one-by-one. +1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values. +1. It runs the first matched case, or the default case if there is one. +1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case. + +The above program should print and print only "Case 1". + +The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches. diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h index 0570980c5a4d7fa45e672ae5baac65d2c65ddad9..b679387b1124e42499df158767b6c7afe1afd0c6 100644 --- a/paddle/framework/channel.h +++ b/paddle/framework/channel.h @@ -23,8 +23,8 @@ namespace framework { template class Channel { public: - virtual void Send(T*) = 0; - virtual void Receive(T*) = 0; + virtual bool Send(T*) = 0; + virtual bool Receive(T*) = 0; virtual size_t Cap() = 0; virtual void Close() = 0; virtual ~Channel() {} diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc index 31ac72eda98859327f9857c18287398d0f459c7b..444d68498c9676fe0e246167dfacbe999a41d1a7 100644 --- a/paddle/framework/channel_test.cc +++ b/paddle/framework/channel_test.cc @@ -48,12 +48,12 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); // should not block + EXPECT_EQ(ch->Send(&i), true); // should not block } size_t out; for (size_t i = 0; i < buffer_size; ++i) { - ch->Receive(&out); // should not block + EXPECT_EQ(ch->Receive(&out), true); // should not block EXPECT_EQ(out, i); } CloseChannel(ch); @@ -67,7 +67,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { std::thread t([&]() { // Try to write more than buffer size. for (size_t i = 0; i < 2 * buffer_size; ++i) { - ch->Send(&i); // should not block + if (i < buffer_size) + EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations + else + EXPECT_EQ(ch->Send(&i), false); sum += i; } }); @@ -84,13 +87,13 @@ TEST(Channel, SimpleUnbufferedChannelTest) { unsigned sum_send = 0; std::thread t([&]() { for (int i = 0; i < 5; i++) { - ch->Send(&i); + EXPECT_EQ(ch->Send(&i), true); sum_send += i; } }); for (int i = 0; i < 5; i++) { int recv; - ch->Receive(&recv); + EXPECT_EQ(ch->Receive(&recv), true); EXPECT_EQ(recv, i); } @@ -100,6 +103,102 @@ TEST(Channel, SimpleUnbufferedChannelTest) { delete ch; } +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + // All reads should return false + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the channel + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that atleast 4 threads are blocked + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (thread_ended[i] == false) ct++; + } + // Atleast 4 threads must be blocked + EXPECT_GE(ct, 4); + + // Explicitly close the thread + // This should unblock all senders + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + // Verify that only 1 send was successful + ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + // This tests that closing an unbuffered channel also unblocks // unblocks any receivers waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { @@ -114,7 +213,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { t[i] = std::thread( [&](bool *p) { int data; - ch->Receive(&data); + EXPECT_EQ(ch->Receive(&data), false); *p = true; }, &thread_ended[i]); @@ -155,7 +254,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { t[i] = std::thread( [&](bool *p) { int data = 10; - ch->Send(&data); + EXPECT_EQ(ch->Send(&data), false); *p = true; }, &thread_ended[i]); @@ -207,3 +306,37 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { t.join(); delete ch; } + +TEST(Channel, UnbufferedMoreReceiveLessSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + unsigned sum_receive = 0; + // The receiver should block after 5 + // iterations, since there are only 5 senders. + std::thread t([&]() { + for (int i = 0; i < 8; i++) { + int recv; + ch->Receive(&recv); // should block after the fifth iteration. + EXPECT_EQ(recv, i); + sum_receive += i; + } + }); + for (int i = 0; i < 5; i++) { + ch->Send(&i); + sum_send += i; + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + EXPECT_EQ(sum_send, 10U); + EXPECT_EQ(sum_receive, 10U); + // send three more elements + for (int i = 5; i < 8; i++) { + ch->Send(&i); + sum_send += i; + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 28U); + EXPECT_EQ(sum_receive, 28U); + delete ch; +} diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h index b093e1589293b030ef2bedb82504a8e86b3dc857..7ac234b8d42bae0661c3256c78311455c0fbc77c 100644 --- a/paddle/framework/details/buffered_channel.h +++ b/paddle/framework/details/buffered_channel.h @@ -30,8 +30,8 @@ class Buffered : public paddle::framework::Channel { friend void paddle::framework::CloseChannel(Channel*); public: - virtual void Send(T*); - virtual void Receive(T*); + virtual bool Send(T*); + virtual bool Receive(T*); virtual size_t Cap() { return cap_; } virtual void Close(); virtual ~Buffered(); @@ -48,39 +48,43 @@ class Buffered : public paddle::framework::Channel { PADDLE_ENFORCE_GT(cap, 0); } - void NotifyAllSenders(std::unique_lock*); + void NotifyAllParticipants(std::unique_lock*); }; template -void Buffered::Send(T* item) { +bool Buffered::Send(T* item) { std::unique_lock lock(mu_); full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_ || closed_; }); + bool ret = false; if (!closed_) { channel_.push_back(std::move(*item)); lock.unlock(); empty_cond_var_.notify_one(); + ret = true; } + return ret; } template -void Buffered::Receive(T* item) { +bool Buffered::Receive(T* item) { std::unique_lock lock(mu_); empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); + bool ret = false; if (!closed_) { *item = std::move(channel_.front()); channel_.pop_front(); - NotifyAllSenders(&lock); - } else { - item = nullptr; + full_cond_var_.notify_one(); + ret = true; } + return ret; } template void Buffered::Close() { std::unique_lock lock(mu_); closed_ = true; - NotifyAllSenders(&lock); + NotifyAllParticipants(&lock); } template @@ -88,13 +92,14 @@ Buffered::~Buffered() { std::unique_lock lock(mu_); closed_ = true; channel_.clear(); - NotifyAllSenders(&lock); + NotifyAllParticipants(&lock); } template -void Buffered::NotifyAllSenders(std::unique_lock* lock) { +void Buffered::NotifyAllParticipants(std::unique_lock* lock) { lock->unlock(); full_cond_var_.notify_all(); + empty_cond_var_.notify_all(); } } // namespace details diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h index 0dc5afd7e57c1f59dfc1b86093eea231d46966f1..f86a894bb4a42e45edf6964e30620b68183faaa8 100644 --- a/paddle/framework/details/unbuffered_channel.h +++ b/paddle/framework/details/unbuffered_channel.h @@ -29,8 +29,8 @@ class UnBuffered : public paddle::framework::Channel { friend void paddle::framework::CloseChannel(Channel*); public: - virtual void Send(T*); - virtual void Receive(T*); + virtual bool Send(T*); + virtual bool Receive(T*); virtual size_t Cap() { return 0; } virtual void Close(); virtual ~UnBuffered(); @@ -57,7 +57,7 @@ class UnBuffered : public paddle::framework::Channel { // This function implements the concept of how data should // be sent from a writer to a reader. template -void UnBuffered::Send(T* data) { +bool UnBuffered::Send(T* data) { // Prevent other writers from entering std::unique_lock writer_lock(mu_write_); writer_found_ = true; @@ -66,6 +66,7 @@ void UnBuffered::Send(T* data) { cv_writer_.wait(cv_lock, [this]() { return reader_found_ == true || closed_; }); cv_reader_.notify_one(); + bool ret = false; if (!closed_) { std::unique_lock channel_lock(mu_ch_); item = data; @@ -74,14 +75,16 @@ void UnBuffered::Send(T* data) { channel_lock.lock(); cv_channel_.wait(channel_lock, [this]() { return item == nullptr || closed_; }); + ret = true; } writer_found_ = false; + return ret; } // This function implements the concept of how // data that was sent by a writer is read from a reader. template -void UnBuffered::Receive(T* data) { +bool UnBuffered::Receive(T* data) { // Prevent other readers from entering std::unique_lock read_lock{mu_read_}; reader_found_ = true; @@ -90,6 +93,7 @@ void UnBuffered::Receive(T* data) { cv_reader_.wait(cv_lock, [this]() { return writer_found_ == true || closed_; }); cv_writer_.notify_one(); + bool ret = false; if (!closed_) { std::unique_lock lock_ch{mu_ch_}; // Reader should wait for the writer to first write its data @@ -98,10 +102,12 @@ void UnBuffered::Receive(T* data) { *data = std::move(*item); item = nullptr; lock_ch.unlock(); + ret = true; } cv_channel_.notify_one(); } reader_found_ = false; + return ret; } // This function implements the sequence of events diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index f8df2cf97ad532f06cb1393b1a24cd789f8bde29..f554c77845087453f8c6e4d04522a8555e583ae6 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool HasOutputs(const std::string &name) const override; - DDim GetInputDim(const std::string &name) const override; - - void SetOutputDim(const std::string &name, const DDim &dim) override; - AttrReader Attrs() const override; const std::vector &Inputs( @@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { return true; } -DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const { - std::vector ddims = GetInputsDim(name); - auto length = ddims.size(); - PADDLE_ENFORCE_EQ(length, 1UL, - "Input(%s) should have 1 value, " - "but it has %d now", - name, length); - return ddims[0]; -} - -void CompileTimeInferShapeContext::SetOutputDim(const std::string &name, - const DDim &dim) { - SetOutputsDim(name, {dim}); -} - AttrReader CompileTimeInferShapeContext::Attrs() const { return AttrReader(op_.GetAttrMap()); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 4e854f54dd43d760bab44fb5f7cafeb13314b27c..81fa8cf477423fc2a54c719c9a743729215513c3 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext { return true; } - DDim GetInputDim(const std::string& name) const override { - return GetDim(op_.Input(name)); - } - - void SetOutputDim(const std::string& name, const DDim& dim) override { - SetDim(op_.Output(name), dim); - } - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } const std::vector& Inputs( diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index e53cc0cdabc623ae358f1a3e21823a2f38ec3c62..a0fa467291bb42c59b65f5efeabe9c2235e15b2a 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -18,10 +18,18 @@ limitations under the License. */ namespace paddle { namespace framework { -std::vector InferShapeContext::GetInputsDim( +DDim InferShapeContext::GetInputDim(const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); +} + +std::vector InferShapeContext::GetInputsDim( const std::string &name) const { - const std::vector &names = Inputs(name); - return GetDims(names); + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); } DDim InferShapeContext::GetInputsElementDim(const std::string &name, @@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name, return this->GetDim(names[idx]); } -void InferShapeContext::SetOutputsDim( - const std::string &name, const std::vector &dims) { +void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); +} + +void InferShapeContext::SetOutputsDim(const std::string &name, + const std::vector &dims) { auto &names = Outputs(name); SetDims(names, dims); } -std::vector InferShapeContext::GetDims( +std::vector InferShapeContext::GetDims( const std::vector &names) const { - std::vector ret; + std::vector ret; ret.reserve(names.size()); std::transform( names.begin(), names.end(), std::back_inserter(ret), [this](const std::string &name) { return this->GetDim(name); }); return ret; } - void InferShapeContext::SetDims(const std::vector &names, - const std::vector &dims) { + const std::vector &dims) { size_t length = names.size(); PADDLE_ENFORCE_EQ(length, dims.size()); for (size_t i = 0; i < length; ++i) { diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index f93319d8f2fd4c5d388bd57fd595a6a5edd51775..830f199ed1451538f12fc8dd34fb7b2bfc356a71 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -35,14 +35,13 @@ class InferShapeContext { virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; - virtual framework::DDim GetInputDim(const std::string &name) const = 0; + DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; + std::vector GetInputsDim(const std::string &name) const; DDim GetInputsElementDim(const std::string &name, int idx) const; - virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; - void SetOutputsDim(const std::string &name, - const std::vector &dims); + void SetOutputDim(const std::string &name, const DDim &dim); + void SetOutputsDim(const std::string &name, const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -57,15 +56,13 @@ class InferShapeContext { // Note: In while op, we need this to be public void SetDims(const std::vector &names, - const std::vector &dims); + const std::vector &dims); protected: - virtual framework::DDim GetDim(const std::string &name) const = 0; - virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; - - std::vector GetDims( - const std::vector &names) const; + virtual DDim GetDim(const std::string &name) const = 0; + virtual void SetDim(const std::string &name, const DDim &dim) = 0; + std::vector GetDims(const std::vector &names) const; std::vector GetVarTypes( const std::vector &names) const; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index e903f43ba69ee9e28b3a03e8921a41ffa81a2542..000c2089c176adf8d845a56a1f98528734f47ea1 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -158,7 +158,10 @@ op_library(parallel_do_op DEPS executor) # Regist multiple Kernel to pybind if (WITH_GPU) -op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col) + +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS + vol2col depthwise_conv) + op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc index 83c8778fe4cec4d9d80de691e117a39fdd92f494..1e6fa2091de25218e2bdafeb740ce884234638a5 100644 --- a/paddle/operators/bipartite_match_op.cc +++ b/paddle/operators/bipartite_match_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("DistMat"), "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); auto dims = ctx->GetInputDim("DistMat"); PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); ctx->SetOutputDim("ColToRowMatchIndices", dims); - ctx->SetOutputDim("ColToRowMatchDis", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); } }; @@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* dist_mat = context.Input("DistMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); - auto* match_dist = context.Output("ColToRowMatchDis"); + auto* match_dist = context.Output("ColToRowMatchDist"); auto& dev_ctx = context.device_context(); @@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "Otherwise, it means B[j] is matched to row " "ColToRowMatchIndices[i][j] in i-th instance. The row number of " "i-th instance is saved in ColToRowMatchIndices[i][j]."); - AddOutput("ColToRowMatchDis", + AddOutput("ColToRowMatchDist", "(Tensor) A 2-D Tensor with shape [N, M] in float type. " "N is batch size. If ColToRowMatchIndices[i][j] is -1, " - "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchIndices[i][j] = d, and the row offsets of each " "instance are called LoD. Then " - "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); AddComment(R"DOC( This operator is a greedy bipartite matching algorithm, which is used to obtain the matching with the maximum distance based on the input diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..539813d4858b8faef386047f9ef64aa232aefca1 --- /dev/null +++ b/paddle/operators/box_coder_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/box_coder_op.h" + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBoxVar must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + + GetBoxCodeType(ctx->Attrs().Get("code_type")); + + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance."); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the box if the input " + "is image feature map, they are close to the origin of the coordinate " + "system. [xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddOutput( + "OutputBox", + "(LoDTensor or Tensor) " + "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] " + "representing the result of N target boxes encoded/decoded with " + "M Prior boxes and variances."); + + AddComment(R"DOC( +Bounding Box Coder Operator. +Encode/Decode the target bounding box with the priorbox information. +The Encoding schema described below: +ox = (tx - px) / pw / pxv +oy = (ty - py) / ph / pyv +ow = log(abs(tw / pw)) / pwv +oh = log(abs(th / ph)) / phv +The Decoding schema described below: +ox = (pw * pxv * tx * + px) - tw / 2 +oy = (ph * pyv * ty * + py) - th / 2 +ow = exp(pwv * tw) * pw + tw / 2 +oh = exp(phv * th) * ph + th / 2 +where tx, ty, tw, th denote the target box's center coordinates, width and +height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) +center coordinates, width and height. pxv, pyv, pwv, phv denote the variance +of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, +width and height. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); +REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..98bd93457fafb49f2af5e1ff258fbfa9f9985600 --- /dev/null +++ b/paddle/operators/box_coder_op.cu @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/box_coder_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = + target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; + T target_box_height = + target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + + output[idx * len] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[col_idx * len]; + output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / + prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / + prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / + prior_box_var_data[col_idx * len + 3]; + } +} + +template +__global__ void DecodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + target_box_data[row_idx * len + 2]) * + prior_box_width; + T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_data[row_idx * len + 3]) * + prior_box_height; + T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_data[row_idx * len] * + prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_data[row_idx * len + 1] * + prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = target_box_center_x + target_box_width / 2; + output[idx * len + 3] = target_box_center_y + target_box_height / 2; + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T* prior_box_data = prior_box->data(); + const T* prior_box_var_data = prior_box_var->data(); + const T* target_box_data = target_box->data(); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..086251f6e066f082743f332ce72918c6e572ce19 --- /dev/null +++ b/paddle/operators/box_coder_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string& type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } + } + void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[i * len] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[i * len + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[i * len + 2]) * + prior_box_width; + T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[i * len + 3]) * + prior_box_height; + + size_t offset = i * col * len + j * len; + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + T* output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index 9c655d6c0d8e5fe04ee6d85f7e9d9da68105230c..b275fd75b3512343825170fc38565dd27f7f1c75 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -54,7 +54,15 @@ class CompareOpKernel public: void Compute(const framework::ExecutionContext& context) const override { using T = typename Functor::ELEM_TYPE; - ElementwiseComputeEx(context); + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + z); } }; diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index d6882b275b22b9a2a2b6ff8cfb53a3462bbdbefe..cef7ddd5fe7e12a374fb9cc79211bd2eb97c6c52 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( namespace ops = paddle::operators; REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, ops::ConvOpGrad); + +// depthwise convolution op +REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); +// depthwise conv kernel +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d, + ops::GemmConvKernel, + ops::GemmConvKernel); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + REGISTER_OP_CPU_KERNEL( conv2d, ops::GemmConvKernel, ops::GemmConvKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 4f942444f3eb5584f07399b8d1b4d6a5087496d4..d0bd40ee95dab3b2589742b8a0c3a5de7918b5b9 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -16,6 +16,16 @@ limitations under the License. */ namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvKernel, + ops::DepthwiseConvKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad, + ops::DepthwiseConvGradKernel, + ops::DepthwiseConvGradKernel); + REGISTER_OP_CUDA_KERNEL( conv2d, ops::GemmConvKernel, ops::GemmConvKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 5a8933e7915960f9fcbe92ae73c4f37b3b69ecaf..3c1d0e9c1c4bb964bfaebc3bfed115548bd53f97 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/depthwise_conv.h" #include "paddle/operators/math/im2col.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/vol2col.h" @@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel { } } }; + +template +class DepthwiseConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_EQ( + output->dims()[1] % input->dims()[1], 0, + "The output channels must be a multiple of the input channels"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::DepthwiseConvFunctor depthwiseConv; + + auto& dev_ctx = context.template device_context(); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + } +}; + +template +class DepthwiseConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, input_grad); + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, + filter_grad); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index a8389429f26c17ceab1db22175c90888546ead6f..c32288d6984f126f2374a13973541f4f663b25a4 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -28,7 +28,14 @@ template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -92,9 +99,19 @@ template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseAddBroadCastGradFunctor, - ElementwiseAddBroadCast2GradFunctor>(ctx); + ElementwiseAddBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index ef26cb6c914f50ded07cc9d0d8de3f49f2151129..07ebade31ff5b3d5c89156e28ff5fa0670a9a842 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -28,7 +28,14 @@ template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -111,9 +118,19 @@ template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseDivBroadCastGradFunctor, - ElementwiseDivBroadCast2GradFunctor>(ctx); + ElementwiseDivBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h index 255728e8e620665a7de225b228c19d6c510da1c8..717e45ab31db9b9a6629fb33e17654dbf986d8c5 100644 --- a/paddle/operators/elementwise_max_op.h +++ b/paddle/operators/elementwise_max_op.h @@ -28,7 +28,14 @@ template class ElementwiseMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMaxBroadCastGradFunctor, - ElementwiseMaxBroadCast2GradFunctor>(ctx); + ElementwiseMaxBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h index e6627a0f1bb468c8e4661b83489cb964b72dddb0..0de9a91c52b0ab82cd62604de318ce68e56b767d 100644 --- a/paddle/operators/elementwise_min_op.h +++ b/paddle/operators/elementwise_min_op.h @@ -28,7 +28,14 @@ template class ElementwiseMinKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMinGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMinBroadCastGradFunctor, - ElementwiseMinBroadCast2GradFunctor>(ctx); + ElementwiseMinBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4b86b00b5a095ae898f9ce0c17cde2cc91060ba9..ae7a71e0244dfb8ad3e55683ac081f92bc36bea5 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -27,7 +27,14 @@ template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMulBroadCastGradFunctor, - ElementwiseMulBroadCast2GradFunctor>(ctx); + ElementwiseMulBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index d749b8e8757d0d433be05876779ccc22b95ca80b..213fe1f5a818873e8b666464cb112637261c598c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV); template -void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); +void ElementwiseGradCompute(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, int axis, + framework::Tensor* dx, framework::Tensor* dy) { auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { dx->mutable_data(ctx.GetPlace()); } @@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { x_dims = framework::make_ddim(extended_dims); } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); int pre, n, post; @@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { template -void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); +void ElementwiseComputeEx(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, int axis, + framework::Tensor* z) { TransformFunctor functor( x, y, z, ctx.template device_context(), Functor()); @@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { x_dims = framework::make_ddim(extended_dims); } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h index 6019e709e0db0fd62b4d3350bb768095f87ef241..874fd3f09f2afaccfbfca75799cc3448f7393b03 100644 --- a/paddle/operators/elementwise_pow_op.h +++ b/paddle/operators/elementwise_pow_op.h @@ -29,7 +29,14 @@ template class ElementwisePowKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index a2aca793026189ec87e00b52d7c351689f870400..c2749a8e6ba689233dab4f3c72de10bf01f39fab 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -27,7 +27,14 @@ template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -93,9 +100,19 @@ template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseSubBroadCastGradFunctor, - ElementwiseSubBroadCast2GradFunctor>(ctx); + ElementwiseSubBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 28c5aec1996ad04a6cb551ac68c14b613d16858e..768106fadf355ea6fb148491e232dc0ef1453a75 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_GPU) nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) + nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu new file mode 100644 index 0000000000000000000000000000000000000000..b212e78208355866516211d276cb8046623babd7 --- /dev/null +++ b/paddle/operators/math/depthwise_conv.cu @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/depthwise_conv.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +// A Cuda kernel to compute the depthwise convolution forward pass +// in NCHW format. +template +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; + } + } + output_data[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; + } + } + } + input_grad_data[index] += value; + } +} + +// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +template +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); + } + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* filter_data = filter.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); + } +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* filter_data = filter.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter_grad->dims()[2]; + const int ksize_width = filter_grad->dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_grad_data = output_grad.data(); + T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); + } +}; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/depthwise_conv.h b/paddle/operators/math/depthwise_conv.h new file mode 100644 index 0000000000000000000000000000000000000000..4708920bb42db90d84fda0c6a1039991cb79e80d --- /dev/null +++ b/paddle/operators/math/depthwise_conv.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..051cc24706d69ec4f38524af1dd510bf079c74c7 --- /dev/null +++ b/paddle/operators/mine_hard_examples_op.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum MiningType { kNone = 0, kMaxNegative, kHardExample }; + +template +bool SortScoreDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_dist, + const float neg_dist_threshold) { + if (mining_type == MiningType::kMaxNegative) { + return match_idx == -1 && match_dist < neg_dist_threshold; + } else if (mining_type == MiningType::kHardExample) { + return true; + } else { + return false; + } +} + +inline MiningType GetMiningType(std::string str) { + if (str == "max_negative") { + return MiningType::kMaxNegative; + } else if (str == "hard_example") { + return MiningType::kHardExample; + } else { + return MiningType::kNone; + } +} + +template +class MineHardExamplesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); + float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); + T neg_dist_threshold = + static_cast(ctx.Attr("neg_dist_threshold")); + int sample_size = ctx.Attr("sample_size"); + MiningType mining_type = + GetMiningType(ctx.Attr("mining_type")); + + auto out_neg_indices = ctx.Output("NegIndices"); + auto out_match_indices = + ctx.Output("UpdatedMatchIndices"); + + framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices); + + int batch_size = in_matched_indices->dims()[0]; + int prior_num = in_matched_indices->dims()[1]; + + auto match_indices = framework::EigenMatrix::From(*in_matched_indices); + + auto match_indices_et = + framework::EigenMatrix::From(*out_match_indices); + + auto match_dist = framework::EigenMatrix::From(*in_match_dist); + + const T* cls_loss = in_cls_loss->data(); + const T* loc_loss = nullptr; + if (in_loc_loss) { + loc_loss = in_loc_loss->data(); + } + + std::vector> all_neg_indices; + std::vector batch_starts = {0}; + for (int n = 0; n < batch_size; ++n) { + std::vector> loss_idx; + int neg_sel = 0; + for (int m = 0; m < prior_num; ++m) { + if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m), + neg_dist_threshold)) { + T loss = cls_loss[n * prior_num + m]; + if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { + loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; + } + loss_idx.push_back(std::make_pair(loss, m)); + ++neg_sel; + } + } + + if (mining_type == MiningType::kMaxNegative) { + int num_pos = 0; + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) != -1) ++num_pos; + } + neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel); + } else if (mining_type == MiningType::kHardExample) { + neg_sel = std::min(sample_size, neg_sel); + } + + std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); + std::set sel_indices; + std::vector neg_indices; + std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel, + std::inserter(sel_indices, sel_indices.begin()), + [](std::pair& l) -> int { + return static_cast(l.second); + }); + + if (mining_type == MiningType::kHardExample) { + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) > -1) { + if (sel_indices.find(m) == sel_indices.end()) { + match_indices_et(n, m) = -1; + } + } else { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + } + } + } + } else { + neg_indices.resize(sel_indices.size()); + std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); + } + + all_neg_indices.push_back(neg_indices); + batch_starts.push_back(batch_starts.back() + neg_indices.size()); + } + + framework::LoD out_neg_indices_lod; + out_neg_indices_lod.emplace_back(batch_starts); + int neg_offset = 0; + auto neg_data = out_neg_indices->mutable_data( + framework::make_ddim({static_cast(batch_starts.back()), 1}), + ctx.GetPlace()); + + for (auto neg_indices : all_neg_indices) { + std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); + neg_offset += neg_indices.size(); + } + out_neg_indices->set_lod(out_neg_indices_lod); + return; + } +}; + +class MineHardExamplesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ClsLoss"), + "Input(ClsLoss) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchDist"), + "Input(MatchDist) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegIndices"), + "Output(NegIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"), + "Output(UpdatedMatchIndices) of MineHardExamplesOp should " + "not be null."); + + auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); + auto idx_dims = ctx->GetInputDim("MatchIndices"); + auto dis_dims = ctx->GetInputDim("MatchDist"); + + PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL, + "The shape of ClsLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL, + "The shape of MatchIndices is [N, Np]."); + PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL, + "The shape of MatchDist is [N, Np]."); + + if (ctx->HasInput("LocLoss")) { + auto loc_loss_dims = ctx->GetInputDim("LocLoss"); + PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL, + "The shape of LocLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0], + "Batch size of ClsLoss and LocLoss must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], loc_loss_dims[1], + "Prior box number of ClsLoss and LocLoss must be the same."); + } + + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], idx_dims[0], + "Batch size of ClsLoss and MatchIndices must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchIndices must be the same."); + + PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0], + "Batch size of ClsLoss and MatchDist must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchDist must be the same."); + + auto mining_type = + GetMiningType(ctx->Attrs().Get("mining_type")); + + PADDLE_ENFORCE_NE(mining_type, MiningType::kNone, + "mining_type must be hard_example or max_negative"); + + if (mining_type == MiningType::kMaxNegative) { + auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); + auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); + PADDLE_ENFORCE_GT( + neg_pos_ratio, 0.0f, + "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_GT( + neg_dist_threshold, 0.0f, + "neg_dist_threshold must greater than zero in max_negative mode"); + } else if (mining_type == MiningType::kHardExample) { + auto sample_size = ctx->Attrs().Get("sample_size"); + PADDLE_ENFORCE_GT( + sample_size, 0, + "sample_size must greater than zero in hard_example mode"); + } + + ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("ClsLoss")->type()), + ctx.device_context()); + } +}; + +class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "ClsLoss", + "(Tensor, default Tensor), The classification loss with shape " + "[N, Np], N is the batch size and Np is the number of prior box."); + AddInput("LocLoss", + "(Tensor, optional, default Tensor), The localization loss " + "with shape [N, Np], N is the batch size and Np is the number of " + "prior box.") + .AsDispensable(); + AddInput("MatchIndices", + "(Tensor, Tensor), Matched indices with shape [N, Np], N is " + "the batch size and Np is the number of prior box. " + "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " + "instance does not match any entity, otherwise means it is " + "matched to row."); + AddInput("MatchDist", + "(Tensor, default Tensor) Matched indices with shape [N, " + "Np], N is the batch size and Np is the number of prior box."); + AddAttr("neg_pos_ratio", + "(float) The ratio of the negative box to the positive " + "box. Use only when mining_type is max_negative.") + .SetDefault(1.0); + AddAttr("neg_dist_threshold", + "(float) The negative overlap upper bound for the unmatched " + "predictions. Use only when mining_type is max_negative.") + .SetDefault(0.5); + AddAttr("sample_size", + "(float) The max sample size of negative box. Use only when " + "mining_type is hard_example.") + .SetDefault(0); + AddAttr("mining_type", + "(float) The mining algorithm name, the value is " + "hard_example or max_negative.") + .SetDefault("max_negative") + .InEnum({"hard_example", "max_negative"}); + + AddOutput( + "NegIndices", + "(LoDTensor) The output of negative example indices. a LoDTensor " + "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " + "and each element is the prior box index. " + "For example, the batch size is 2, the lod is [[0, 1, 2]], " + "the sample 0's box 1(MatchIndices[0][1]) is selected, " + "and sample 1's box 0 is selected. The output NegIndices is " + "[[1], [0]]."); + + AddOutput("UpdatedMatchIndices", + "(Tensor) The output of updated MatchIndices, a tensor with " + "shape [N, Np]. Only update when mining_type is " + "hard_example. The input MatchIndices elements will be update to " + "-1 when it is not in the candidate high loss list of negative " + "examples."); + + AddComment(R"DOC( +Mine hard examples Operator. +This operator implements hard example mining to select a subset of negative box indices. +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of +MatchIndices elements with value -1. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker); + +REGISTER_OP_CPU_KERNEL( + mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a65fe69f1534112923c72e17969f808b9310735 --- /dev/null +++ b/paddle/operators/multiclass_nms_op.cc @@ -0,0 +1,384 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +constexpr int64_t kOutputDim = 6; +constexpr int64_t kBBoxSize = 4; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + + PADDLE_ENFORCE_EQ(box_dims.size(), 2, + "The rank of Input(BBoxes) must be 2."); + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3."); + PADDLE_ENFORCE_EQ(box_dims[1], 4, + "The 2nd dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2], + "The 1st dimensiong of Input(BBoxes) must be equal to " + "3rd dimension of Input(Scores), which represents the " + "predicted bboxes."); + + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. + ctx->SetOutputDim("Out", {box_dims[0], 6}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Scores")->type()), + ctx.device_context()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < sorted_indices->size()) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices) const { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (int k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + std::map>& indices, + int& num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, + &(indices[c])); + num_det += indices[c].size(); + } + + num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (int j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (int j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(indices); + num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + std::map>& selected_indices, + Tensor* outs) const { + int predict_dim = scores.dims()[1]; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (int j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + odata[count * kOutputDim] = label; // label + odata[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + + int64_t batch_size = score_dims[0]; + int64_t class_num = score_dims[1]; + int64_t predict_dim = score_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1}, ctx.GetPlace()); + od[0] = -1; + } else { + outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, *boxes, all_indices[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("BBoxes", + "(Tensor) A 2-D Tensor with shape [M, 4] represents the " + "predicted locations of M bounding bboxes. Each bounding box " + "has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax]."); + AddInput("Scores", + "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. For each category " + "there are total M scores which corresponding M bounding boxes. " + " Please note, M is equal to the 1st dimension of BBoxes. "); + AddAttr( + "background_label", + "(int64_t, defalut: 0) " + "The index of background label, the background label will be ignored. " + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences aftern the filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, defalut: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], No is the total " + "number of detections in this mini-batch. For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to do multi-class non maximum suppression (NMS) on a batched +of boxes and scores. + +In the NMS step, this operator greedily selects a subset of detection bounding +boxes that have high scores larger than score_threshold, if providing this +threshold, then selects the largest nms_top_k confidences scores if nms_top_k +is larger than -1. Then this operator pruns away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. + +Aftern NMS step, at most keep_top_k number of total bboxes are to be kept +per image if keep_top_k is larger than -1. + +This operator support multi-class and batched inputs. It applying NMS +independently for each class. The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bbox for this image. If there is no detected boxes +for all images, all the elements in LoD are 0, and the Out only contains one +value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 2a8afc940393baaaa939471f50f2d5c63edd6a84..6df087d154cc104955c6399050c9cb2bce8d36e1 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -233,7 +233,7 @@ void ParseEvents(std::vector>& events, }; break; default: - sorted_domain = "event end time"; + sorted_domain = "event first end time"; } std::vector> events_table; diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py index d37935244291d15da8199bd3d3979118e4944b48..db1808c64745ac153962c050b08993450dd93c06 100644 --- a/python/paddle/v2/fluid/debuger.py +++ b/python/paddle/v2/fluid/debuger.py @@ -12,10 +12,202 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import re from graphviz import GraphPreviewGenerator import proto.framework_pb2 as framework_pb2 +_vartype2str_ = [ + "UNK", + "LoDTensor", + "SelectedRows", + "FeedMinibatch", + "FetchList", + "StepScopes", + "LodRankTable", + "LoDTensorArray", + "PlaceList", +] +_dtype2str_ = [ + "bool", + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", +] + + +def repr_data_type(type): + return _dtype2str_[type] + + +def repr_tensor(proto): + return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)], + str(proto.dims)) + + +reprtpl = "{ttype} {name} ({reprs})" + + +def repr_lodtensor(proto): + if not proto.lod_tensor: return + level = proto.lod_tensor.lod_level + reprs = repr_tensor(proto.lod_tensor.tensor) + return reprtpl.format( + ttype="LoDTensor" if level > 0 else "Tensor", + name=proto.name, + reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs) + + +def repr_selected_rows(proto): + if not proto.selected_rows: return + return reprtpl.format( + ttype="SelectedRows", + name=proto.name, + reprs=repr_tensor(proto.selected_rows)) + + +def repr_tensor_array(proto): + if not proto.tensor_array: return + return reprtpl.format( + ttype="TensorArray", + name=proto.name, + reprs="level=%d, %s" % (proto.tensor_array.lod_level, + repr_tensor(proto.lod_tensor))) + + +type_handlers = [ + repr_lodtensor, + repr_selected_rows, + repr_tensor_array, +] + + +def repr_var(vardesc): + for handler in type_handlers: + res = handler(vardesc) + if res: + return res + + +def pprint_program_codes(program_desc): + reprs = [] + for block_idx in range(program_desc.num_blocks()): + block_desc = program_desc.block(block_idx) + block_repr = pprint_block_codes(block_desc) + reprs.append(block_repr) + return '\n'.join(reprs) + + +def pprint_block_codes(block_desc, show_backward=False): + def is_op_backward(op_desc): + if op_desc.type.endswith('_grad'): return True + + def is_var_backward(var): + if "@GRAD" in var.parameter: return True + for arg in var.arguments: + if "@GRAD" in arg: return True + + for var in op_desc.inputs: + if is_var_backward(var): return True + for var in op_desc.outputs: + if is_var_backward(var): return True + return False + + def is_var_backward(var_desc): + return "@GRAD" in var_desc.name + + if type(block_desc) is not framework_pb2.BlockDesc: + block_desc = framework_pb2.BlockDesc.FromString( + block_desc.serialize_to_string()) + var_reprs = [] + op_reprs = [] + for var in block_desc.vars: + if not show_backward and is_var_backward(var): + continue + var_reprs.append(repr_var(var)) + + for op in block_desc.ops: + if not show_backward and is_op_backward(op): continue + op_reprs.append(repr_op(op)) + + tpl = "// block-{idx} parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n" + return tpl.format( + idx=block_desc.idx, + pidx=block_desc.parent_idx, + vars='\n'.join(var_reprs), + ops='\n'.join(op_reprs), ) + + +def repr_attr(desc): + tpl = "{key}={value}" + valgetter = [ + lambda attr: attr.i, + lambda attr: attr.f, + lambda attr: attr.s, + lambda attr: attr.ints, + lambda attr: attr.floats, + lambda attr: attr.strings, + lambda attr: attr.b, + lambda attr: attr.bools, + lambda attr: attr.block_idx, + lambda attr: attr.l, + ] + key = desc.name + value = valgetter[desc.type](desc) + if key == "dtype": + value = repr_data_type(value) + return tpl.format(key=key, value=str(value)), (key, value) + + +def _repr_op_fill_constant(optype, inputs, outputs, attrs): + if optype == "fill_constant": + return "{output} = {data} [shape={shape}]".format( + output=','.join(outputs), + data=attrs['value'], + shape=str(attrs['shape'])) + + +op_repr_handlers = [_repr_op_fill_constant, ] + + +def repr_op(opdesc): + optype = None + attrs = [] + attr_dict = {} + is_target = None + inputs = [] + outputs = [] + + tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]" + args2value = lambda args: args[0] if len(args) == 1 else str(list(args)) + for var in opdesc.inputs: + key = var.parameter + value = args2value(var.arguments) + inputs.append("%s=%s" % (key, value)) + for var in opdesc.outputs: + value = args2value(var.arguments) + outputs.append(value) + for attr in opdesc.attrs: + attr_repr, attr_pair = repr_attr(attr) + attrs.append(attr_repr) + attr_dict[attr_pair[0]] = attr_pair[1] + + is_target = opdesc.is_target + + for handler in op_repr_handlers: + res = handler(opdesc.type, inputs, outputs, attr_dict) + if res: return res + + return tpl.format( + outputs=', '.join(outputs), + optype=opdesc.type, + inputs=', '.join(inputs), + attrs="{%s}" % ','.join(attrs), + is_target=", is_target" if is_target else "") + def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ''' diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index c38e21087de1bf7076ce5aaf23d4d4faaebb50a7..a79479f469a0c489edf2676bc5d07066bb480664 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1231,10 +1231,17 @@ def conv2d(input, """ if stride is None: stride = [1, 1] - helper = LayerHelper('conv2d', **locals()) - dtype = helper.input_dtype() num_channels = input.shape[1] + + l_type = 'conv2d' + if (num_channels == groups and num_filters % num_channels == 0 and + not use_cudnn): + l_type = 'depthwise_conv2d' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + if groups is None: num_filter_channels = num_channels else: @@ -1267,7 +1274,7 @@ def conv2d(input, pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type='conv2d', + type=l_type, inputs={ 'Input': input, 'Filter': filter_param, @@ -1478,7 +1485,9 @@ def batch_norm(input, param_attr=None, bias_attr=None, data_layout='NCHW', - name=None): + name=None, + moving_mean_name=None, + moving_variance_name=None): """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. @@ -1508,6 +1517,7 @@ def batch_norm(input, attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) mean = helper.create_global_variable( + name=moving_mean_name, dtype=input.dtype, shape=param_shape, persistable=True, @@ -1515,6 +1525,7 @@ def batch_norm(input, helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) variance = helper.create_global_variable( + name=moving_variance_name, dtype=input.dtype, shape=param_shape, persistable=True, diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index d4a2cd7eeabecb60699b5be94d89cf7a916749e7..d33a4c52a8873b1e376eb2077014130bdcad2e12 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -103,10 +103,10 @@ def profiler(state, sorted_key=None): core.enable_profiler(prof_state) yield - if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']: - raise ValueError("The state must be in 'calls', 'total', " - "'max', 'min', 'ave'") sorted_key = 'default' if sorted_key is None else sorted_key + if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']: + raise ValueError("The sorted_key must be None or in 'calls', 'total', " + "'max', 'min' and 'ave'") key_map = { 'default': core.EventSortingKey.kDefault, 'calls': core.EventSortingKey.kCalls, diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py index 74138298978c7c18936f53761b313887f07aea81..4943bbb3388c3a476596b2fd4dd28605ee7be9e0 100644 --- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py +++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py @@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod): return match_indices, match_dist -class TestBipartiteMatchOpForWithLoD(OpTest): +class TestBipartiteMatchOpWithLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' lod = [[0, 5, 11, 23]] @@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest): self.inputs = {'DistMat': (dist, lod)} self.outputs = { 'ColToRowMatchIndices': (match_indices), - 'ColToRowMatchDis': (match_dist), + 'ColToRowMatchDist': (match_dist), } def test_check_output(self): @@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.inputs = {'DistMat': dist} self.outputs = { 'ColToRowMatchIndices': match_indices, - 'ColToRowMatchDis': match_dist, + 'ColToRowMatchDist': match_dist, } def test_check_output(self): diff --git a/python/paddle/v2/fluid/tests/test_box_coder_op.py b/python/paddle/v2/fluid/tests/test_box_coder_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0dc18476fd5dce7cd293f6cb85f419be7d88ec95 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): + prior_box_x = ( + (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) + prior_box_y = ( + (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0]) + prior_box_width = ( + (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0]) + prior_box_height = ( + (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) + prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], + prior_box_var.shape[1]) + + if (code_type == "EncodeCenterSize"): + target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( + target_box.shape[0], 1) + target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape( + target_box.shape[0], 1) + target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape( + target_box.shape[0], 1) + target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( + target_box.shape[0], 1) + + output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ + prior_box_var[:,:,0] + output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \ + prior_box_var[:,:,1] + output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \ + prior_box_var[:,:,2] + output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \ + prior_box_var[:,:,3] + + elif (code_type == "DecodeCenterSize"): + target_box = target_box.reshape(target_box.shape[0], 1, + target_box.shape[1]) + target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \ + prior_box_width + prior_box_x + target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \ + prior_box_height + prior_box_y + target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \ + prior_box_width + target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \ + prior_box_height + output_box[:, :, 0] = target_box_x - target_box_width / 2 + output_box[:, :, 1] = target_box_y - target_box_height / 2 + output_box[:, :, 2] = target_box_x + target_box_width / 2 + output_box[:, :, 3] = target_box_y + target_box_height / 2 + + +def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): + n = target_box.shape[0] + m = prior_box.shape[0] + output_box = np.zeros((n, m, 4), dtype=np.float32) + for i in range(len(lod) - 1): + box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var, + output_box[lod[i]:lod[i + 1], :, :], code_type) + return output_box + + +class TestBoxCoderOp(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 20]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.random.random((10, 4)).astype('float32') + target_box = np.random.random((20, 4)).astype('float32') + code_type = "DecodeCenterSize" + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': target_box, + } + self.attrs = {'code_type': 'decode_center_size'} + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithLoD(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 4, 12, 20]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.random.random((10, 4)).astype('float32') + target_box = np.random.random((20, 4)).astype('float32') + code_type = "EncodeCenterSize" + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': (target_box, lod), + } + self.attrs = {'code_type': 'encode_center_size'} + self.outputs = {'OutputBox': output_box} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 24de74d730eedbccb4837598bd6d2eb92da59e0d..7512ea333e37d5f4f0102531d8d13f8c2a744b8d 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1): self.op_type = "conv2d" +class TestDepthwiseConv(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + # cudnn v5 does not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): # def init_op_type(self): diff --git a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py new file mode 100755 index 0000000000000000000000000000000000000000..c27573c3d69037bc48e0b6a90636b3f027f15a41 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestMineHardExamplesOp(OpTest): + def set_data(self): + self.init_test_data() + self.inputs = { + 'ClsLoss': self.cls_loss, + 'LocLoss': self.loc_loss, + 'MatchIndices': self.match_indices, + 'MatchDist': self.match_dis + } + + self.attrs = { + 'neg_pos_ratio': self.neg_pos_ratio, + 'neg_overlap': self.neg_overlap, + 'sample_size': self.sample_size, + 'mining_type': self.mining_type + } + + self.outputs = { + 'NegIndices': (self.neg_indices, self.neg_indices_lod), + 'UpdatedMatchIndices': self.updated_match_indices + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + return + + def setUp(self): + self.op_type = "mine_hard_examples" + self.set_data() + + def init_test_data(self): + self.neg_pos_ratio = 1.0 + self.neg_overlap = 0.5 + self.sample_size = 0 + self.mining_type = "max_negative" + self.cls_loss = np.array([[0.1, 0.1, 0.3], + [0.3, 0.1, 0.1]]).astype('float32') + + self.loc_loss = np.array([[0.1, 0.2, 0.3], + [0.3, 0.4, 0.1]]).astype('float32') + + self.match_dis = np.array([[0.2, 0.4, 0.8], + [0.1, 0.9, 0.3]]).astype('float32') + + self.match_indices = np.array([[0, -1, -1], + [-1, 0, -1]]).astype('int32') + + self.updated_match_indices = self.match_indices + + self.neg_indices_lod = [[0, 1, 2]] + self.neg_indices = np.array([[1], [0]]).astype('int32') + + +class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp): + def init_test_data(self): + super(TestMineHardExamplesOpHardExample, self).init_test_data() + self.mining_type = "hard_example" + self.sample_size = 2 + + self.cls_loss = np.array([[0.5, 0.1, 0.3], + [0.3, 0.1, 0.1]]).astype('float32') + + self.loc_loss = np.array([[0.2, 0.2, 0.3], + [0.3, 0.1, 0.2]]).astype('float32') + + self.match_indices = np.array([[0, -1, -1], + [-1, 0, -1]]).astype('int32') + + self.updated_match_indices = np.array([[0, -1, -1], + [-1, -1, -1]]).astype('int32') + + self.neg_indices_lod = [[0, 1, 3]] + self.neg_indices = np.array([[2], [0], [2]]).astype('int32') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3b80d2359b083d30f9a5a7b8cc18aaf1ca5146c1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py @@ -0,0 +1,226 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +import copy +from op_test import OpTest + + +def iou(box_a, box_b): + """Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + + area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a) + area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + + box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) + box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + score_threshold: (float) The confidence thresh for filtering low + confidence boxes. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + top_k: (int) The maximum number of box preds to consider. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. + """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + selected_indices = np.argwhere(all_scores > score_threshold) + selected_indices = selected_indices.flatten() + all_scores = all_scores[selected_indices] + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + if top_k > -1 and top_k < sorted_indices.shape[0]: + sorted_indices = sorted_indices[:top_k] + sorted_scores = sorted_scores[:top_k] + + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou(boxes[idx], boxes[kept_idx]) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + +def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k): + class_num = scores.shape[0] + priorbox_num = scores.shape[1] + + selected_indices = {} + num_det = 0 + for c in range(class_num): + if c == background: continue + indices = nms(boxes, scores[c], score_threshold, nms_threshold, + nms_top_k) + selected_indices[c] = indices + num_det += len(indices) + + if keep_top_k > -1 and num_det > keep_top_k: + score_index = [] + for c, indices in selected_indices.iteritems(): + for idx in indices: + score_index.append((scores[c][idx], c, idx)) + + sorted_score_index = sorted( + score_index, key=lambda tup: tup[0], reverse=True) + sorted_score_index = sorted_score_index[:keep_top_k] + selected_indices = {} + + for _, c, _ in sorted_score_index: + selected_indices[c] = [] + for s, c, idx in sorted_score_index: + selected_indices[c].append(idx) + num_det = keep_top_k + + return selected_indices, num_det + + +def batched_multiclass_nms(boxes, scores, background, score_threshold, + nms_threshold, nms_top_k, keep_top_k): + batch_size = scores.shape[0] + + det_outs = [] + lod = [0] + for n in range(batch_size): + nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + lod.append(lod[-1] + nmsed_num) + if nmsed_num == 0: continue + + for c, indices in nmsed_outs.iteritems(): + for idx in indices: + xmin, ymin, xmax, ymax = boxes[idx][:] + det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + + return det_outs, lod + + +class TestMulticlassNMSOp(OpTest): + def set_argument(self): + self.score_threshold = 0.01 + + def setUp(self): + self.set_argument() + N = 7 + M = 1200 + C = 21 + BOX_SIZE = 4 + + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + + scores = np.random.random((N * M, C)).astype('float32') + + def softmax(x): + shiftx = x - np.max(x).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + scores = np.apply_along_axis(softmax, 1, scores) + scores = np.reshape(scores, (N, M, C)) + scores = np.transpose(scores, (0, 2, 1)) + + boxes = np.random.random((M, BOX_SIZE)).astype('float32') + boxes[:, 0:2] = boxes[:, 0:2] * 0.5 + boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5 + + nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + nmsed_outs = [-1] if not nmsed_outs else nmsed_outs + nmsed_outs = np.array(nmsed_outs).astype('float32') + + self.op_type = 'multiclass_nms' + self.inputs = {'BBoxes': boxes, 'Scores': scores} + self.outputs = {'Out': (nmsed_outs, [lod])} + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + } + + def test_check_output(self): + self.check_output() + + +class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): + def set_argument(self): + # Here set 2.0 to test the case there is no outputs. + # In practical use, 0.0 < score_threshold < 1.0 + self.score_threshold = 2.0 + + +class TestIOU(unittest.TestCase): + def test_iou(self): + box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32') + box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32') + + expt_output = np.array([2.0 / 16.0]).astype('float32') + calc_output = np.array([iou(box1, box2)]).astype('float32') + self.assertTrue(np.allclose(calc_output, expt_output)) + + +if __name__ == '__main__': + unittest.main()