diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..98356cd7613baff7f0cd66d1462068232b2b8500 --- /dev/null +++ b/benchmark/cluster/vgg16/Dockerfile @@ -0,0 +1,18 @@ +#FROM python:2.7.14 +FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04 +RUN apt-get update && apt-get install -y python +RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev +# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, +# so we must build one with distribute support to install in this image. +RUN pip install paddlepaddle +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python' +RUN pip uninstall -y paddlepaddle + +# below lines may change a lot for debugging +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl && \ +chmod +x /usr/bin/paddle_k8s +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD vgg16_fluid.py vgg16_v2.py /workspace/ diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..11d00b8f85382aa720c169338c51333b730d44d5 --- /dev/null +++ b/benchmark/cluster/vgg16/README.md @@ -0,0 +1,76 @@ +# Performance for Distributed vgg16 + +## Test Result + +### Hardware Infomation + +- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz +- cpu MHz : 2101.000 +- cache size : 20480 KB + +### Single Node Single Thread + +- PServer Count: 10 +- Trainer Count: 20 +- Metrics: samples / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 | +| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 | +| TensorFlow | - | - | - | - | + +### Different Batch Size + +- PServer Count: 10 +- Trainer Count: 20 +- Per trainer CPU Core: 1 +- Metrics: samples / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 | +| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 | +| TensorFlow | - | - | - | - | + + +### Accelerate Rate + +- Pserver Count: 20 +- Batch Size: 128 +- Metrics: samples / sec + +| Trainer Count | 20 | 40 | 80 | 100 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) | +| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) | +| TensorFlow | - | - | - | - | + +### Different Pserver Count + +- Trainer Count: 60 +- Batch Size: 128 +- Metrics: samples/ sec + +| PServer Count | 3 | 6 |10 | 20 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | +| TensorFlow | - | - | - | - | + +*The performance gap between Fuild and v2 comes from the network interference.* + + +## Steps to Run the Performance Test + +1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. +1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory. +1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it. +1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). +1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. + +Check the logs for the distributed training progress and analyze the performance. + +## Enable Verbos Logs + +Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail. diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee8b0763b62fc011f40f6197e929a68b48a93e47 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid_pserver.yaml @@ -0,0 +1,72 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: MKL_NUM_THREADS + value: "1" + - name: TRAINING_ROLE + value: "PSERVER" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + command: ["paddle_k8s", "start_fluid"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid_trainer.yaml @@ -0,0 +1,69 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_fluid"] + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: TRAINING_ROLE + value: "TRAINER" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd1271e0cf399184134c06b3200ee1202c65cef0 --- /dev/null +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -0,0 +1,64 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16v2job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16v2job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16v2job + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "python train.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + command: ["paddle_k8s", "start_pserver"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12c8964066cbcfe8d2a44de2f51a3d12ea422fe2 --- /dev/null +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -0,0 +1,65 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16v2job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16v2job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_trainer", "v2"] + env: + - name: PADDLE_JOB_NAME + value: vgg16v2job + - name: BATCH_SIZE + value: "256" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "2" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..499e06ec42fc8f840137173628fa465e0541ba30 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -0,0 +1,277 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.profiler as profiler +import argparse +import functools +import os + + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument('--device_id', type=int, default=0, help="The device id.") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--local', + type=str2bool, + default=True, + help='Whether to run as local mode.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + test_target = accuracy.metrics + accuracy.states + inference_program = fluid.io.get_inference_program(test_target) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimize_ops, params_grads = optimizer.minimize(avg_cost) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace( + args.device_id) + exe = fluid.Executor(place) + + # test + def test(exe): + accuracy.reset(exe) + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}) + + return accuracy.eval(exe) + + def train_loop(exe, trainer_prog): + iters = 0 + ts = time.time() + for pass_id in range(args.num_passes): + # train + start_time = time.time() + num_samples = 0 + accuracy.reset(exe) + with profiler.profiler("CPU", 'total') as prof: + for batch_id, data in enumerate(train_reader()): + ts = time.time() + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc = exe.run( + trainer_prog, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost] + accuracy.metrics) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" + % (pass_id, iters, loss, acc, time.time() - ts) + ) # The accuracy is the accumulation of batches, but not the current batch. + + pass_elapsed = time.time() - start_time + pass_train_acc = accuracy.eval(exe) + pass_test_acc = test(exe) + print( + "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" + % (pass_id, num_samples / pass_elapsed, pass_train_acc, + pass_test_acc)) + + if args.local: + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + train_loop(exe, fluid.default_main_program()) + else: + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, "6174"])) + pserver_endpoints = ",".join(eplist) + print("pserver endpoints: ", pserver_endpoints) + trainers = int(os.getenv("TRAINERS")) # total trainer count + print("trainers total: ", trainers) + current_endpoint = os.getenv( + "POD_IP") + ":6174" # current pserver endpoint + training_role = os.getenv( + "TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + pservers=pserver_endpoints, + trainers=trainers) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + print("starting server side startup") + exe.run(pserver_startup) + print("starting parameter server...") + exe.run(pserver_prog) + elif training_role == "TRAINER": + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else + paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + trainer_prog = t.get_trainer_program() + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver + exe.run(fluid.default_startup_program()) + train_loop(exe, trainer_prog) + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + print_arguments() + main() diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac6b3c33252e0a1f596f539fc090c5ada118e15 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_v2.py @@ -0,0 +1,154 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import gzip + +import paddle.v2.dataset.cifar as cifar +import paddle.v2 as paddle +import time +import os + +DATA_DIM = 3 * 32 * 32 +CLASS_DIM = 10 +BATCH_SIZE = os.getenv("BATCH_SIZE") +if BATCH_SIZE: + BATCH_SIZE = int(BATCH_SIZE) +else: + BATCH_SIZE = 128 +print "batch_size", BATCH_SIZE +NODE_COUNT = int(os.getenv("TRAINERS")) +ts = 0 + + +def vgg(input, nums, class_dim): + def conv_block(input, num_filter, groups, num_channels=None): + return paddle.networks.img_conv_group( + input=input, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + pool_type=paddle.pooling.Max()) + + assert len(nums) == 5 + # the channel of input feature is 3 + conv1 = conv_block(input, 64, nums[0], 3) + conv2 = conv_block(conv1, 128, nums[1]) + conv3 = conv_block(conv2, 256, nums[2]) + conv4 = conv_block(conv3, 512, nums[3]) + conv5 = conv_block(conv4, 512, nums[4]) + + fc_dim = 512 + fc1 = paddle.layer.fc(input=conv5, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=fc1, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + out = paddle.layer.fc(input=fc2, + size=class_dim, + act=paddle.activation.Softmax()) + return out + + +def vgg13(input, class_dim): + nums = [2, 2, 2, 2, 2] + return vgg(input, nums, class_dim) + + +def vgg16(input, class_dim): + nums = [2, 2, 3, 3, 3] + return vgg(input, nums, class_dim) + + +def vgg19(input, class_dim): + nums = [2, 2, 4, 4, 4] + return vgg(input, nums, class_dim) + + +def main(): + global ts + paddle.init(use_gpu=False) + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(DATA_DIM)) + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(CLASS_DIM)) + + extra_layers = None + # NOTE: for v2 distributed training need averaging updates. + learning_rate = 1e-3 / NODE_COUNT + out = vgg16(image, class_dim=CLASS_DIM) + cost = paddle.layer.classification_cost(input=out, label=lbl) + + # Create parameters + parameters = paddle.parameters.create(cost) + + # Create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * + BATCH_SIZE), + learning_rate=learning_rate / BATCH_SIZE, + learning_rate_decay_a=0.1, + learning_rate_decay_b=128000 * 35, + learning_rate_schedule="discexp", ) + + train_reader = paddle.batch( + paddle.reader.shuffle( + cifar.train10(), + # To use other data, replace the above line with: + # reader.train_reader('train.list'), + buf_size=1000), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + cifar.test10(), + # To use other data, replace the above line with: + # reader.test_reader('val.list'), + batch_size=BATCH_SIZE) + + # Create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer, + extra_layers=extra_layers, + is_local=False) + + # End batch and end pass event handler + def event_handler(event): + global ts, ts_pass + if isinstance(event, paddle.event.BeginPass): + ts_pass = time.time() + if isinstance(event, paddle.event.BeginIteration): + ts = time.time() + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + time.time() - ts) + if isinstance(event, paddle.event.EndPass): + print "Pass %d end, spent: %f" % (event.pass_id, + time.time() - ts_pass) + result = trainer.test(reader=test_reader) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + trainer.train( + reader=train_reader, num_passes=200, event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index ddf0b055a92d80295b24255a5462d477e0d9c796..29388f5005bf779a1bfa63c0d46d35996c0c792d 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -87,6 +87,11 @@ roi_pool .. autoclass:: paddle.v2.layer.roi_pool :noindex: +pad +---- +.. autoclass:: paddle.v2.layer.pad + :noindex: + Norm Layer ========== @@ -133,6 +138,11 @@ grumemory .. autoclass:: paddle.v2.layer.grumemory :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Recurrent Layer Group ===================== @@ -340,6 +350,11 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dropout +-------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + dot_prod --------- .. autoclass:: paddle.v2.layer.dot_prod @@ -402,6 +417,11 @@ scale_shift .. autoclass:: paddle.v2.layer.scale_shift :noindex: +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: + Sampling Layers =============== @@ -420,22 +440,6 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: -Factorization Machine Layer -============================ - -factorization_machine ---------------------- -.. autoclass:: paddle.v2.layer.factorization_machine - :noindex: - -Slicing and Joining Layers -========================== - -pad ----- -.. autoclass:: paddle.v2.layer.pad - :noindex: - .. _api_v2.layer_costs: Cost Layers @@ -526,6 +530,11 @@ multibox_loss .. autoclass:: paddle.v2.layer.multibox_loss :noindex: +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: + Check Layer ============ @@ -534,31 +543,10 @@ eos .. autoclass:: paddle.v2.layer.eos :noindex: -Miscs -===== - -dropout --------- -.. autoclass:: paddle.v2.layer.dropout - :noindex: - -Activation with learnable parameter -=================================== +Activation +========== prelu -------- .. autoclass:: paddle.v2.layer.prelu :noindex: - -gated_unit ------------ -.. autoclass:: paddle.v2.layer.gated_unit - :noindex: - -Detection output Layer -====================== - -detection_output ----------------- -.. autoclass:: paddle.v2.layer.detection_output - :noindex: diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst index 6a8ecc5bb1d855e0ded3719943ab3adb810de365..02e41564b1e48c07da6ac071fc4b60089169e05a 100644 --- a/doc/api/v2/data/dataset.rst +++ b/doc/api/v2/data/dataset.rst @@ -73,3 +73,10 @@ wmt14 .. automodule:: paddle.v2.dataset.wmt14 :members: :noindex: + +wmt16 ++++++ + +.. automodule:: paddle.v2.dataset.wmt16 + :members: + :noindex: diff --git a/doc/design/switch.md b/doc/design/switch.md new file mode 100644 index 0000000000000000000000000000000000000000..9db1b2782a521c2ff4b28b8f9efcdf1492242ed4 --- /dev/null +++ b/doc/design/switch.md @@ -0,0 +1,32 @@ +### Design Doc: Switch + +### Background + +Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. + +The following example shows the usage of `fluid.switch`. + +```python +a = fluid.Var(10) +b = fluid.Var(0) + +switch = fluid.switch() +with switch.block(): + with switch.case(fluid.less_equal(a, 10)): + fluid.print("Case 1") + with switch.case(fluid.larger(a, 0)): + fluid.print("Case 2") + with switch.default(): + fluid.print("Case 3") +``` + +### The Semantics + +1. A `switch` control-flow checks cases one-by-one. +1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values. +1. It runs the first matched case, or the default case if there is one. +1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case. + +The above program should print and print only "Case 1". + +The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches. diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc index 1510fb8abf54f05804bd404d9bd00ecc42fbef63..31ac72eda98859327f9857c18287398d0f459c7b 100644 --- a/paddle/framework/channel_test.cc +++ b/paddle/framework/channel_test.cc @@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) { { // MakeChannel should return a buffered channel is buffer_size > 0. auto ch = MakeChannel(10); - EXPECT_NE(dynamic_cast*>(ch), nullptr); - EXPECT_EQ(dynamic_cast*>(ch), nullptr); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); CloseChannel(ch); delete ch; } { // MakeChannel should return an un-buffered channel is buffer_size = 0. auto ch = MakeChannel(0); - EXPECT_EQ(dynamic_cast*>(ch), nullptr); - EXPECT_NE(dynamic_cast*>(ch), nullptr); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + EXPECT_NE(dynamic_cast *>(ch), nullptr); CloseChannel(ch); delete ch; } @@ -78,3 +78,132 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { t.join(); delete ch; } + +TEST(Channel, SimpleUnbufferedChannelTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + ch->Send(&i); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + ch->Receive(&recv); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(0); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked becausew of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + ch->Receive(&data); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the thread + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any senders waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(0); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked becausew of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data = 10; + ch->Send(&data); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the thread + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +TEST(Channel, UnbufferedLessReceiveMoreSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + // Send should block after three iterations + // since we only have three receivers. + std::thread t([&]() { + // Try to send more number of times + // than receivers + for (int i = 0; i < 4; i++) { + ch->Send(&i); + sum_send += i; + } + }); + for (int i = 0; i < 3; i++) { + int recv; + ch->Receive(&recv); + EXPECT_EQ(recv, i); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + EXPECT_EQ(sum_send, 3U); + + CloseChannel(ch); + t.join(); + delete ch; +} diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h index cc2d2e587eca981307d4e522bd569fbffa450207..0dc5afd7e57c1f59dfc1b86093eea231d46966f1 100644 --- a/paddle/framework/details/unbuffered_channel.h +++ b/paddle/framework/details/unbuffered_channel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include -#include #include #include "paddle/framework/channel.h" @@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel { virtual ~UnBuffered(); private: - UnBuffered() {} + std::mutex mu_ch_; + // Mutex for readers and writers who are waiting for other reader + // and writer to complete execution + std::recursive_mutex mu_read_, mu_write_; + // reader_found_ is set true when a reader is ready to accept data + // writer_found_ is set true when a writer is ready to send data + // A transaction occurs only when both are true + std::atomic reader_found_{false}, writer_found_{false}; + std::condition_variable cv_channel_; + std::condition_variable_any cv_reader_, cv_writer_; + T* item{nullptr}; + std::atomic closed_{false}; + + UnBuffered() : closed_(false) {} + + void NotifyAllParticipants(std::unique_lock*); }; +// This function implements the concept of how data should +// be sent from a writer to a reader. +template +void UnBuffered::Send(T* data) { + // Prevent other writers from entering + std::unique_lock writer_lock(mu_write_); + writer_found_ = true; + std::unique_lock cv_lock(mu_write_); + // If writer comes first, it should wait till a reader arrives + cv_writer_.wait(cv_lock, + [this]() { return reader_found_ == true || closed_; }); + cv_reader_.notify_one(); + if (!closed_) { + std::unique_lock channel_lock(mu_ch_); + item = data; + channel_lock.unlock(); + cv_channel_.notify_one(); + channel_lock.lock(); + cv_channel_.wait(channel_lock, + [this]() { return item == nullptr || closed_; }); + } + writer_found_ = false; +} + +// This function implements the concept of how +// data that was sent by a writer is read from a reader. template -void UnBuffered::Send(T* channel_element) {} +void UnBuffered::Receive(T* data) { + // Prevent other readers from entering + std::unique_lock read_lock{mu_read_}; + reader_found_ = true; + std::unique_lock cv_lock{mu_read_}; + // If reader comes first, it should wait till a writer arrives + cv_reader_.wait(cv_lock, + [this]() { return writer_found_ == true || closed_; }); + cv_writer_.notify_one(); + if (!closed_) { + std::unique_lock lock_ch{mu_ch_}; + // Reader should wait for the writer to first write its data + cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); + if (!closed_) { + *data = std::move(*item); + item = nullptr; + lock_ch.unlock(); + } + cv_channel_.notify_one(); + } + reader_found_ = false; +} +// This function implements the sequence of events +// that take place once the channel is closed. template -void UnBuffered::Receive(T*) {} +void UnBuffered::Close() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} +// This function implements the sequence of events +// that are executed once the object of an UnBuffered +// channel is destroyed. template -void UnBuffered::Close() {} +UnBuffered::~UnBuffered() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} +// This function notifies all the readers, writers and +// the channel condition variables. template -UnBuffered::~UnBuffered() {} +void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + cv_writer_.notify_all(); + cv_channel_.notify_all(); + cv_reader_.notify_all(); +} } // namespace details } // namespace framework diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index f8df2cf97ad532f06cb1393b1a24cd789f8bde29..f554c77845087453f8c6e4d04522a8555e583ae6 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool HasOutputs(const std::string &name) const override; - DDim GetInputDim(const std::string &name) const override; - - void SetOutputDim(const std::string &name, const DDim &dim) override; - AttrReader Attrs() const override; const std::vector &Inputs( @@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { return true; } -DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const { - std::vector ddims = GetInputsDim(name); - auto length = ddims.size(); - PADDLE_ENFORCE_EQ(length, 1UL, - "Input(%s) should have 1 value, " - "but it has %d now", - name, length); - return ddims[0]; -} - -void CompileTimeInferShapeContext::SetOutputDim(const std::string &name, - const DDim &dim) { - SetOutputsDim(name, {dim}); -} - AttrReader CompileTimeInferShapeContext::Attrs() const { return AttrReader(op_.GetAttrMap()); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 4e854f54dd43d760bab44fb5f7cafeb13314b27c..81fa8cf477423fc2a54c719c9a743729215513c3 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext { return true; } - DDim GetInputDim(const std::string& name) const override { - return GetDim(op_.Input(name)); - } - - void SetOutputDim(const std::string& name, const DDim& dim) override { - SetDim(op_.Output(name), dim); - } - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } const std::vector& Inputs( diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index e53cc0cdabc623ae358f1a3e21823a2f38ec3c62..a0fa467291bb42c59b65f5efeabe9c2235e15b2a 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -18,10 +18,18 @@ limitations under the License. */ namespace paddle { namespace framework { -std::vector InferShapeContext::GetInputsDim( +DDim InferShapeContext::GetInputDim(const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); +} + +std::vector InferShapeContext::GetInputsDim( const std::string &name) const { - const std::vector &names = Inputs(name); - return GetDims(names); + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); } DDim InferShapeContext::GetInputsElementDim(const std::string &name, @@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name, return this->GetDim(names[idx]); } -void InferShapeContext::SetOutputsDim( - const std::string &name, const std::vector &dims) { +void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); +} + +void InferShapeContext::SetOutputsDim(const std::string &name, + const std::vector &dims) { auto &names = Outputs(name); SetDims(names, dims); } -std::vector InferShapeContext::GetDims( +std::vector InferShapeContext::GetDims( const std::vector &names) const { - std::vector ret; + std::vector ret; ret.reserve(names.size()); std::transform( names.begin(), names.end(), std::back_inserter(ret), [this](const std::string &name) { return this->GetDim(name); }); return ret; } - void InferShapeContext::SetDims(const std::vector &names, - const std::vector &dims) { + const std::vector &dims) { size_t length = names.size(); PADDLE_ENFORCE_EQ(length, dims.size()); for (size_t i = 0; i < length; ++i) { diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index f93319d8f2fd4c5d388bd57fd595a6a5edd51775..830f199ed1451538f12fc8dd34fb7b2bfc356a71 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -35,14 +35,13 @@ class InferShapeContext { virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; - virtual framework::DDim GetInputDim(const std::string &name) const = 0; + DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; + std::vector GetInputsDim(const std::string &name) const; DDim GetInputsElementDim(const std::string &name, int idx) const; - virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; - void SetOutputsDim(const std::string &name, - const std::vector &dims); + void SetOutputDim(const std::string &name, const DDim &dim); + void SetOutputsDim(const std::string &name, const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -57,15 +56,13 @@ class InferShapeContext { // Note: In while op, we need this to be public void SetDims(const std::vector &names, - const std::vector &dims); + const std::vector &dims); protected: - virtual framework::DDim GetDim(const std::string &name) const = 0; - virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; - - std::vector GetDims( - const std::vector &names) const; + virtual DDim GetDim(const std::string &name) const = 0; + virtual void SetDim(const std::string &name, const DDim &dim) = 0; + std::vector GetDims(const std::vector &names) const; std::vector GetVarTypes( const std::vector &names) const; diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index cbdbf5335d32d55a0221728758025c9d2cb3e7d1..a9876cec2aabf7d116443b685391ee9d20bc1370 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -178,19 +178,22 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + real* colData = NULL; bool needIm2col = isNeedIm2col(filter); TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape; - real* colData = NULL; - size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; - size_t colWidth = outputHeight * outputWidth; - // Max col matrix height 256, Max col matrix width 1024 - size_t stepColHeight = std::min(colHeight, static_cast(256)); - size_t stepColWidth = std::min(colWidth, static_cast(2048)); + // Max col matrix width 4096, Max col matrix size 4M. + size_t outputHeightSteps = + std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); + size_t maxColWidth = outputHeightSteps * outputWidth; + size_t channelSteps = + std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, + (size_t)1), + inputChannels / groups_); + size_t maxColHeight = channelSteps * filterHeight * filterWidth; if (needIm2col) { colShape = TensorShape({inputChannels / groups_, @@ -199,7 +202,7 @@ public: outputHeight, outputWidth}); - resizeBuffer(stepColHeight * stepColWidth * sizeof(real)); + resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); colData = reinterpret_cast(memory_->getBuf()); } @@ -209,20 +212,24 @@ public: (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; - int nStride = colWidth; - int kStride = colHeight; + int nStride = outputHeight * outputWidth; + int kStride = inputChannels / groups_ * filterHeight * filterWidth; for (size_t i = 0; i < batchSize; i++) { + filterData = inputs[1].data(); for (size_t g = 0; g < groups_; g++) { if (needIm2col) { real beta_ = beta; - for (size_t colHeightStart = 0; colHeightStart < colHeight; - colHeightStart += stepColHeight) { - for (size_t colWidthStart = 0; colWidthStart < colWidth; - colWidthStart += stepColWidth) { - int N = std::min(colWidth - colWidthStart, stepColWidth); - int K = std::min(colHeight - colHeightStart, stepColHeight); + for (size_t ic = 0; ic < inputChannels / groups_; + ic += channelSteps) { + int channels = std::min(inputChannels / groups_ - ic, channelSteps); + for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { + int height = std::min(outputHeight - oh, outputHeightSteps); + + int M = outputChannels / groups_; + int N = height * outputWidth; + int K = channels * filterHeight * filterWidth; // im2col - im2col(inputData + g * inputOffset, + im2col(inputData, imShape, colData, colShape, @@ -232,13 +239,12 @@ public: paddingW(), dilationH(), dilationW(), - colHeightStart, - K, - colWidthStart, + channels, + oh, + height, N); // gemm - int M = outputChannels / groups_; BlasGemm::compute( false, false, @@ -246,12 +252,12 @@ public: N, K, 1.0f, - filterData + g * filterOffset + colHeightStart, + filterData + ic * filterHeight * filterWidth, kStride, colData, N, beta_, - outputData + g * outputOffset + colWidthStart, + outputData + oh * outputWidth, nStride); } beta_ = 1.0; @@ -266,17 +272,18 @@ public: N, K, 1.0f, - filterData + g * filterOffset, + filterData, K, - inputData + g * inputOffset, + inputData, N, beta, - outputData + g * outputOffset, + outputData, N); } + inputData += inputOffset; + outputData += outputOffset; + filterData += filterOffset; } - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; } memory_.reset(); diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 36a9bcf84e4b14965c83627821b71d1c7c0da1b2..915119e291caaa223249cf8e37078723621517b0 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -111,39 +111,42 @@ public: int paddingWidth, int dilationHeight, int dilationWidth, - int colHeightStart, - int colHeightSize, - int colWidthStart, - int colWidthSize) { + int inputChannels, + int colOffset, + int colOutputHeight, + int colWidth) { int inputHeight = imShape[1]; int inputWidth = imShape[2]; int filterHeight = colShape[1]; int filterWidth = colShape[2]; int outputWidth = colShape[4]; - for (int colh = 0; colh < colHeightSize; colh++) { - int wOffset = (colHeightStart + colh) % filterWidth; - int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; - int c_im = (colHeightStart + colh) / filterWidth / filterHeight; - - for (int colw = 0; colw < colWidthSize; colw++) { - int h = (colWidthStart + colw) / outputWidth; - int w = (colWidthStart + colw) % outputWidth; - - int imRowIdx = h * strideHeight + hOffset * dilationHeight; - int imColIdx = w * strideWidth + wOffset * dilationWidth; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[colh * colWidthSize + colw] = static_cast(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[colh * colWidthSize + colw] = - imData[imRowIdx * inputWidth + imColIdx]; + for (int ic = 0; ic < inputChannels; ic++) { + for (int oh = 0; oh < colOutputHeight; oh++) { + T* dstData = colData + oh * outputWidth; + for (int fh = 0; fh < filterHeight; fh++) { + for (int fw = 0; fw < filterWidth; fw++) { + int imRowIdx = (oh + colOffset) * strideHeight + + fh * dilationHeight - paddingHeight; + if (imRowIdx < 0 || imRowIdx >= inputHeight) { + memset(dstData, 0, outputWidth * sizeof(T)); + } else { + for (int ow = 0; ow < outputWidth; ow++) { + int imColIdx = + ow * strideWidth + fw * dilationWidth - paddingWidth; + if (imColIdx < 0 || imColIdx >= inputWidth) { + dstData[ow] = T(0); + } else { + dstData[ow] = imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + dstData += colWidth; + } } } + colData += filterHeight * filterWidth * colWidth; + imData += inputHeight * inputWidth; } } }; diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 3ba866dcdd845403d52f7a85adfef08cbb11c305..fe44a8bf79005efb87c56f6a79f46421129bab22 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() { padding, dilation, dilation, + channels, 0, - height, - 0, - width); + outputHeight, + outputHeight * outputWidth); autotest::TensorCheckEqual(*output1, *output2); } diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt index d3798fb8fd8769aef5940d4ce724cb0cc8686422..0e987eb0240301c58cfb74c9e995d3b525130125 100644 --- a/paddle/inference/tests/book/CMakeLists.txt +++ b/paddle/inference/tests/book/CMakeLists.txt @@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp DEPS ARCHIVE_START paddle_fluid ARCHIVE_END ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model) set_tests_properties(test_inference_recognize_digits_mlp - PROPERTIES DEPENDS test_recognize_digits_mlp_cpu) + PROPERTIES DEPENDS test_recognize_digits) diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 1ec4336cabbc7d3073b7638b7484bf61e83a2dc5..cc86b12be08ba987f9682ebf3fda56c2f07fb576 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, CHECK_EQ(channels * outLength, maskMatP->getWidth()); } - /* initialize the data_ */ - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - outData[i * outStride + j] = -(real)FLT_MAX; - } - } - /* pool max one by one */ for (size_t n = 0; n < num; ++n) { // frame by frame if (!isContiguous()) { @@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); + int hend = hstart + sizeY; + hstart = hstart < 0 ? 0 : hstart; + hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; for (size_t pw = 0; pw < outputW; ++pw) { int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); + int wend = wstart + sizeX; + wstart = wstart < 0 ? 0 : wstart; + wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; if (maskData == NULL) { + real tmp = -(real)FLT_MAX; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - outData[ph * outputW + pw] = std::max( - outData[ph * outputW + pw], inputData[h * imgSizeW + w]); + tmp = tmp < inputData[h * imgSizeW + w] + ? inputData[h * imgSizeW + w] + : tmp; } } + outData[ph * outputW + pw] = tmp; } else { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc index 83c8778fe4cec4d9d80de691e117a39fdd92f494..1e6fa2091de25218e2bdafeb740ce884234638a5 100644 --- a/paddle/operators/bipartite_match_op.cc +++ b/paddle/operators/bipartite_match_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("DistMat"), "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); auto dims = ctx->GetInputDim("DistMat"); PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); ctx->SetOutputDim("ColToRowMatchIndices", dims); - ctx->SetOutputDim("ColToRowMatchDis", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); } }; @@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* dist_mat = context.Input("DistMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); - auto* match_dist = context.Output("ColToRowMatchDis"); + auto* match_dist = context.Output("ColToRowMatchDist"); auto& dev_ctx = context.device_context(); @@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "Otherwise, it means B[j] is matched to row " "ColToRowMatchIndices[i][j] in i-th instance. The row number of " "i-th instance is saved in ColToRowMatchIndices[i][j]."); - AddOutput("ColToRowMatchDis", + AddOutput("ColToRowMatchDist", "(Tensor) A 2-D Tensor with shape [N, M] in float type. " "N is batch size. If ColToRowMatchIndices[i][j] is -1, " - "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchIndices[i][j] = d, and the row offsets of each " "instance are called LoD. Then " - "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); AddComment(R"DOC( This operator is a greedy bipartite matching algorithm, which is used to obtain the matching with the maximum distance based on the input diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..539813d4858b8faef386047f9ef64aa232aefca1 --- /dev/null +++ b/paddle/operators/box_coder_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/box_coder_op.h" + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBoxVar must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + + GetBoxCodeType(ctx->Attrs().Get("code_type")); + + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance."); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the box if the input " + "is image feature map, they are close to the origin of the coordinate " + "system. [xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddOutput( + "OutputBox", + "(LoDTensor or Tensor) " + "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] " + "representing the result of N target boxes encoded/decoded with " + "M Prior boxes and variances."); + + AddComment(R"DOC( +Bounding Box Coder Operator. +Encode/Decode the target bounding box with the priorbox information. +The Encoding schema described below: +ox = (tx - px) / pw / pxv +oy = (ty - py) / ph / pyv +ow = log(abs(tw / pw)) / pwv +oh = log(abs(th / ph)) / phv +The Decoding schema described below: +ox = (pw * pxv * tx * + px) - tw / 2 +oy = (ph * pyv * ty * + py) - th / 2 +ow = exp(pwv * tw) * pw + tw / 2 +oh = exp(phv * th) * ph + th / 2 +where tx, ty, tw, th denote the target box's center coordinates, width and +height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) +center coordinates, width and height. pxv, pyv, pwv, phv denote the variance +of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, +width and height. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); +REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..98bd93457fafb49f2af5e1ff258fbfa9f9985600 --- /dev/null +++ b/paddle/operators/box_coder_op.cu @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/box_coder_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = + target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; + T target_box_height = + target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + + output[idx * len] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[col_idx * len]; + output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / + prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / + prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / + prior_box_var_data[col_idx * len + 3]; + } +} + +template +__global__ void DecodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + target_box_data[row_idx * len + 2]) * + prior_box_width; + T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_data[row_idx * len + 3]) * + prior_box_height; + T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_data[row_idx * len] * + prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_data[row_idx * len + 1] * + prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = target_box_center_x + target_box_width / 2; + output[idx * len + 3] = target_box_center_y + target_box_height / 2; + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T* prior_box_data = prior_box->data(); + const T* prior_box_var_data = prior_box_var->data(); + const T* target_box_data = target_box->data(); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..086251f6e066f082743f332ce72918c6e572ce19 --- /dev/null +++ b/paddle/operators/box_coder_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string& type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } + } + void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[i * len] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[i * len + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[i * len + 2]) * + prior_box_width; + T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[i * len + 3]) * + prior_box_height; + + size_t offset = i * col * len + j * len; + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + T* output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index 9c655d6c0d8e5fe04ee6d85f7e9d9da68105230c..b275fd75b3512343825170fc38565dd27f7f1c75 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -54,7 +54,15 @@ class CompareOpKernel public: void Compute(const framework::ExecutionContext& context) const override { using T = typename Functor::ELEM_TYPE; - ElementwiseComputeEx(context); + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + z); } }; diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index a8389429f26c17ceab1db22175c90888546ead6f..c32288d6984f126f2374a13973541f4f663b25a4 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -28,7 +28,14 @@ template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -92,9 +99,19 @@ template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseAddBroadCastGradFunctor, - ElementwiseAddBroadCast2GradFunctor>(ctx); + ElementwiseAddBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index ef26cb6c914f50ded07cc9d0d8de3f49f2151129..07ebade31ff5b3d5c89156e28ff5fa0670a9a842 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -28,7 +28,14 @@ template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -111,9 +118,19 @@ template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseDivBroadCastGradFunctor, - ElementwiseDivBroadCast2GradFunctor>(ctx); + ElementwiseDivBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h index 255728e8e620665a7de225b228c19d6c510da1c8..717e45ab31db9b9a6629fb33e17654dbf986d8c5 100644 --- a/paddle/operators/elementwise_max_op.h +++ b/paddle/operators/elementwise_max_op.h @@ -28,7 +28,14 @@ template class ElementwiseMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMaxBroadCastGradFunctor, - ElementwiseMaxBroadCast2GradFunctor>(ctx); + ElementwiseMaxBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h index e6627a0f1bb468c8e4661b83489cb964b72dddb0..0de9a91c52b0ab82cd62604de318ce68e56b767d 100644 --- a/paddle/operators/elementwise_min_op.h +++ b/paddle/operators/elementwise_min_op.h @@ -28,7 +28,14 @@ template class ElementwiseMinKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMinGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMinBroadCastGradFunctor, - ElementwiseMinBroadCast2GradFunctor>(ctx); + ElementwiseMinBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4b86b00b5a095ae898f9ce0c17cde2cc91060ba9..ae7a71e0244dfb8ad3e55683ac081f92bc36bea5 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -27,7 +27,14 @@ template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMulBroadCastGradFunctor, - ElementwiseMulBroadCast2GradFunctor>(ctx); + ElementwiseMulBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index d749b8e8757d0d433be05876779ccc22b95ca80b..213fe1f5a818873e8b666464cb112637261c598c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV); template -void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); +void ElementwiseGradCompute(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, int axis, + framework::Tensor* dx, framework::Tensor* dy) { auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { dx->mutable_data(ctx.GetPlace()); } @@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { x_dims = framework::make_ddim(extended_dims); } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); int pre, n, post; @@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { template -void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); +void ElementwiseComputeEx(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, int axis, + framework::Tensor* z) { TransformFunctor functor( x, y, z, ctx.template device_context(), Functor()); @@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { x_dims = framework::make_ddim(extended_dims); } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h index 6019e709e0db0fd62b4d3350bb768095f87ef241..874fd3f09f2afaccfbfca75799cc3448f7393b03 100644 --- a/paddle/operators/elementwise_pow_op.h +++ b/paddle/operators/elementwise_pow_op.h @@ -29,7 +29,14 @@ template class ElementwisePowKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index a2aca793026189ec87e00b52d7c351689f870400..c2749a8e6ba689233dab4f3c72de10bf01f39fab 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -27,7 +27,14 @@ template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -93,9 +100,19 @@ template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseSubBroadCastGradFunctor, - ElementwiseSubBroadCast2GradFunctor>(ctx); + ElementwiseSubBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..051cc24706d69ec4f38524af1dd510bf079c74c7 --- /dev/null +++ b/paddle/operators/mine_hard_examples_op.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum MiningType { kNone = 0, kMaxNegative, kHardExample }; + +template +bool SortScoreDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_dist, + const float neg_dist_threshold) { + if (mining_type == MiningType::kMaxNegative) { + return match_idx == -1 && match_dist < neg_dist_threshold; + } else if (mining_type == MiningType::kHardExample) { + return true; + } else { + return false; + } +} + +inline MiningType GetMiningType(std::string str) { + if (str == "max_negative") { + return MiningType::kMaxNegative; + } else if (str == "hard_example") { + return MiningType::kHardExample; + } else { + return MiningType::kNone; + } +} + +template +class MineHardExamplesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); + float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); + T neg_dist_threshold = + static_cast(ctx.Attr("neg_dist_threshold")); + int sample_size = ctx.Attr("sample_size"); + MiningType mining_type = + GetMiningType(ctx.Attr("mining_type")); + + auto out_neg_indices = ctx.Output("NegIndices"); + auto out_match_indices = + ctx.Output("UpdatedMatchIndices"); + + framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices); + + int batch_size = in_matched_indices->dims()[0]; + int prior_num = in_matched_indices->dims()[1]; + + auto match_indices = framework::EigenMatrix::From(*in_matched_indices); + + auto match_indices_et = + framework::EigenMatrix::From(*out_match_indices); + + auto match_dist = framework::EigenMatrix::From(*in_match_dist); + + const T* cls_loss = in_cls_loss->data(); + const T* loc_loss = nullptr; + if (in_loc_loss) { + loc_loss = in_loc_loss->data(); + } + + std::vector> all_neg_indices; + std::vector batch_starts = {0}; + for (int n = 0; n < batch_size; ++n) { + std::vector> loss_idx; + int neg_sel = 0; + for (int m = 0; m < prior_num; ++m) { + if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m), + neg_dist_threshold)) { + T loss = cls_loss[n * prior_num + m]; + if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { + loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; + } + loss_idx.push_back(std::make_pair(loss, m)); + ++neg_sel; + } + } + + if (mining_type == MiningType::kMaxNegative) { + int num_pos = 0; + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) != -1) ++num_pos; + } + neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel); + } else if (mining_type == MiningType::kHardExample) { + neg_sel = std::min(sample_size, neg_sel); + } + + std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); + std::set sel_indices; + std::vector neg_indices; + std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel, + std::inserter(sel_indices, sel_indices.begin()), + [](std::pair& l) -> int { + return static_cast(l.second); + }); + + if (mining_type == MiningType::kHardExample) { + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) > -1) { + if (sel_indices.find(m) == sel_indices.end()) { + match_indices_et(n, m) = -1; + } + } else { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + } + } + } + } else { + neg_indices.resize(sel_indices.size()); + std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); + } + + all_neg_indices.push_back(neg_indices); + batch_starts.push_back(batch_starts.back() + neg_indices.size()); + } + + framework::LoD out_neg_indices_lod; + out_neg_indices_lod.emplace_back(batch_starts); + int neg_offset = 0; + auto neg_data = out_neg_indices->mutable_data( + framework::make_ddim({static_cast(batch_starts.back()), 1}), + ctx.GetPlace()); + + for (auto neg_indices : all_neg_indices) { + std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); + neg_offset += neg_indices.size(); + } + out_neg_indices->set_lod(out_neg_indices_lod); + return; + } +}; + +class MineHardExamplesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ClsLoss"), + "Input(ClsLoss) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchDist"), + "Input(MatchDist) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegIndices"), + "Output(NegIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"), + "Output(UpdatedMatchIndices) of MineHardExamplesOp should " + "not be null."); + + auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); + auto idx_dims = ctx->GetInputDim("MatchIndices"); + auto dis_dims = ctx->GetInputDim("MatchDist"); + + PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL, + "The shape of ClsLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL, + "The shape of MatchIndices is [N, Np]."); + PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL, + "The shape of MatchDist is [N, Np]."); + + if (ctx->HasInput("LocLoss")) { + auto loc_loss_dims = ctx->GetInputDim("LocLoss"); + PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL, + "The shape of LocLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0], + "Batch size of ClsLoss and LocLoss must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], loc_loss_dims[1], + "Prior box number of ClsLoss and LocLoss must be the same."); + } + + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], idx_dims[0], + "Batch size of ClsLoss and MatchIndices must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchIndices must be the same."); + + PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0], + "Batch size of ClsLoss and MatchDist must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchDist must be the same."); + + auto mining_type = + GetMiningType(ctx->Attrs().Get("mining_type")); + + PADDLE_ENFORCE_NE(mining_type, MiningType::kNone, + "mining_type must be hard_example or max_negative"); + + if (mining_type == MiningType::kMaxNegative) { + auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); + auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); + PADDLE_ENFORCE_GT( + neg_pos_ratio, 0.0f, + "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_GT( + neg_dist_threshold, 0.0f, + "neg_dist_threshold must greater than zero in max_negative mode"); + } else if (mining_type == MiningType::kHardExample) { + auto sample_size = ctx->Attrs().Get("sample_size"); + PADDLE_ENFORCE_GT( + sample_size, 0, + "sample_size must greater than zero in hard_example mode"); + } + + ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("ClsLoss")->type()), + ctx.device_context()); + } +}; + +class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "ClsLoss", + "(Tensor, default Tensor), The classification loss with shape " + "[N, Np], N is the batch size and Np is the number of prior box."); + AddInput("LocLoss", + "(Tensor, optional, default Tensor), The localization loss " + "with shape [N, Np], N is the batch size and Np is the number of " + "prior box.") + .AsDispensable(); + AddInput("MatchIndices", + "(Tensor, Tensor), Matched indices with shape [N, Np], N is " + "the batch size and Np is the number of prior box. " + "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " + "instance does not match any entity, otherwise means it is " + "matched to row."); + AddInput("MatchDist", + "(Tensor, default Tensor) Matched indices with shape [N, " + "Np], N is the batch size and Np is the number of prior box."); + AddAttr("neg_pos_ratio", + "(float) The ratio of the negative box to the positive " + "box. Use only when mining_type is max_negative.") + .SetDefault(1.0); + AddAttr("neg_dist_threshold", + "(float) The negative overlap upper bound for the unmatched " + "predictions. Use only when mining_type is max_negative.") + .SetDefault(0.5); + AddAttr("sample_size", + "(float) The max sample size of negative box. Use only when " + "mining_type is hard_example.") + .SetDefault(0); + AddAttr("mining_type", + "(float) The mining algorithm name, the value is " + "hard_example or max_negative.") + .SetDefault("max_negative") + .InEnum({"hard_example", "max_negative"}); + + AddOutput( + "NegIndices", + "(LoDTensor) The output of negative example indices. a LoDTensor " + "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " + "and each element is the prior box index. " + "For example, the batch size is 2, the lod is [[0, 1, 2]], " + "the sample 0's box 1(MatchIndices[0][1]) is selected, " + "and sample 1's box 0 is selected. The output NegIndices is " + "[[1], [0]]."); + + AddOutput("UpdatedMatchIndices", + "(Tensor) The output of updated MatchIndices, a tensor with " + "shape [N, Np]. Only update when mining_type is " + "hard_example. The input MatchIndices elements will be update to " + "-1 when it is not in the candidate high loss list of negative " + "examples."); + + AddComment(R"DOC( +Mine hard examples Operator. +This operator implements hard example mining to select a subset of negative box indices. +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of +MatchIndices elements with value -1. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker); + +REGISTER_OP_CPU_KERNEL( + mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a65fe69f1534112923c72e17969f808b9310735 --- /dev/null +++ b/paddle/operators/multiclass_nms_op.cc @@ -0,0 +1,384 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +constexpr int64_t kOutputDim = 6; +constexpr int64_t kBBoxSize = 4; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + + PADDLE_ENFORCE_EQ(box_dims.size(), 2, + "The rank of Input(BBoxes) must be 2."); + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3."); + PADDLE_ENFORCE_EQ(box_dims[1], 4, + "The 2nd dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2], + "The 1st dimensiong of Input(BBoxes) must be equal to " + "3rd dimension of Input(Scores), which represents the " + "predicted bboxes."); + + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. + ctx->SetOutputDim("Out", {box_dims[0], 6}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Scores")->type()), + ctx.device_context()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < sorted_indices->size()) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices) const { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (int k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + std::map>& indices, + int& num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, + &(indices[c])); + num_det += indices[c].size(); + } + + num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (int j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (int j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(indices); + num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + std::map>& selected_indices, + Tensor* outs) const { + int predict_dim = scores.dims()[1]; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (int j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + odata[count * kOutputDim] = label; // label + odata[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + + int64_t batch_size = score_dims[0]; + int64_t class_num = score_dims[1]; + int64_t predict_dim = score_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1}, ctx.GetPlace()); + od[0] = -1; + } else { + outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, *boxes, all_indices[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("BBoxes", + "(Tensor) A 2-D Tensor with shape [M, 4] represents the " + "predicted locations of M bounding bboxes. Each bounding box " + "has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax]."); + AddInput("Scores", + "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. For each category " + "there are total M scores which corresponding M bounding boxes. " + " Please note, M is equal to the 1st dimension of BBoxes. "); + AddAttr( + "background_label", + "(int64_t, defalut: 0) " + "The index of background label, the background label will be ignored. " + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences aftern the filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, defalut: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], No is the total " + "number of detections in this mini-batch. For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to do multi-class non maximum suppression (NMS) on a batched +of boxes and scores. + +In the NMS step, this operator greedily selects a subset of detection bounding +boxes that have high scores larger than score_threshold, if providing this +threshold, then selects the largest nms_top_k confidences scores if nms_top_k +is larger than -1. Then this operator pruns away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. + +Aftern NMS step, at most keep_top_k number of total bboxes are to be kept +per image if keep_top_k is larger than -1. + +This operator support multi-class and batched inputs. It applying NMS +independently for each class. The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bbox for this image. If there is no detected boxes +for all images, all the elements in LoD are 0, and the Out only contains one +value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 2fdd25dbbe68659f8a0a9da13a87148ed259127a..a744ebd61595403ee495a2e2c9e84181422e92ff 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase { auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); + PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); @@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase { sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } + dev_ctx.Wait(); + const_cast(scope).DeleteScope(&cur_scope); } } }; diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py new file mode 100644 index 0000000000000000000000000000000000000000..d37935244291d15da8199bd3d3979118e4944b48 --- /dev/null +++ b/python/paddle/v2/fluid/debuger.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from graphviz import GraphPreviewGenerator +import proto.framework_pb2 as framework_pb2 + + +def draw_block_graphviz(block, highlights=None, path="./temp.dot"): + ''' + Generate a debug graph for block. + Args: + block(Block): a block. + ''' + graph = GraphPreviewGenerator("some graph") + # collect parameters and args + protostr = block.desc.serialize_to_string() + desc = framework_pb2.BlockDesc.FromString(str(protostr)) + + def need_highlight(name): + if highlights is None: return False + for pattern in highlights: + assert type(pattern) is str + if re.match(pattern, name): + return True + return False + + # draw parameters and args + vars = {} + for var in desc.vars: + shape = [str(i) for i in var.lod_tensor.tensor.dims] + if not shape: + shape = ['null'] + # create var + if var.persistable: + varn = graph.add_param( + var.name, var.type, shape, highlight=need_highlight(var.name)) + else: + varn = graph.add_arg(var.name, highlight=need_highlight(var.name)) + vars[var.name] = varn + + def add_op_link_var(op, var, op2var=False): + for arg in var.arguments: + if arg not in vars: + # add missing variables as argument + vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg)) + varn = vars[arg] + highlight = need_highlight(op.description) or need_highlight( + varn.description) + if op2var: + graph.add_edge(op, varn, highlight=highlight) + else: + graph.add_edge(varn, op, highlight=highlight) + + for op in desc.ops: + opn = graph.add_op(op.type, highlight=need_highlight(op.type)) + for var in op.inputs: + add_op_link_var(opn, var, False) + for var in op.outputs: + add_op_link_var(opn, var, True) + + graph(path, show=True) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 7f5187d29984482d91b5709bf4514d013028767a..69cbebe41e57309bb0993148833836b715e417ce 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -451,9 +451,8 @@ class Operator(object): if not given == need: raise ValueError(("Incorrect setting for output(s) of " "operator \"%s\". Need: [%s] Given: [%s]") % - (type, ", ".join(str(e) - for e in need), ", ".join( - str(e) for e in given))) + (type, ", ".join(str(e) for e in need), + ", ".join(str(e) for e in given))) for out_proto in proto.outputs: out_args = outputs[out_proto.name] diff --git a/python/paddle/v2/fluid/graphviz.py b/python/paddle/v2/fluid/graphviz.py new file mode 100644 index 0000000000000000000000000000000000000000..5881119c39231282b5654cd60720a1d8a7877896 --- /dev/null +++ b/python/paddle/v2/fluid/graphviz.py @@ -0,0 +1,272 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import subprocess +import logging + + +def crepr(v): + if type(v) is str or type(v) is unicode: + return '"%s"' % v + return str(v) + + +class Rank(object): + def __init__(self, kind, name, priority): + ''' + kind: str + name: str + priority: int + ''' + self.kind = kind + self.name = name + self.priority = priority + self.nodes = [] + + def __str__(self): + if not self.nodes: + return '' + + return '{' + 'rank={};'.format(self.kind) + \ + ','.join([node.name for node in self.nodes]) + '}' + + +class Graph(object): + rank_counter = 0 + + def __init__(self, title, **attrs): + self.title = title + self.attrs = attrs + self.nodes = [] + self.edges = [] + self.rank_groups = {} + + def code(self): + return self.__str__() + + def rank_group(self, kind, priority): + name = "rankgroup-%d" % Graph.rank_counter + Graph.rank_counter += 1 + rank = Rank(kind, name, priority) + self.rank_groups[name] = rank + return name + + def node(self, label, prefix, description="", **attrs): + node = Node(label, prefix, description, **attrs) + + if 'rank' in attrs: + rank = self.rank_groups[attrs['rank']] + del attrs['rank'] + rank.nodes.append(node) + self.nodes.append(node) + return node + + def edge(self, source, target, **attrs): + edge = Edge(source, target, **attrs) + self.edges.append(edge) + return edge + + def compile(self, dot_path): + file = open(dot_path, 'w') + file.write(self.__str__()) + image_path = os.path.join( + os.path.dirname(__file__), dot_path[:-3] + "pdf") + cmd = ["dot", "-Tpdf", dot_path, "-o", image_path] + subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + logging.warning("write block debug graph to {}".format(image_path)) + return image_path + + def show(self, dot_path): + image = self.compile(dot_path) + cmd = ["open", image] + subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + def _rank_repr(self): + ranks = sorted( + self.rank_groups.items(), + cmp=lambda a, b: a[1].priority > b[1].priority) + repr = [] + for x in ranks: + repr.append(str(x[1])) + return '\n'.join(repr) + '\n' + + def __str__(self): + reprs = [ + 'digraph G {', + 'title = {}'.format(crepr(self.title)), + ] + + for attr in self.attrs: + reprs.append("{key}={value};".format( + key=attr, value=crepr(self.attrs[attr]))) + + reprs.append(self._rank_repr()) + + random.shuffle(self.nodes) + reprs += [str(node) for node in self.nodes] + + for x in self.edges: + reprs.append(str(x)) + + reprs.append('}') + return '\n'.join(reprs) + + +class Node(object): + counter = 1 + + def __init__(self, label, prefix, description="", **attrs): + self.label = label + self.name = "%s_%d" % (prefix, Node.counter) + self.description = description + self.attrs = attrs + Node.counter += 1 + + def __str__(self): + reprs = '{name} [label={label} {extra} ];'.format( + name=self.name, + label=self.label, + extra=',' + ','.join("%s=%s" % (key, crepr(value)) + for key, value in self.attrs.items()) + if self.attrs else "") + return reprs + + +class Edge(object): + def __init__(self, source, target, **attrs): + ''' + Link source to target. + :param source: Node + :param target: Node + :param graph: Graph + :param attrs: dic + ''' + self.source = source + self.target = target + self.attrs = attrs + + def __str__(self): + repr = "{source} -> {target} {extra}".format( + source=self.source.name, + target=self.target.name, + extra="" if not self.attrs else + "[" + ','.join("{}={}".format(attr[0], crepr(attr[1])) + for attr in self.attrs.items()) + "]") + return repr + + +class GraphPreviewGenerator(object): + ''' + Generate a graph image for ONNX proto. + ''' + + def __init__(self, title): + # init graphviz graph + self.graph = Graph( + title, + layout="dot", + concentrate="true", + rankdir="TB", ) + + self.op_rank = self.graph.rank_group('same', 2) + self.param_rank = self.graph.rank_group('same', 1) + self.arg_rank = self.graph.rank_group('same', 0) + + def __call__(self, path='temp.dot', show=False): + if not show: + self.graph.compile(path) + else: + self.graph.show(path) + + def add_param(self, name, data_type, shape, highlight=False): + label = '\n'.join([ + '<', + ' ', + ' ', + ' ', + ' ', + ' ' + ' ', + ' ', + ' ' + ' ', + '
', + ' ', + name, + ' ', + '
', + str(data_type), + '
', + '[%s]' % 'x'.join(shape), + '
>', + ]) + return self.graph.node( + label, + prefix="param", + description=name, + shape="none", + style="rounded,filled,bold", + width="1.3", + color="#148b97" if not highlight else "orange", + fontcolor="#ffffff", + fontname="Arial") + + def add_op(self, opType, **kwargs): + highlight = False + if 'highlight' in kwargs: + highlight = kwargs['highlight'] + del kwargs['highlight'] + return self.graph.node( + "<%s>" % opType, + prefix="op", + description=opType, + shape="box", + style="rounded, filled, bold", + color="#303A3A" if not highlight else "orange", + fontname="Arial", + fontcolor="#ffffff", + width="1.3", + height="0.84", ) + + def add_arg(self, name, highlight=False): + return self.graph.node( + crepr(name), + prefix="arg", + description=name, + shape="box", + style="rounded,filled,bold", + fontname="Arial", + fontcolor="#999999", + color="#dddddd" if not highlight else "orange") + + def add_edge(self, source, target, **kwargs): + highlight = False + if 'highlight' in kwargs: + highlight = kwargs['highlight'] + del kwargs['highlight'] + return self.graph.edge( + source, + target, + color="#00000" if not highlight else "orange", + **kwargs) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index aaf096f0dd5ed9b2dc6004d39e337b3efe879da0..a79479f469a0c489edf2676bc5d07066bb480664 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1485,7 +1485,9 @@ def batch_norm(input, param_attr=None, bias_attr=None, data_layout='NCHW', - name=None): + name=None, + moving_mean_name=None, + moving_variance_name=None): """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. @@ -1515,6 +1517,7 @@ def batch_norm(input, attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) mean = helper.create_global_variable( + name=moving_mean_name, dtype=input.dtype, shape=param_shape, persistable=True, @@ -1522,6 +1525,7 @@ def batch_norm(input, helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) variance = helper.create_global_variable( + name=moving_variance_name, dtype=input.dtype, shape=param_shape, persistable=True, diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index ee3172c7b8dfd65c693e5aee9b55179e654ce7be..c701e79ad266d996038c6868718106664e1009b5 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -59,6 +59,7 @@ __all__ = [ 'elementwise_pow', 'clip', 'clip_by_norm', + 'softmax', 'sequence_softmax', ] + __activations__ diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index c435c5206d1ef1ef57683a1a47bf089be6526f38..8460af2a08a54c5f241553757296ed2bf02f0167 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -295,7 +295,7 @@ def fill_constant_batch_size_like(input, return out -def ones(shape, dtype): +def ones(shape, dtype, force_cpu=False): """ **ones** @@ -319,7 +319,7 @@ def ones(shape, dtype): return fill_constant(value=1.0, **locals()) -def zeros(shape, dtype): +def zeros(shape, dtype, force_cpu=False): """ **zeros** diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py index 956c5b66da28fd8e74d4fd12f249688daa72d8ac..2b00923f5e85e6ba8fcdedebf5bbbc29403472c6 100644 --- a/python/paddle/v2/fluid/memory_optimization_transpiler.py +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -31,7 +31,7 @@ dtype_to_size = { class ControlFlowGraph(object): - def __init__(self, Program, ops, forward_num): + def __init__(self, Program, ops, forward_num, skip_opt): self._program = Program self._ops = ops self._forward_num = forward_num @@ -41,6 +41,7 @@ class ControlFlowGraph(object): self._defs = defaultdict(set) self._live_in = defaultdict(set) self._live_out = defaultdict(set) + self._skip_opt = skip_opt def _add_connections(self, connections): for node1, node2 in connections: @@ -130,6 +131,10 @@ class ControlFlowGraph(object): block_desc, x, is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: return False + if x in self._skip_opt: + return False + if not self._find_var(block_desc, x, is_forward).shape(): + return False return True self._build_graph() @@ -140,6 +145,7 @@ class ControlFlowGraph(object): if op.type() == "while" or op.type() == "while_grad": continue block_desc = op.block() + self.current_block_desc = block_desc is_forward = i < self._forward_num if self.pool: defs_can_optimize = filter( @@ -197,28 +203,32 @@ def get_cfgs(input_program): block_desc = pdesc.block(0) op_size = block_desc.op_size() # Get global block ops - ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size)) + ops_list.append( + ([block_desc.op(i) for i in range(op_size)], op_size, set())) while_sub_block_ids = [] while_grad_sub_block_ids = [] - while_pair = [] + while_op_output = set() + while_block_id_pair = [] for i in range(op_size): op = block_desc.op(i) if op.type() == "while": while_sub_block_ids.append(op.attr("sub_block").id) + while_op_output.update(op.output_arg_names()) elif op.type() == "while_grad": while_grad_sub_block_ids.append(op.attr("sub_block").id) + while_op_output.update(op.output_arg_names()) # Find while/while_grad block pair for grad_id in while_grad_sub_block_ids: parent_id = pdesc.block(grad_id).parent if parent_id in while_sub_block_ids: - while_pair.append((parent_id, grad_id)) + while_block_id_pair.append((parent_id, grad_id)) while_sub_block_ids.remove(parent_id) # Get while/while_grad block ops - for parent_id, grad_id in while_pair: + for parent_id, grad_id in while_block_id_pair: while_block_ops = [] while_block = pdesc.block(parent_id) while_block_op_size = while_block.op_size() @@ -230,7 +240,7 @@ def get_cfgs(input_program): for i in range(while_grad_block_op_size): while_block_ops.append(while_grad_block.op(i)) - ops_list.append((while_block_ops, while_block_op_size)) + ops_list.append((while_block_ops, while_block_op_size, while_op_output)) # Process rest while block ops for parent_id in while_sub_block_ids: @@ -242,7 +252,7 @@ def get_cfgs(input_program): ops_list.append((while_block_ops, while_block_op_size)) - cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list] + cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list] return cfgs diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f0b574b9396706a1d68393482296360362dca750 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/.gitignore @@ -0,0 +1 @@ +recognize_digits_*.inference.model diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt index a870478db8a086ea8f2d112e4e06444398d61f8a..673c965b662a022739f8d489c331f4de9455a926 100644 --- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt @@ -1,32 +1,6 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -list(REMOVE_ITEM TEST_OPS test_recognize_digits) -py_test(test_recognize_digits_mlp_cpu - SRCS test_recognize_digits.py - ARGS mlp) -py_test(test_recognize_digits_mlp_cuda - SRCS test_recognize_digits.py - ARGS mlp --use_cuda) -py_test(test_recognize_digits_conv_cpu - SRCS test_recognize_digits.py - ARGS conv) -py_test(test_recognize_digits_conv_cuda - SRCS test_recognize_digits.py - ARGS conv --use_cuda) -py_test(test_recognize_digits_mlp_cpu_parallel - SRCS test_recognize_digits.py - ARGS mlp --parallel) -py_test(test_recognize_digits_mlp_cuda_parallel - SRCS test_recognize_digits.py - ARGS mlp --use_cuda --parallel) -py_test(test_recognize_digits_conv_cpu_parallel - SRCS test_recognize_digits.py - ARGS conv --parallel) -py_test(test_recognize_digits_conv_cuda_parallel - SRCS test_recognize_digits.py - ARGS conv --use_cuda --parallel) - # default test foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 82b760d693560dae1ab1fa39afdc186f60423e65..5716ddd3dda90958ad1008679e018542c4fb73d7 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as pd from paddle.v2.fluid.executor import Executor +import unittest dict_size = 30000 source_dict_dim = target_dict_dim = dict_size -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) hidden_dim = 32 word_dim = 16 -IS_SPARSE = True batch_size = 2 max_length = 8 topk_size = 50 @@ -34,10 +33,8 @@ beam_size = 2 decoder_size = hidden_dim -place = core.CPUPlace() - -def encoder(): +def encoder(is_sparse): # encoder src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) @@ -45,7 +42,7 @@ def encoder(): input=src_word_id, size=[dict_size, word_dim], dtype='float32', - is_sparse=IS_SPARSE, + is_sparse=is_sparse, param_attr=fluid.ParamAttr(name='vemb')) fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') @@ -54,7 +51,7 @@ def encoder(): return encoder_out -def decoder_train(context): +def decoder_train(context, is_sparse): # decoder trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) @@ -62,7 +59,7 @@ def decoder_train(context): input=trg_language_word, size=[dict_size, word_dim], dtype='float32', - is_sparse=IS_SPARSE, + is_sparse=is_sparse, param_attr=fluid.ParamAttr(name='vemb')) rnn = pd.DynamicRNN() @@ -82,10 +79,10 @@ def decoder_train(context): return rnn() -def decoder_decode(context): +def decoder_decode(context, is_sparse): init_state = context array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) - counter = pd.zeros(shape=[1], dtype='int64') + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = pd.create_array('float32') @@ -117,7 +114,7 @@ def decoder_decode(context): input=pre_ids, size=[dict_size, word_dim], dtype='float32', - is_sparse=IS_SPARSE) + is_sparse=is_sparse) # use rnn unit to update rnn current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], @@ -150,7 +147,7 @@ def decoder_decode(context): def set_init_lod(data, lod, place): - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(data, place) res.set_lod(lod) return res @@ -165,15 +162,19 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res -def train_main(): - context = encoder() - rnn_out = decoder_train(context) +def train_main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder(is_sparse) + rnn_out = decoder_train(context, is_sparse) label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) cost = pd.cross_entropy(input=rnn_out, label=label) @@ -212,9 +213,13 @@ def train_main(): batch_id += 1 -def decode_main(): - context = encoder() - translation_ids, translation_scores = decoder_decode(context) +def decode_main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder(is_sparse) + translation_ids, translation_scores = decoder_decode(context, is_sparse) exe = Executor(place) exe.run(framework.default_startup_program()) @@ -250,6 +255,60 @@ def decode_main(): break +class TestMachineTranslation(unittest.TestCase): + pass + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +def inject_test_train(use_cuda, is_sparse): + f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + train_main(use_cuda, is_sparse) + + setattr(TestMachineTranslation, f_name, f) + + +def inject_test_decode(use_cuda, is_sparse, decorator=None): + f_name = 'test_{0}_{1}_decode'.format('cuda' + if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + decode_main(use_cuda, is_sparse) + + if decorator is not None: + f = decorator(f) + + setattr(TestMachineTranslation, f_name, f) + + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + inject_test_train(_use_cuda_, _is_sparse_) + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + + _decorator_ = None + if _use_cuda_: + _decorator_ = unittest.skip( + reason='Beam Search does not support CUDA!') + + inject_test_decode( + is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_) + if __name__ == '__main__': - # train_main() - decode_main() + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py index b4b6020f58e7538dfe0f98c17d61f3614c3c6fc4..b8f55c813b6984a5a1d266acc1c159c45c23b665 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid import paddle.v2 as paddle import sys import numpy +import unittest def parse_arg(): @@ -74,18 +75,18 @@ def conv_net(img, label): return loss_net(conv_pool_2, label) -def train(args, save_dirname=None): - print("recognize digits with args: {0}".format(" ".join(sys.argv[1:]))) - +def train(nn_type, use_cuda, parallel, save_dirname): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - if args.nn_type == 'mlp': + if nn_type == 'mlp': net_conf = mlp else: net_conf = conv_net - if args.parallel: + if parallel: places = fluid.layers.get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): @@ -107,7 +108,7 @@ def train(args, save_dirname=None): optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer.minimize(avg_loss) - place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -147,13 +148,14 @@ def train(args, save_dirname=None): 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. format(pass_id, batch_id + 1, float(avg_loss_val), float(acc_val))) + raise AssertionError("Loss of recognize digits is too large") -def infer(args, save_dirname=None): +def infer(use_cuda, save_dirname=None): if save_dirname is None: return - place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) # Use fluid.io.load_inference_model to obtain the inference program desc, @@ -174,11 +176,48 @@ def infer(args, save_dirname=None): print("infer results: ", results[0]) -if __name__ == '__main__': - args = parse_arg() - if not args.use_cuda and not args.parallel: - save_dirname = "recognize_digits_" + args.nn_type + ".inference.model" +def main(use_cuda, parallel, nn_type): + if not use_cuda and not parallel: + save_dirname = "recognize_digits_" + nn_type + ".inference.model" else: save_dirname = None - train(args, save_dirname) - infer(args, save_dirname) + + train( + nn_type=nn_type, + use_cuda=use_cuda, + parallel=parallel, + save_dirname=save_dirname) + infer(use_cuda=use_cuda, save_dirname=save_dirname) + + +class TestRecognizeDigits(unittest.TestCase): + pass + + +def inject_test_method(use_cuda, parallel, nn_type): + def __impl__(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + main(use_cuda, parallel, nn_type) + + fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda' + if use_cuda else 'cpu', 'parallel' + if parallel else 'normal') + + setattr(TestRecognizeDigits, fn, __impl__) + + +def inject_all_tests(): + for use_cuda in (False, True): + for parallel in (False, True): + for nn_type in ('mlp', 'conv'): + inject_test_method(use_cuda, parallel, nn_type) + + +inject_all_tests() + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py index 74138298978c7c18936f53761b313887f07aea81..4943bbb3388c3a476596b2fd4dd28605ee7be9e0 100644 --- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py +++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py @@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod): return match_indices, match_dist -class TestBipartiteMatchOpForWithLoD(OpTest): +class TestBipartiteMatchOpWithLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' lod = [[0, 5, 11, 23]] @@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest): self.inputs = {'DistMat': (dist, lod)} self.outputs = { 'ColToRowMatchIndices': (match_indices), - 'ColToRowMatchDis': (match_dist), + 'ColToRowMatchDist': (match_dist), } def test_check_output(self): @@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.inputs = {'DistMat': dist} self.outputs = { 'ColToRowMatchIndices': match_indices, - 'ColToRowMatchDis': match_dist, + 'ColToRowMatchDist': match_dist, } def test_check_output(self): diff --git a/python/paddle/v2/fluid/tests/test_box_coder_op.py b/python/paddle/v2/fluid/tests/test_box_coder_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0dc18476fd5dce7cd293f6cb85f419be7d88ec95 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): + prior_box_x = ( + (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) + prior_box_y = ( + (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0]) + prior_box_width = ( + (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0]) + prior_box_height = ( + (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) + prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], + prior_box_var.shape[1]) + + if (code_type == "EncodeCenterSize"): + target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( + target_box.shape[0], 1) + target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape( + target_box.shape[0], 1) + target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape( + target_box.shape[0], 1) + target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( + target_box.shape[0], 1) + + output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ + prior_box_var[:,:,0] + output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \ + prior_box_var[:,:,1] + output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \ + prior_box_var[:,:,2] + output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \ + prior_box_var[:,:,3] + + elif (code_type == "DecodeCenterSize"): + target_box = target_box.reshape(target_box.shape[0], 1, + target_box.shape[1]) + target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \ + prior_box_width + prior_box_x + target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \ + prior_box_height + prior_box_y + target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \ + prior_box_width + target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \ + prior_box_height + output_box[:, :, 0] = target_box_x - target_box_width / 2 + output_box[:, :, 1] = target_box_y - target_box_height / 2 + output_box[:, :, 2] = target_box_x + target_box_width / 2 + output_box[:, :, 3] = target_box_y + target_box_height / 2 + + +def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): + n = target_box.shape[0] + m = prior_box.shape[0] + output_box = np.zeros((n, m, 4), dtype=np.float32) + for i in range(len(lod) - 1): + box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var, + output_box[lod[i]:lod[i + 1], :, :], code_type) + return output_box + + +class TestBoxCoderOp(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 20]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.random.random((10, 4)).astype('float32') + target_box = np.random.random((20, 4)).astype('float32') + code_type = "DecodeCenterSize" + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': target_box, + } + self.attrs = {'code_type': 'decode_center_size'} + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithLoD(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 4, 12, 20]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.random.random((10, 4)).astype('float32') + target_box = np.random.random((20, 4)).astype('float32') + code_type = "EncodeCenterSize" + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': (target_box, lod), + } + self.attrs = {'code_type': 'encode_center_size'} + self.outputs = {'OutputBox': output_box} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 3f54e28defb76d3430a82e791578e20b84833f16..aea43c2517a02c72c1ee3307afdd3b21910f0064 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -223,6 +223,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_softmax(x=seq)) print(str(program)) + def test_softmax(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[10], dtype='float32') + hid = layers.fc(input=data, size=20) + self.assertIsNotNone(layers.softmax(x=hid)) + print(str(program)) + def test_get_places(self): program = Program() with program_guard(program): diff --git a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py new file mode 100755 index 0000000000000000000000000000000000000000..c27573c3d69037bc48e0b6a90636b3f027f15a41 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestMineHardExamplesOp(OpTest): + def set_data(self): + self.init_test_data() + self.inputs = { + 'ClsLoss': self.cls_loss, + 'LocLoss': self.loc_loss, + 'MatchIndices': self.match_indices, + 'MatchDist': self.match_dis + } + + self.attrs = { + 'neg_pos_ratio': self.neg_pos_ratio, + 'neg_overlap': self.neg_overlap, + 'sample_size': self.sample_size, + 'mining_type': self.mining_type + } + + self.outputs = { + 'NegIndices': (self.neg_indices, self.neg_indices_lod), + 'UpdatedMatchIndices': self.updated_match_indices + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + return + + def setUp(self): + self.op_type = "mine_hard_examples" + self.set_data() + + def init_test_data(self): + self.neg_pos_ratio = 1.0 + self.neg_overlap = 0.5 + self.sample_size = 0 + self.mining_type = "max_negative" + self.cls_loss = np.array([[0.1, 0.1, 0.3], + [0.3, 0.1, 0.1]]).astype('float32') + + self.loc_loss = np.array([[0.1, 0.2, 0.3], + [0.3, 0.4, 0.1]]).astype('float32') + + self.match_dis = np.array([[0.2, 0.4, 0.8], + [0.1, 0.9, 0.3]]).astype('float32') + + self.match_indices = np.array([[0, -1, -1], + [-1, 0, -1]]).astype('int32') + + self.updated_match_indices = self.match_indices + + self.neg_indices_lod = [[0, 1, 2]] + self.neg_indices = np.array([[1], [0]]).astype('int32') + + +class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp): + def init_test_data(self): + super(TestMineHardExamplesOpHardExample, self).init_test_data() + self.mining_type = "hard_example" + self.sample_size = 2 + + self.cls_loss = np.array([[0.5, 0.1, 0.3], + [0.3, 0.1, 0.1]]).astype('float32') + + self.loc_loss = np.array([[0.2, 0.2, 0.3], + [0.3, 0.1, 0.2]]).astype('float32') + + self.match_indices = np.array([[0, -1, -1], + [-1, 0, -1]]).astype('int32') + + self.updated_match_indices = np.array([[0, -1, -1], + [-1, -1, -1]]).astype('int32') + + self.neg_indices_lod = [[0, 1, 3]] + self.neg_indices = np.array([[2], [0], [2]]).astype('int32') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3b80d2359b083d30f9a5a7b8cc18aaf1ca5146c1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py @@ -0,0 +1,226 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +import copy +from op_test import OpTest + + +def iou(box_a, box_b): + """Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + + area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a) + area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + + box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) + box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + score_threshold: (float) The confidence thresh for filtering low + confidence boxes. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + top_k: (int) The maximum number of box preds to consider. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. + """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + selected_indices = np.argwhere(all_scores > score_threshold) + selected_indices = selected_indices.flatten() + all_scores = all_scores[selected_indices] + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + if top_k > -1 and top_k < sorted_indices.shape[0]: + sorted_indices = sorted_indices[:top_k] + sorted_scores = sorted_scores[:top_k] + + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou(boxes[idx], boxes[kept_idx]) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + +def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k): + class_num = scores.shape[0] + priorbox_num = scores.shape[1] + + selected_indices = {} + num_det = 0 + for c in range(class_num): + if c == background: continue + indices = nms(boxes, scores[c], score_threshold, nms_threshold, + nms_top_k) + selected_indices[c] = indices + num_det += len(indices) + + if keep_top_k > -1 and num_det > keep_top_k: + score_index = [] + for c, indices in selected_indices.iteritems(): + for idx in indices: + score_index.append((scores[c][idx], c, idx)) + + sorted_score_index = sorted( + score_index, key=lambda tup: tup[0], reverse=True) + sorted_score_index = sorted_score_index[:keep_top_k] + selected_indices = {} + + for _, c, _ in sorted_score_index: + selected_indices[c] = [] + for s, c, idx in sorted_score_index: + selected_indices[c].append(idx) + num_det = keep_top_k + + return selected_indices, num_det + + +def batched_multiclass_nms(boxes, scores, background, score_threshold, + nms_threshold, nms_top_k, keep_top_k): + batch_size = scores.shape[0] + + det_outs = [] + lod = [0] + for n in range(batch_size): + nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + lod.append(lod[-1] + nmsed_num) + if nmsed_num == 0: continue + + for c, indices in nmsed_outs.iteritems(): + for idx in indices: + xmin, ymin, xmax, ymax = boxes[idx][:] + det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + + return det_outs, lod + + +class TestMulticlassNMSOp(OpTest): + def set_argument(self): + self.score_threshold = 0.01 + + def setUp(self): + self.set_argument() + N = 7 + M = 1200 + C = 21 + BOX_SIZE = 4 + + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + + scores = np.random.random((N * M, C)).astype('float32') + + def softmax(x): + shiftx = x - np.max(x).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + scores = np.apply_along_axis(softmax, 1, scores) + scores = np.reshape(scores, (N, M, C)) + scores = np.transpose(scores, (0, 2, 1)) + + boxes = np.random.random((M, BOX_SIZE)).astype('float32') + boxes[:, 0:2] = boxes[:, 0:2] * 0.5 + boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5 + + nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + nmsed_outs = [-1] if not nmsed_outs else nmsed_outs + nmsed_outs = np.array(nmsed_outs).astype('float32') + + self.op_type = 'multiclass_nms' + self.inputs = {'BBoxes': boxes, 'Scores': scores} + self.outputs = {'Out': (nmsed_outs, [lod])} + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + } + + def test_check_output(self): + self.check_output() + + +class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): + def set_argument(self): + # Here set 2.0 to test the case there is no outputs. + # In practical use, 0.0 < score_threshold < 1.0 + self.score_threshold = 2.0 + + +class TestIOU(unittest.TestCase): + def test_iou(self): + box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32') + box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32') + + expt_output = np.array([2.0 / 16.0]).astype('float32') + calc_output = np.array([iou(box1, box2)]).astype('float32') + self.assertTrue(np.allclose(calc_output, expt_output)) + + +if __name__ == '__main__': + unittest.main()