diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c7eb260aea8478f4833cb79253f4481e10b8685..49334279f6dc88c0d35fec43daf80e3cbe65760c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) -option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) +option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) @@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc -include(external/boost) # download, build, install boost +include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..98356cd7613baff7f0cd66d1462068232b2b8500 --- /dev/null +++ b/benchmark/cluster/vgg16/Dockerfile @@ -0,0 +1,18 @@ +#FROM python:2.7.14 +FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04 +RUN apt-get update && apt-get install -y python +RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev +# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, +# so we must build one with distribute support to install in this image. 
+RUN pip install paddlepaddle +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python' +RUN pip uninstall -y paddlepaddle + +# below lines may change a lot for debugging +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl && \ +chmod +x /usr/bin/paddle_k8s +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD vgg16_fluid.py vgg16_v2.py /workspace/ diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..11d00b8f85382aa720c169338c51333b730d44d5 --- /dev/null +++ b/benchmark/cluster/vgg16/README.md @@ -0,0 +1,76 @@ +# Performance for Distributed vgg16 + +## Test Result + +### Hardware Information + +- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz +- cpu MHz : 2101.000 +- cache size : 20480 KB + +### Single Node Single Thread + +- PServer Count: 10 +- Trainer Count: 20 +- Metrics: samples / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 | +| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 | +| TensorFlow | - | - | - | - | + +### Different Batch Size + +- PServer Count: 10 +- Trainer Count: 20 +- Per trainer CPU Core: 1 +- Metrics: samples / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 | +| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 | +| TensorFlow | - | - | - | - | + + +### Accelerate Rate + +- Pserver Count: 20 +- Batch Size: 128 +- Metrics: samples / sec + +| Trainer Count | 20 | 40 | 80 | 100 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) | +| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 
(60.60%) | 1041.99 (59.20%) | +| TensorFlow | - | - | - | - | + +### Different Pserver Count + +- Trainer Count: 60 +- Batch Size: 128 +- Metrics: samples / sec + +| PServer Count | 3 | 6 | 10 | 20 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | +| TensorFlow | - | - | - | - | + +*The performance gap between Fluid and v2 comes from the network interference.* + + +## Steps to Run the Performance Test + +1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support. +1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory. +1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so kubernetes can find it. +1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step). +1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers. + +Check the logs for the distributed training progress and analyze the performance. + +## Enable Verbose Logs + +Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail. 
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee8b0763b62fc011f40f6197e929a68b48a93e47 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid_pserver.yaml @@ -0,0 +1,72 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: MKL_NUM_THREADS + value: "1" + - name: TRAINING_ROLE + value: "PSERVER" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + command: ["paddle_k8s", "start_fluid"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802 --- /dev/null +++ b/benchmark/cluster/vgg16/fluid_trainer.yaml @@ -0,0 +1,69 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_fluid"] + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: TRAINING_ROLE + value: "TRAINER" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd1271e0cf399184134c06b3200ee1202c65cef0 --- /dev/null +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -0,0 +1,64 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: 
vgg16v2job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16v2job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16v2job + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "python train.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + command: ["paddle_k8s", "start_pserver"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12c8964066cbcfe8d2a44de2f51a3d12ea422fe2 --- /dev/null +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -0,0 +1,65 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16v2job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16v2job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_trainer", "v2"] + env: + - name: 
PADDLE_JOB_NAME + value: vgg16v2job + - name: BATCH_SIZE + value: "256" + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "2" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/lib:/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..499e06ec42fc8f840137173628fa465e0541ba30 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -0,0 +1,277 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.profiler as profiler +import argparse +import functools +import os + + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument('--device_id', type=int, default=0, help="The device id.") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--local', + type=str2bool, + default=True, + help='Whether to run as local mode.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = 
conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + test_target = accuracy.metrics + accuracy.states + inference_program = fluid.io.get_inference_program(test_target) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimize_ops, params_grads = optimizer.minimize(avg_cost) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace( + args.device_id) + exe = fluid.Executor(place) + + # test + def test(exe): + accuracy.reset(exe) + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + exe.run(inference_program, + 
feed={"pixel": img_data, + "label": y_data}) + + return accuracy.eval(exe) + + def train_loop(exe, trainer_prog): + iters = 0 + ts = time.time() + for pass_id in range(args.num_passes): + # train + start_time = time.time() + num_samples = 0 + accuracy.reset(exe) + with profiler.profiler("CPU", 'total') as prof: + for batch_id, data in enumerate(train_reader()): + ts = time.time() + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc = exe.run( + trainer_prog, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost] + accuracy.metrics) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" + % (pass_id, iters, loss, acc, time.time() - ts) + ) # The accuracy is the accumulation of batches, but not the current batch. + + pass_elapsed = time.time() - start_time + pass_train_acc = accuracy.eval(exe) + pass_test_acc = test(exe) + print( + "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" + % (pass_id, num_samples / pass_elapsed, pass_train_acc, + pass_test_acc)) + + if args.local: + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + train_loop(exe, fluid.default_main_program()) + else: + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, "6174"])) + pserver_endpoints = ",".join(eplist) + print("pserver endpoints: ", 
pserver_endpoints) + trainers = int(os.getenv("TRAINERS")) # total trainer count + print("trainers total: ", trainers) + current_endpoint = os.getenv( + "POD_IP") + ":6174" # current pserver endpoint + training_role = os.getenv( + "TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + pservers=pserver_endpoints, + trainers=trainers) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + print("starting server side startup") + exe.run(pserver_startup) + print("starting parameter server...") + exe.run(pserver_prog) + elif training_role == "TRAINER": + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else + paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + trainer_prog = t.get_trainer_program() + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver + exe.run(fluid.default_startup_program()) + train_loop(exe, trainer_prog) + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + print_arguments() + main() diff --git a/benchmark/cluster/vgg16/vgg16_v2.py 
b/benchmark/cluster/vgg16/vgg16_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac6b3c33252e0a1f596f539fc090c5ada118e15 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_v2.py @@ -0,0 +1,154 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import gzip + +import paddle.v2.dataset.cifar as cifar +import paddle.v2 as paddle +import time +import os + +DATA_DIM = 3 * 32 * 32 +CLASS_DIM = 10 +BATCH_SIZE = os.getenv("BATCH_SIZE") +if BATCH_SIZE: + BATCH_SIZE = int(BATCH_SIZE) +else: + BATCH_SIZE = 128 +print "batch_size", BATCH_SIZE +NODE_COUNT = int(os.getenv("TRAINERS")) +ts = 0 + + +def vgg(input, nums, class_dim): + def conv_block(input, num_filter, groups, num_channels=None): + return paddle.networks.img_conv_group( + input=input, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + pool_type=paddle.pooling.Max()) + + assert len(nums) == 5 + # the channel of input feature is 3 + conv1 = conv_block(input, 64, nums[0], 3) + conv2 = conv_block(conv1, 128, nums[1]) + conv3 = conv_block(conv2, 256, nums[2]) + conv4 = conv_block(conv3, 512, nums[3]) + conv5 = conv_block(conv4, 512, nums[4]) + + fc_dim = 512 + fc1 = paddle.layer.fc(input=conv5, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=fc1, + 
size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + out = paddle.layer.fc(input=fc2, + size=class_dim, + act=paddle.activation.Softmax()) + return out + + +def vgg13(input, class_dim): + nums = [2, 2, 2, 2, 2] + return vgg(input, nums, class_dim) + + +def vgg16(input, class_dim): + nums = [2, 2, 3, 3, 3] + return vgg(input, nums, class_dim) + + +def vgg19(input, class_dim): + nums = [2, 2, 4, 4, 4] + return vgg(input, nums, class_dim) + + +def main(): + global ts + paddle.init(use_gpu=False) + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(DATA_DIM)) + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(CLASS_DIM)) + + extra_layers = None + # NOTE: for v2 distributed training need averaging updates. + learning_rate = 1e-3 / NODE_COUNT + out = vgg16(image, class_dim=CLASS_DIM) + cost = paddle.layer.classification_cost(input=out, label=lbl) + + # Create parameters + parameters = paddle.parameters.create(cost) + + # Create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * + BATCH_SIZE), + learning_rate=learning_rate / BATCH_SIZE, + learning_rate_decay_a=0.1, + learning_rate_decay_b=128000 * 35, + learning_rate_schedule="discexp", ) + + train_reader = paddle.batch( + paddle.reader.shuffle( + cifar.train10(), + # To use other data, replace the above line with: + # reader.train_reader('train.list'), + buf_size=1000), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + cifar.test10(), + # To use other data, replace the above line with: + # reader.test_reader('val.list'), + batch_size=BATCH_SIZE) + + # Create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer, + extra_layers=extra_layers, + is_local=False) + + # End batch and end pass event handler + def event_handler(event): + global ts, ts_pass + if isinstance(event, 
paddle.event.BeginPass): + ts_pass = time.time() + if isinstance(event, paddle.event.BeginIteration): + ts = time.time() + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + time.time() - ts) + if isinstance(event, paddle.event.EndPass): + print "Pass %d end, spent: %f" % (event.pass_id, + time.time() - ts_pass) + result = trainer.test(reader=test_reader) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + trainer.train( + reader=train_reader, num_passes=200, event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index c70d83b3f4bb24740ed67b4e2f98a3ced26d1648..dbc676bdac30e0d730206c17a1912d49d4f896eb 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." 
FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 585db019d521b1699baadfae31ef95b5059c71b4..33ef6860e1d38f4e87c4431addf43f9f8a655fc2 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -186,6 +186,11 @@ function(cc_library TARGET_NAME) add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) endif() if (cc_library_DEPS) + # Don't need link libwarpctc.so + if ("${cc_library_DEPS};" MATCHES "warpctc;") + list(REMOVE_ITEM cc_library_DEPS warpctc) + add_dependencies(${TARGET_NAME} warpctc) + endif() add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -224,12 +229,18 @@ function(cc_test TARGET_NAME) if(WITH_TESTING) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) + target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") + list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) + endif() add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) - add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + add_test(NAME ${TARGET_NAME} + COMMAND ${TARGET_NAME} ${cc_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) @@ -457,12 +468,12 @@ endfunction() function(py_test TARGET_NAME) if(WITH_TESTING) - set(options STATIC static SHARED shared) + set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS ARGS) 
+ set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index ddf0b055a92d80295b24255a5462d477e0d9c796..29388f5005bf779a1bfa63c0d46d35996c0c792d 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -87,6 +87,11 @@ roi_pool .. autoclass:: paddle.v2.layer.roi_pool :noindex: +pad +---- +.. autoclass:: paddle.v2.layer.pad + :noindex: + Norm Layer ========== @@ -133,6 +138,11 @@ grumemory .. autoclass:: paddle.v2.layer.grumemory :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Recurrent Layer Group ===================== @@ -340,6 +350,11 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dropout +-------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + dot_prod --------- .. autoclass:: paddle.v2.layer.dot_prod @@ -402,6 +417,11 @@ scale_shift .. autoclass:: paddle.v2.layer.scale_shift :noindex: +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: + Sampling Layers =============== @@ -420,22 +440,6 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: -Factorization Machine Layer -============================ - -factorization_machine ---------------------- -.. autoclass:: paddle.v2.layer.factorization_machine - :noindex: - -Slicing and Joining Layers -========================== - -pad ----- -.. autoclass:: paddle.v2.layer.pad - :noindex: - .. _api_v2.layer_costs: Cost Layers @@ -526,6 +530,11 @@ multibox_loss .. 
autoclass:: paddle.v2.layer.multibox_loss :noindex: +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: + Check Layer ============ @@ -534,31 +543,10 @@ eos .. autoclass:: paddle.v2.layer.eos :noindex: -Miscs -===== - -dropout --------- -.. autoclass:: paddle.v2.layer.dropout - :noindex: - -Activation with learnable parameter -=================================== +Activation +========== prelu -------- .. autoclass:: paddle.v2.layer.prelu :noindex: - -gated_unit ------------ -.. autoclass:: paddle.v2.layer.gated_unit - :noindex: - -Detection output Layer -====================== - -detection_output ----------------- -.. autoclass:: paddle.v2.layer.detection_output - :noindex: diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst index 6a8ecc5bb1d855e0ded3719943ab3adb810de365..02e41564b1e48c07da6ac071fc4b60089169e05a 100644 --- a/doc/api/v2/data/dataset.rst +++ b/doc/api/v2/data/dataset.rst @@ -73,3 +73,10 @@ wmt14 .. automodule:: paddle.v2.dataset.wmt14 :members: :noindex: + +wmt16 ++++++ + +.. automodule:: paddle.v2.dataset.wmt16 + :members: + :noindex: diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst index 0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa..a591c7334fd31c98a94b50a4344f251560a0f2f9 100644 --- a/doc/api/v2/fluid/data_feeder.rst +++ b/doc/api/v2/fluid/data_feeder.rst @@ -1,9 +1,14 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + =========== -DataFeeder +data_feeder =========== DataFeeder ------------ -.. automodule:: paddle.v2.fluid.data_feeder - :members: DataFeeder +---------- + +.. 
autoclass:: paddle.v2.fluid.data_feeder.DataFeeder + :members: :noindex: + diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst index a23f3301d0331e0ea3733f06444515eb4680cd31..00dcecfd628a35d83d1c596bf0aea819a1705862 100644 --- a/doc/api/v2/fluid/evaluator.rst +++ b/doc/api/v2/fluid/evaluator.rst @@ -1,9 +1,21 @@ -=========== -Evaluator -=========== - -Evaluator ------------ -.. automodule:: paddle.v2.fluid.evaluator - :members: Evaluator +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +========= +evaluator +========= + +Accuracy +-------- + +.. autoclass:: paddle.v2.fluid.evaluator.Accuracy + :members: :noindex: + +ChunkEvaluator +-------------- + +.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator + :members: + :noindex: + diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst index 3a283538c120cfa1ef646c390bb71c6251c23675..a028f6283f2ca333bdf6c9857a98661c0222b41e 100644 --- a/doc/api/v2/fluid/executor.rst +++ b/doc/api/v2/fluid/executor.rst @@ -1,9 +1,32 @@ -=========== -Executor -=========== +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +======== +executor +======== Executor +-------- + +.. autoclass:: paddle.v2.fluid.executor.Executor + :members: + :noindex: + +global_scope +------------ + +.. autofunction:: paddle.v2.fluid.executor.global_scope + :noindex: + +scope_guard ----------- -.. automodule:: paddle.v2.fluid.executor - :members: Executor + +.. autofunction:: paddle.v2.fluid.executor.scope_guard + :noindex: + +switch_scope +------------ + +.. autofunction:: paddle.v2.fluid.executor.switch_scope :noindex: + diff --git a/doc/api/v2/fluid/gen_doc.py b/doc/api/v2/fluid/gen_doc.py new file mode 100644 index 0000000000000000000000000000000000000000..a2147fd3f7ea635d8f14210fbcd1a568ee2230ee --- /dev/null +++ b/doc/api/v2/fluid/gen_doc.py @@ -0,0 +1,109 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import argparse +import sys +import types + +import paddle.v2.fluid as fluid + + +def parse_arg(): + parser = argparse.ArgumentParser() + parser.add_argument('--submodules', nargs="*") + parser.add_argument( + 'module', type=str, help='Generate the documentation of which module') + return parser.parse_args() + + +class DocGenerator(object): + def __init__(self, module_name, stream=sys.stdout): + self.stream = stream + self.module_name = module_name + if not hasattr(fluid, module_name): + raise ValueError("Cannot find fluid.{0}".format(module_name)) + else: + self.module = getattr(fluid, module_name) + self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! 
+ +''') + + self._print_header_(module_name, dot='=', is_title=True) + + def print_submodule(self, submodule_name): + submodule = getattr(self.module, submodule_name) + if submodule is None: + raise ValueError("Cannot find submodule {0}".format(submodule_name)) + self.print_section(submodule_name) + + for item in submodule.__all__: + self.print_item(item) + + def print_current_module(self): + for item in self.module.__all__: + self.print_item(item) + + def print_section(self, name): + self._print_header_(name, dot='=', is_title=False) + + def print_item(self, name): + item = getattr(self.module, name) + if isinstance(item, types.TypeType): + self.print_class(name) + elif isinstance(item, types.FunctionType): + self.print_method(name) + else: + raise RuntimeError("Unsupported item {0}".format(name)) + + def print_class(self, name): + self._print_header_(name, dot='-', is_title=False) + self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1} + :members: + :noindex: + +'''.format(self.module_name, name)) + + def print_method(self, name): + self._print_header_(name, dot='-', is_title=False) + self.stream.write('''.. 
autofunction:: paddle.v2.fluid.{0}.{1} + :noindex: + +'''.format(self.module_name, name)) + + def _print_header_(self, name, dot, is_title): + dot_line = dot * len(name) + if is_title: + self.stream.write(dot_line) + self.stream.write('\n') + self.stream.write(name) + self.stream.write('\n') + self.stream.write(dot_line) + self.stream.write('\n') + self.stream.write('\n') + + +def main(): + args = parse_arg() + gen = DocGenerator(args.module) + if args.submodules is None: + gen.print_current_module() + else: + for submodule_name in args.submodules: + gen.print_submodule(submodule_name) + + +if __name__ == '__main__': + main() diff --git a/doc/api/v2/fluid/gen_doc.sh b/doc/api/v2/fluid/gen_doc.sh new file mode 100755 index 0000000000000000000000000000000000000000..ba7b7ba8e51399deb852b0a7c8ddd3128f521e85 --- /dev/null +++ b/doc/api/v2/fluid/gen_doc.sh @@ -0,0 +1,7 @@ +#!/bin/bash +python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst + +for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer +do + python gen_doc.py ${module} > ${module}.rst +done diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst index 8f587837e9873370722062404f511654a9460587..c38be033fff2997930525f51c93995db09daa2b6 100644 --- a/doc/api/v2/fluid/initializer.rst +++ b/doc/api/v2/fluid/initializer.rst @@ -1,50 +1,35 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + =========== -Initializer +initializer =========== +Constant +-------- - -Initializer ------------ -.. automodule:: paddle.v2.fluid.initializer - :members: Initializer - :noindex: - - - -ConstantInitializer -------------------- -.. automodule:: paddle.v2.fluid.initializer - :members: ConstantInitializer +.. autoclass:: paddle.v2.fluid.initializer.Constant + :members: :noindex: +Uniform +------- - -UniformInitializer ------------------- -.. 
automodule:: paddle.v2.fluid.initializer - :members: UniformInitializer - :noindex: - - - -NormalInitializer ------------------ -.. automodule:: paddle.v2.fluid.initializer - :members: NormalInitializer +.. autoclass:: paddle.v2.fluid.initializer.Uniform + :members: :noindex: +Normal +------ -XavierInitializer ------------------ -.. automodule:: paddle.v2.fluid.initializer - :members: XavierInitializer +.. autoclass:: paddle.v2.fluid.initializer.Normal + :members: :noindex: +Xavier +------ -MSRAInitializer ---------------- -.. automodule:: paddle.v2.fluid.initializer - :members: MSRAInitializer +.. autoclass:: paddle.v2.fluid.initializer.Xavier + :members: :noindex: diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst index 67f68c4e9e16b379207b8de114cdf769e056f78e..37c9c273e369532e8ff596e9649cb695a98a2505 100644 --- a/doc/api/v2/fluid/io.rst +++ b/doc/api/v2/fluid/io.rst @@ -1,10 +1,61 @@ -=========== -IO -=========== +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! +== +io +== +save_vars +--------- -is_parameter +.. autofunction:: paddle.v2.fluid.io.save_vars + :noindex: + +save_params ----------- -.. autofunction:: paddle.v2.fluid.io.is_parameter + +.. autofunction:: paddle.v2.fluid.io.save_params + :noindex: + +save_persistables +----------------- + +.. autofunction:: paddle.v2.fluid.io.save_persistables + :noindex: + +load_vars +--------- + +.. autofunction:: paddle.v2.fluid.io.load_vars + :noindex: + +load_params +----------- + +.. autofunction:: paddle.v2.fluid.io.load_params :noindex: + +load_persistables +----------------- + +.. autofunction:: paddle.v2.fluid.io.load_persistables + :noindex: + +save_inference_model +-------------------- + +.. autofunction:: paddle.v2.fluid.io.save_inference_model + :noindex: + +load_inference_model +-------------------- + +.. autofunction:: paddle.v2.fluid.io.load_inference_model + :noindex: + +get_inference_program +--------------------- + +.. 
autofunction:: paddle.v2.fluid.io.get_inference_program + :noindex: + diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 231ec2d4ba102a5d31c47cbc7a5d484ef17a7f3a..e24613b94b422b7cdf9c6383c359fa92a4faf6ff 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -1,546 +1,799 @@ -========== -Layers -========== +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! +====== +layers +====== -fc ---- -.. autofunction:: paddle.v2.fluid.layers.fc +control_flow +============ + +split_lod_tensor +---------------- + +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor :noindex: -embedding ---------- -.. autofunction:: paddle.v2.fluid.layers.embedding +merge_lod_tensor +---------------- + +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor :noindex: -dynamic_lstm ------------- -.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm +BlockGuard +---------- + +.. autoclass:: paddle.v2.fluid.layers.BlockGuard + :members: :noindex: -dynamic_lstmp -------------- -.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp +BlockGuardWithCompletion +------------------------ + +.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion + :members: :noindex: -dynamic_gru ------------ -.. autofunction:: paddle.v2.fluid.layers.dynamic_gru +StaticRNNMemoryLink +------------------- + +.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink + :members: :noindex: -data ----- -.. autofunction:: paddle.v2.fluid.layers.data +WhileGuard +---------- + +.. autoclass:: paddle.v2.fluid.layers.WhileGuard + :members: :noindex: -mean ----- -.. autofunction:: paddle.v2.fluid.layers.mean +While +----- + +.. autoclass:: paddle.v2.fluid.layers.While + :members: :noindex: -mul ---- -.. autofunction:: paddle.v2.fluid.layers.mul +lod_rank_table +-------------- + +.. autofunction:: paddle.v2.fluid.layers.lod_rank_table :noindex: -elementwise_add ---------------- -.. 
autofunction:: paddle.v2.fluid.layers.elementwise_add +max_sequence_len +---------------- + +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len :noindex: -elementwise_sub ---------------- -.. autofunction:: paddle.v2.fluid.layers.elementwise_sub +topk +---- + +.. autofunction:: paddle.v2.fluid.layers.topk :noindex: -elementwise_mul ---------------- -.. autofunction:: paddle.v2.fluid.layers.elementwise_mul +lod_tensor_to_array +------------------- + +.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array :noindex: -elementwise_div ---------------- -.. autofunction:: paddle.v2.fluid.layers.elementwise_div +array_to_lod_tensor +------------------- + +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor :noindex: +increment +--------- -dropout -------- -.. autofunction:: paddle.v2.fluid.layers.dropout +.. autofunction:: paddle.v2.fluid.layers.increment :noindex: +array_write +----------- -reshape --------- -.. autofunction:: paddle.v2.fluid.layers.reshape +.. autofunction:: paddle.v2.fluid.layers.array_write :noindex: +create_array +------------ -sigmoid +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + +less_than --------- -.. autofunction:: paddle.v2.fluid.layers.sigmoid + +.. autofunction:: paddle.v2.fluid.layers.less_than :noindex: +array_read +---------- -scale ---------- -.. autofunction:: paddle.v2.fluid.layers.scale +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + +shrink_memory +------------- + +.. autofunction:: paddle.v2.fluid.layers.shrink_memory :noindex: +array_length +------------ -transpose +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + +IfElse +------ + +.. autoclass:: paddle.v2.fluid.layers.IfElse + :members: + :noindex: + +DynamicRNN +---------- + +.. autoclass:: paddle.v2.fluid.layers.DynamicRNN + :members: + :noindex: + +ConditionalBlock +---------------- + +.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock + :members: + :noindex: + +StaticRNN --------- -.. 
autofunction:: paddle.v2.fluid.layers.transpose + +.. autoclass:: paddle.v2.fluid.layers.StaticRNN + :members: :noindex: +reorder_lod_tensor_by_rank +-------------------------- -sigmoid_cross_entropy_with_logits ---------------------------------- -.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits +.. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank :noindex: +ParallelDo +---------- -cast +.. autoclass:: paddle.v2.fluid.layers.ParallelDo + :members: + :noindex: + +Print +----- + +.. autofunction:: paddle.v2.fluid.layers.Print + :noindex: + +device +====== + +get_places +---------- + +.. autofunction:: paddle.v2.fluid.layers.get_places + :noindex: + +io +== + +data ---- -.. autofunction:: paddle.v2.fluid.layers.cast + +.. autofunction:: paddle.v2.fluid.layers.data :noindex: +BlockGuardServ +-------------- -concat -------- -.. autofunction:: paddle.v2.fluid.layers.concat +.. autoclass:: paddle.v2.fluid.layers.BlockGuardServ + :members: :noindex: +ListenAndServ +------------- -sums +.. autoclass:: paddle.v2.fluid.layers.ListenAndServ + :members: + :noindex: + +Send ---- -.. autofunction:: paddle.v2.fluid.layers.sums + +.. autofunction:: paddle.v2.fluid.layers.Send :noindex: +nn +== -linear_chain_crf ----------------- -.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf +fc +-- + +.. autofunction:: paddle.v2.fluid.layers.fc :noindex: +embedding +--------- -assign -------- .. autofunction:: paddle.v2.fluid.layers.embedding :noindex: +dynamic_lstm +------------ -split_lod_tensor ----------------- -.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm :noindex: +dynamic_lstmp +------------- -merge_lod_tensor +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp + :noindex: + +dynamic_gru +----------- + +.. autofunction:: paddle.v2.fluid.layers.dynamic_gru + :noindex: + +gru_unit +-------- + +.. 
autofunction:: paddle.v2.fluid.layers.gru_unit + :noindex: + +linear_chain_crf ---------------- -.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + +crf_decoding +------------ + +.. autofunction:: paddle.v2.fluid.layers.crf_decoding :noindex: cos_sim --------- +------- + .. autofunction:: paddle.v2.fluid.layers.cos_sim :noindex: - cross_entropy ------------- + .. autofunction:: paddle.v2.fluid.layers.cross_entropy :noindex: - - square_error_cost ----------------- + .. autofunction:: paddle.v2.fluid.layers.square_error_cost :noindex: - accuracy ---------- +-------- + .. autofunction:: paddle.v2.fluid.layers.accuracy :noindex: +chunk_eval +---------- + +.. autofunction:: paddle.v2.fluid.layers.chunk_eval + :noindex: sequence_conv ------------- + .. autofunction:: paddle.v2.fluid.layers.sequence_conv :noindex: - conv2d ------ + .. autofunction:: paddle.v2.fluid.layers.conv2d :noindex: - sequence_pool ------------- + .. autofunction:: paddle.v2.fluid.layers.sequence_pool :noindex: +pool2d +------ -sequence_first_step -------------------- -.. autofunction:: paddle.v2.fluid.layers.sequence_first_step +.. autofunction:: paddle.v2.fluid.layers.pool2d :noindex: +batch_norm +---------- + +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: -sequence_last_step +beam_search_decode ------------------ -.. autofunction:: paddle.v2.fluid.layers.sequence_last_step + +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode :noindex: +conv2d_transpose +---------------- -pool2d ------- -.. autofunction:: paddle.v2.fluid.layers.pool2d +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose :noindex: +sequence_expand +--------------- -batch_norm +.. autofunction:: paddle.v2.fluid.layers.sequence_expand + :noindex: + +lstm_unit +--------- + +.. autofunction:: paddle.v2.fluid.layers.lstm_unit + :noindex: + +reduce_sum ---------- -.. autofunction:: paddle.v2.fluid.layers.batch_norm + +.. 
autofunction:: paddle.v2.fluid.layers.reduce_sum + :noindex: + +reduce_mean +----------- + +.. autofunction:: paddle.v2.fluid.layers.reduce_mean :noindex: +reduce_max +---------- + +.. autofunction:: paddle.v2.fluid.layers.reduce_max + :noindex: -beam_search_decode +reduce_min +---------- + +.. autofunction:: paddle.v2.fluid.layers.reduce_min + :noindex: + +sequence_first_step +------------------- + +.. autofunction:: paddle.v2.fluid.layers.sequence_first_step + :noindex: + +sequence_last_step ------------------ -.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + +.. autofunction:: paddle.v2.fluid.layers.sequence_last_step + :noindex: + +dropout +------- + +.. autofunction:: paddle.v2.fluid.layers.dropout :noindex: +split +----- -lod_rank_table --------------- -.. autofunction:: paddle.v2.fluid.layers.lod_rank_table +.. autofunction:: paddle.v2.fluid.layers.split :noindex: +ctc_greedy_decoder +------------------ -max_sequence_len ----------------- -.. autofunction:: paddle.v2.fluid.layers.max_sequence_len +.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder :noindex: +edit_distance +------------- -topk ------ -.. autofunction:: paddle.v2.fluid.layers.topk +.. autofunction:: paddle.v2.fluid.layers.edit_distance :noindex: +l2_normalize +------------ -lod_tensor_to_array -------------------- -.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array +.. autofunction:: paddle.v2.fluid.layers.l2_normalize :noindex: +matmul +------ - -array_to_lod_tensor -------------------- -.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor +.. autofunction:: paddle.v2.fluid.layers.matmul :noindex: +warpctc +------- +.. autofunction:: paddle.v2.fluid.layers.warpctc + :noindex: +sequence_reshape +---------------- -fill_constant -------------- -.. autofunction:: paddle.v2.fluid.layers.fill_constant +.. autofunction:: paddle.v2.fluid.layers.sequence_reshape :noindex: +transpose +--------- +.. 
autofunction:: paddle.v2.fluid.layers.transpose + :noindex: -fill_constant_batch_size_like ------------------------------ -.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like +im2sequence +----------- + +.. autofunction:: paddle.v2.fluid.layers.im2sequence :noindex: +nce +--- -ones ----- -.. autofunction:: paddle.v2.fluid.layers.ones +.. autofunction:: paddle.v2.fluid.layers.nce :noindex: +beam_search +----------- -zeros ------ -.. autofunction:: paddle.v2.fluid.layers.zeros +.. autofunction:: paddle.v2.fluid.layers.beam_search :noindex: +row_conv +-------- -increment ---------- -.. autofunction:: paddle.v2.fluid.layers.increment +.. autofunction:: paddle.v2.fluid.layers.row_conv :noindex: +multiplex +--------- -array_write ------------ -.. autofunction:: paddle.v2.fluid.layers.array_write +.. autofunction:: paddle.v2.fluid.layers.multiplex :noindex: +ops +=== +mean +---- -create_array ------------- -.. autofunction:: paddle.v2.fluid.layers.create_array +.. autofunction:: paddle.v2.fluid.layers.mean :noindex: +mul +--- -less_than ---------- -.. autofunction:: paddle.v2.fluid.layers.less_than +.. autofunction:: paddle.v2.fluid.layers.mul :noindex: +reshape +------- -array_read ----------- -.. autofunction:: paddle.v2.fluid.layers.array_read +.. autofunction:: paddle.v2.fluid.layers.reshape :noindex: +scale +----- -shrink_memory --------------- -.. autofunction:: paddle.v2.fluid.layers.shrink_memory +.. autofunction:: paddle.v2.fluid.layers.scale :noindex: +sigmoid_cross_entropy_with_logits +--------------------------------- -array_length -------------- -.. autofunction:: paddle.v2.fluid.layers.array_length +.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits :noindex: +elementwise_add +--------------- -conv2d_transpose ----------------- -.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose +.. autofunction:: paddle.v2.fluid.layers.elementwise_add :noindex: - -sequence_expand +elementwise_div --------------- -.. 
autofunction:: paddle.v2.fluid.layers.sequence_expand + +.. autofunction:: paddle.v2.fluid.layers.elementwise_div :noindex: +elementwise_sub +--------------- -gru_unit --------- -.. autofunction:: paddle.v2.fluid.layers.gru_unit +.. autofunction:: paddle.v2.fluid.layers.elementwise_sub :noindex: +elementwise_mul +--------------- -lstm_unit ---------- -.. autofunction:: paddle.v2.fluid.layers.lstm_unit +.. autofunction:: paddle.v2.fluid.layers.elementwise_mul :noindex: +elementwise_max +--------------- -sequence_softmax ----------------- -.. autofunction:: paddle.v2.fluid.layers.sequence_softmax +.. autofunction:: paddle.v2.fluid.layers.elementwise_max :noindex: +elementwise_min +--------------- -reduce_sum ----------- -.. autofunction:: paddle.v2.fluid.layers.reduce_sum +.. autofunction:: paddle.v2.fluid.layers.elementwise_min :noindex: +elementwise_pow +--------------- -reduce_mean ------------ -.. autofunction:: paddle.v2.fluid.layers.reduce_mean +.. autofunction:: paddle.v2.fluid.layers.elementwise_pow :noindex: +clip +---- -reduce_max ----------- -.. autofunction:: paddle.v2.fluid.layers.reduce_max +.. autofunction:: paddle.v2.fluid.layers.clip :noindex: +clip_by_norm +------------ -reduce_min ----------- -.. autofunction:: paddle.v2.fluid.layers.reduce_min +.. autofunction:: paddle.v2.fluid.layers.clip_by_norm :noindex: +sequence_softmax +---------------- -split ------ -.. autofunction:: paddle.v2.fluid.layers.split +.. autofunction:: paddle.v2.fluid.layers.sequence_softmax :noindex: +sigmoid +------- -matmul ------- -.. autofunction:: paddle.v2.fluid.layers.matmul +.. autofunction:: paddle.v2.fluid.layers.sigmoid :noindex: logsigmoid ---------- + .. autofunction:: paddle.v2.fluid.layers.logsigmoid :noindex: exp --- + .. autofunction:: paddle.v2.fluid.layers.exp :noindex: relu ---- + .. autofunction:: paddle.v2.fluid.layers.relu :noindex: tanh ---- + .. autofunction:: paddle.v2.fluid.layers.tanh :noindex: tanh_shrink ----------- + .. 
autofunction:: paddle.v2.fluid.layers.tanh_shrink :noindex: softshrink ---------- + .. autofunction:: paddle.v2.fluid.layers.softshrink :noindex: sqrt ---- + .. autofunction:: paddle.v2.fluid.layers.sqrt :noindex: abs ----- +--- + .. autofunction:: paddle.v2.fluid.layers.abs :noindex: ceil ---- + .. autofunction:: paddle.v2.fluid.layers.ceil :noindex: floor ----- + .. autofunction:: paddle.v2.fluid.layers.floor :noindex: round ----- + .. autofunction:: paddle.v2.fluid.layers.round :noindex: reciprocal ---------- + .. autofunction:: paddle.v2.fluid.layers.reciprocal :noindex: log --- + .. autofunction:: paddle.v2.fluid.layers.log :noindex: square ------ + .. autofunction:: paddle.v2.fluid.layers.square :noindex: softplus -------- + .. autofunction:: paddle.v2.fluid.layers.softplus :noindex: softsign ---------- +-------- + .. autofunction:: paddle.v2.fluid.layers.softsign :noindex: brelu ----- + .. autofunction:: paddle.v2.fluid.layers.brelu :noindex: leaky_relu ---------- + .. autofunction:: paddle.v2.fluid.layers.leaky_relu :noindex: soft_relu --------- + .. autofunction:: paddle.v2.fluid.layers.soft_relu :noindex: elu ----- +--- + .. autofunction:: paddle.v2.fluid.layers.elu :noindex: relu6 ----- + .. autofunction:: paddle.v2.fluid.layers.relu6 :noindex: pow ----- +--- + .. autofunction:: paddle.v2.fluid.layers.pow :noindex: +stanh +----- + +.. autofunction:: paddle.v2.fluid.layers.stanh + :noindex: + hard_shrink ----------- + .. autofunction:: paddle.v2.fluid.layers.hard_shrink :noindex: thresholded_relu ---------------- + .. autofunction:: paddle.v2.fluid.layers.thresholded_relu :noindex: hard_sigmoid -------------- +------------ + .. autofunction:: paddle.v2.fluid.layers.hard_sigmoid :noindex: swish ------- +----- + .. autofunction:: paddle.v2.fluid.layers.swish :noindex: -im2sequence +tensor +====== + +create_tensor +------------- + +.. autofunction:: paddle.v2.fluid.layers.create_tensor + :noindex: + +create_parameter +---------------- + +.. 
autofunction:: paddle.v2.fluid.layers.create_parameter + :noindex: + +create_global_var +----------------- + +.. autofunction:: paddle.v2.fluid.layers.create_global_var + :noindex: + +cast +---- + +.. autofunction:: paddle.v2.fluid.layers.cast + :noindex: + +concat ------ -.. autofunction:: paddle.v2.fluid.layers.im2sequence + +.. autofunction:: paddle.v2.fluid.layers.concat :noindex: -edit_distance ---------------- -.. autofunction:: paddle.v2.fluid.layers.edit_distance_error +sums +---- + +.. autofunction:: paddle.v2.fluid.layers.sums :noindex: -ctc_greedy_decoder ---------------- -.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder +assign +------ + +.. autofunction:: paddle.v2.fluid.layers.assign :noindex: -l2_normalize ------------- -.. autofunction:: paddle.v2.fluid.layers.l2_normalize +fill_constant_batch_size_like +----------------------------- + +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like :noindex: -sequence_reshape ----------------- -.. autofunction:: paddle.v2.fluid.layers.sequence_reshape +fill_constant +------------- + +.. autofunction:: paddle.v2.fluid.layers.fill_constant :noindex: -row_conv --------- -.. autofunction:: paddle.v2.fluid.layers.row_conv +ones +---- + +.. autofunction:: paddle.v2.fluid.layers.ones :noindex: -multiplex ---------- -.. autofunction:: paddle.v2.fluid.layers.multiplex +zeros +----- + +.. autofunction:: paddle.v2.fluid.layers.zeros :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst index 500019bc507f859c4c91de5d322a82eb1e78e2de..015581b7660848bdb0845fafe2d3fc05405e6ae6 100644 --- a/doc/api/v2/fluid/nets.rst +++ b/doc/api/v2/fluid/nets.rst @@ -1,33 +1,31 @@ -=========== -Nets -=========== +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +==== +nets +==== simple_img_conv_pool -------------------- -.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool - :noindex: - -img_conv_group ---------------- -.. 
autofunction:: paddle.v2.fluid.nets.img_conv_group +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool :noindex: - sequence_conv_pool ------------------ + .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool :noindex: - glu --- + .. autofunction:: paddle.v2.fluid.nets.glu :noindex: - scaled_dot_product_attention ---------------------------- + .. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention :noindex: diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst index 19b4940f08de3e2f7dc177f2961e538946d10a78..1691ebb9a7cb16da96e04147d0adea322374f529 100644 --- a/doc/api/v2/fluid/optimizer.rst +++ b/doc/api/v2/fluid/optimizer.rst @@ -1,54 +1,49 @@ -=========== -Optimizer -=========== - -Optimizer ------------ -.. automodule:: paddle.v2.fluid.optimizer - :members: Optimizer - :noindex: +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! +========= +optimizer +========= -SGDOptimizer ------------ -.. automodule:: paddle.v2.fluid.optimizer - :members: SGDOptimizer - :noindex: +SGD +--- +.. autoclass:: paddle.v2.fluid.optimizer.SGD + :members: + :noindex: +Momentum +-------- -MomentumOptimizer ------------------ -.. automodule:: paddle.v2.fluid.optimizer - :members: MomentumOptimizer +.. autoclass:: paddle.v2.fluid.optimizer.Momentum + :members: :noindex: +Adagrad +------- - -AdagradOptimizer ----------------- -.. automodule:: paddle.v2.fluid.optimizer - :members: AdagradOptimizer +.. autoclass:: paddle.v2.fluid.optimizer.Adagrad + :members: :noindex: +Adam +---- -AdamOptimizer -------------- -.. automodule:: paddle.v2.fluid.optimizer - :members: AdamOptimizer +.. autoclass:: paddle.v2.fluid.optimizer.Adam + :members: :noindex: +Adamax +------ -AdamaxOptimizer ------------ -.. automodule:: paddle.v2.fluid.optimizer - :members: AdamaxOptimizer +.. 
autoclass:: paddle.v2.fluid.optimizer.Adamax + :members: :noindex: +DecayedAdagrad +-------------- -DecayedAdagradOptimizer ------------------------ -.. automodule:: paddle.v2.fluid.optimizer - :members: DecayedAdagradOptimizer +.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad + :members: :noindex: diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst index ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50..8083d0d858dafcd275eaddb9b475875ee42ef724 100644 --- a/doc/api/v2/fluid/param_attr.rst +++ b/doc/api/v2/fluid/param_attr.rst @@ -1,11 +1,21 @@ -=========== +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +========== +param_attr +========== + ParamAttr -=========== +--------- +.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr + :members: + :noindex: +WeightNormParamAttr +------------------- -ParamAttr ------------ -.. automodule:: paddle.v2.fluid.param_attr - :members: ParamAttr +.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr + :members: :noindex: + diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst index 7d4042d1f41c12c4a551ba6576559d612116872a..4a1ff7cb6976e0054f77428b699ea679aa91394f 100644 --- a/doc/api/v2/fluid/profiler.rst +++ b/doc/api/v2/fluid/profiler.rst @@ -1,10 +1,25 @@ -=========== -Profiler -=========== +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! +======== +profiler +======== +cuda_profiler +------------- -Profiler ------------ .. autofunction:: paddle.v2.fluid.profiler.cuda_profiler :noindex: + +reset_profiler +-------------- + +.. autofunction:: paddle.v2.fluid.profiler.reset_profiler + :noindex: + +profiler +-------- + +.. 
autofunction:: paddle.v2.fluid.profiler.profiler + :noindex: + diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst index 868e225ed3d59e79aeb217fb88081ea25f80fa2c..2c17d15599baa1d02eb87c7b6c40034769ebb3a4 100644 --- a/doc/api/v2/fluid/regularizer.rst +++ b/doc/api/v2/fluid/regularizer.rst @@ -1,25 +1,27 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + =========== -Regularizer +regularizer =========== -WeightDecayRegularizer ----------------------- -.. automodule:: paddle.v2.fluid.regularizer - :members: WeightDecayRegularizer - :noindex: - +append_regularization_ops +------------------------- -L2DecayRegularizer ------------------- -.. automodule:: paddle.v2.fluid.regularizer - :members: L2DecayRegularizer +.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops :noindex: +L1Decay +------- +.. autoclass:: paddle.v2.fluid.regularizer.L1Decay + :members: + :noindex: -L1DecayRegularizer -------------------- -.. automodule:: paddle.v2.fluid.regularizer - :members: L1DecayRegularizer +L2Decay +------- +.. autoclass:: paddle.v2.fluid.regularizer.L2Decay + :members: + :noindex: diff --git a/doc/design/speech/README.MD b/doc/design/speech/deep_speech_2.md similarity index 85% rename from doc/design/speech/README.MD rename to doc/design/speech/deep_speech_2.md index 7304650e628dba210488cd2dc4836318b5383b2a..cfdc4d6df04344c70d3334626bd38eca997c31ff 100644 --- a/doc/design/speech/README.MD +++ b/doc/design/speech/deep_speech_2.md @@ -140,7 +140,19 @@ TODO by Assignees ### Beam Search with CTC and LM -TODO by Assignees +
+
+Figure 2. Algorithm for CTC Beam Search Decoder. +
+ +- The **Beam Search Decoder** for the DS2 CTC-trained network follows an approach similar to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts: + - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths; + - 2) the if condition ```if l^+ not in A_prev then``` after the probabilities' computation is dropped, because it is hard to understand and seems unnecessary. +- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a white space is appended in English decoding or any character is appended in Mandarin decoding. +- Such an external scorer consists of a language model, a word count, or any other custom scorers. +- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (cf. Task 7). +- This decoder needs to perform with **high efficiency** for the convenience of parameter tuning and real-world speech recognition. + ## Future Work @@ -153,3 +165,4 @@ TODO by Assignees 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. +3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). 
arXiv:1408.2873 diff --git a/doc/design/speech/image/beam_search.png b/doc/design/speech/image/beam_search.png new file mode 100644 index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae Binary files /dev/null and b/doc/design/speech/image/beam_search.png differ diff --git a/doc/design/switch.md b/doc/design/switch.md new file mode 100644 index 0000000000000000000000000000000000000000..9db1b2782a521c2ff4b28b8f9efcdf1492242ed4 --- /dev/null +++ b/doc/design/switch.md @@ -0,0 +1,32 @@ +### Design Doc: Switch + +### Background + +Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. + +The following example shows the usage of `fluid.switch`. + +```python +a = fluid.Var(10) +b = fluid.Var(0) + +switch = fluid.switch() +with switch.block(): + with switch.case(fluid.less_equal(a, 10)): + fluid.print("Case 1") + with switch.case(fluid.larger(a, 0)): + fluid.print("Case 2") + with switch.default(): + fluid.print("Case 3") +``` + +### The Semantics + +1. A `switch` control-flow checks cases one-by-one. +1. The condition of each case is a scalar boolean value. This differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values. +1. It runs the first matched case, or the default case (if there is one) when no case matches. +1. Once it matches a case, it runs the corresponding branch and only that branch. It is as if there were a C `break` keyword at the end of each case. + +The above program should print "Case 1" and nothing else. + +The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch. 
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst index 71904dc41ed0d946867d890cc585e1b88450ca8c..ff904b1022a41612c9680dce92d3fc2c69ad7e93 100644 --- a/doc/getstarted/build_and_install/build_from_source_cn.rst +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" - "WITH_TESTING", "是否开启单元测试", "ON" + "WITH_TESTING", "是否开启单元测试", "OFF" "WITH_DOC", "是否编译中英文文档", "OFF" "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst index 27f73b2e2c029b41d514e1612912ed1c335605b6..718fb869c23a1f7be82c87c726282bded9dad516 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.rst +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like: "WITH_AVX", "Build with AVX support", "ON" "WITH_PYTHON", "Build with integrated Python interpreter", "ON" "WITH_STYLE_CHECK", "Check code style when building", "ON" - "WITH_TESTING", "Build unit tests", "ON" + "WITH_TESTING", "Build unit tests", "OFF" "WITH_DOC", "Build documentations", "OFF" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 98fada7bdb46f4dd2927d6f93bcbcebbe7d18604..79d214635a069a739060e0b79424729f6ff90387 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -95,6 +95,12 @@ 
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note docker run -p 8888:8888 paddlepaddle/book +国内用户可以使用下面的镜像源来加速访问: + + .. code-block:: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + 然后在浏览器中输入以下网址: .. code-block:: text diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index b1d0890b4cdddb77114a80276130afd07c22d270..e0e0559fb858a093db96a9b4ec1c5a45d6c71a38 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command: docker run -p 8888:8888 paddlepaddle/book +For users in China, we provide a faster mirror: + + .. code-block:: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + Then, you would back and paste the address into the local browser: .. code-block:: text diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md index c2fc86687d7106aac7c74d6dd16bc229353cb7c1..0f3db59607fb6b43da01f5fdb46949087517ed6c 100644 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ b/doc/howto/usage/cluster/cluster_train_cn.md @@ -92,11 +92,11 @@ paddle.init( 参数说明 - use_gpu: **可选,默认False**,是否启用GPU训练 -- trainer_count:**必选,默认1**,当前训练任务trainer总个数 +- trainer_count:**必选,默认1**,当前trainer的线程数目 - port:**必选,默认7164**,连接到pserver的端口 - ports_num:**必选,默认1**,连接到pserver的端口个数 - ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 -- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 +- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数 - trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 - pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index 28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97..f9424f8f1a29fcf001c4e7976086512b22f6e858 100644 --- a/doc/howto/usage/cluster/cluster_train_en.md +++
b/doc/howto/usage/cluster/cluster_train_en.md @@ -95,11 +95,11 @@ paddle.init( Parameter Description - use_gpu: **optional, default False**, set to "True" to enable GPU training. -- trainer_count: **required, default 1**, total count of trainers in the training job. +- trainer_count: **required, default 1**, number of threads in current trainer. - port: **required, default 7164**, port to connect to parameter server. - ports_num: **required, default 1**, number of ports for communication. - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. -- num_gradient_servers: **required, default 1**, total number of gradient server. +- num_gradient_servers: **required, default 1**, number of trainers in current job. - trainer_id: **required, default 0**, ID for every trainer, start from 0. - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". diff --git a/doc/index_cn.rst b/doc/index_cn.rst index ada51c2d73263898b2c748437f8eb0f30b537073..9279bac7f4b2898c18979630a8d6dfcb2dba70e0 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -8,4 +8,3 @@ PaddlePaddle 文档 howto/index_cn.rst api/index_cn.rst faq/index_cn.rst - mobile/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 23b64b6cadf776d44c4d0aa5a550ffe24be13b18..64684b8b9b27e245c6b32ea28809d3bbce22fab9 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -7,4 +7,3 @@ PaddlePaddle Documentation getstarted/index_en.rst howto/index_en.rst api/index_en.rst - mobile/index_en.rst diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst deleted file mode 100644 index 1d99666e58b7043b85b0203ee0dfcd1957710161..0000000000000000000000000000000000000000 --- a/doc/mobile/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -MOBILE -====== - -.. 
toctree:: - :maxdepth: 1 - - cross_compiling_for_android_cn.md - cross_compiling_for_ios_cn.md - cross_compiling_for_raspberry_cn.md diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst deleted file mode 100644 index ef421dacad458828cadf8cf505375d6c4bfd9dde..0000000000000000000000000000000000000000 --- a/doc/mobile/index_en.rst +++ /dev/null @@ -1,9 +0,0 @@ -MOBILE -====== - -.. toctree:: - :maxdepth: 1 - - cross_compiling_for_android_en.md - cross_compiling_for_ios_en.md - cross_compiling_for_raspberry_en.md diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index d394fa5d10d502d8fadbb48b6b85e4884f20b70d..a2a0be08d9425cdd8cce374aecd097085491d4c0 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) -nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) cc_test(variable_test SRCS variable_test.cc) diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h index 70ecccc1a1078374f3190b3956103ed8000c4fc5..b679387b1124e42499df158767b6c7afe1afd0c6 100644 --- a/paddle/framework/channel.h +++ b/paddle/framework/channel.h @@ -23,12 +23,10 @@ namespace framework { template class Channel { public: - virtual void Send(T*) = 0; - virtual void Receive(T*) = 0; + virtual bool Send(T*) = 0; + virtual bool Receive(T*) = 0; virtual size_t Cap() = 0; - - // Don't delete channels; instead, call Channel::Close. 
- protected: + virtual void Close() = 0; virtual ~Channel() {} }; @@ -50,11 +48,7 @@ Channel* MakeChannel(size_t buffer_size) { template void CloseChannel(Channel* ch) { - if (ch->Cap() > 0) { - delete dynamic_cast*>(ch); - } else { - delete dynamic_cast*>(ch); - } + ch->Close(); } } // namespace framework diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc index 9efc0172658c800d14102531332dbef68fa392f4..444d68498c9676fe0e246167dfacbe999a41d1a7 100644 --- a/paddle/framework/channel_test.cc +++ b/paddle/framework/channel_test.cc @@ -14,13 +14,329 @@ limitations under the License. */ #include "paddle/framework/channel.h" +#include +#include + #include "gtest/gtest.h" +using paddle::framework::Channel; +using paddle::framework::MakeChannel; +using paddle::framework::CloseChannel; + TEST(Channel, MakeAndClose) { - using paddle::framework::Channel; - using paddle::framework::MakeChannel; - using paddle::framework::CloseChannel; + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + { + // MakeChannel should return a buffered channel is buffer_size > 0. + auto ch = MakeChannel(10); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } + { + // MakeChannel should return an un-buffered channel is buffer_size = 0. 
+ auto ch = MakeChannel(0); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } +} + +TEST(Channel, SufficientBufferSizeDoesntBlock) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // should not block + EXPECT_EQ(out, i); + } + CloseChannel(ch); + delete ch; +} + +TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + size_t sum = 0; + std::thread t([&]() { + // Try to write more than buffer size. + for (size_t i = 0; i < 2 * buffer_size; ++i) { + if (i < buffer_size) + EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations + else + EXPECT_EQ(ch->Send(&i), false); + sum += i; + } + }); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + EXPECT_EQ(sum, 45U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, SimpleUnbufferedChannelTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + EXPECT_EQ(ch->Send(&i), true); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + EXPECT_EQ(ch->Receive(&recv), true); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = 
std::thread( + [&](bool *p) { + int data; + // All reads should return false + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the channel + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that atleast 4 threads are blocked + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (thread_ended[i] == false) ct++; + } + // Atleast 4 threads must be blocked + EXPECT_GE(ct, 4); + + // Explicitly close the thread + // This should unblock all senders + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + // Verify 
that only 1 send was successful + ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(0); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked becausew of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the thread + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any senders waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(0); + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked becausew of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data = 10; + EXPECT_EQ(ch->Send(&data), false); + *p = true; + }, + &thread_ended[i]); + } + 
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the thread + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); + delete ch; +} + +TEST(Channel, UnbufferedLessReceiveMoreSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + // Send should block after three iterations + // since we only have three receivers. + std::thread t([&]() { + // Try to send more number of times + // than receivers + for (int i = 0; i < 4; i++) { + ch->Send(&i); + sum_send += i; + } + }); + for (int i = 0; i < 3; i++) { + int recv; + ch->Receive(&recv); + EXPECT_EQ(recv, i); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + EXPECT_EQ(sum_send, 3U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, UnbufferedMoreReceiveLessSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + unsigned sum_receive = 0; + // The receiver should block after 5 + // iterations, since there are only 5 senders. + std::thread t([&]() { + for (int i = 0; i < 8; i++) { + int recv; + ch->Receive(&recv); // should block after the fifth iteration. 
+ EXPECT_EQ(recv, i); + sum_receive += i; + } + }); + for (int i = 0; i < 5; i++) { + ch->Send(&i); + sum_send += i; + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + EXPECT_EQ(sum_send, 10U); + EXPECT_EQ(sum_receive, 10U); + // send three more elements + for (int i = 5; i < 8; i++) { + ch->Send(&i); + sum_send += i; + } - Channel* ch = MakeChannel(10); CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 28U); + EXPECT_EQ(sum_receive, 28U); + delete ch; } diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h index 572e29d44a3baec84a029d87f9b0874784aa761b..7ac234b8d42bae0661c3256c78311455c0fbc77c 100644 --- a/paddle/framework/details/buffered_channel.h +++ b/paddle/framework/details/buffered_channel.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/framework/channel.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace framework { @@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel { friend void paddle::framework::CloseChannel(Channel*); public: - virtual void Send(T*); - virtual void Receive(T*); + virtual bool Send(T*); + virtual bool Receive(T*); virtual size_t Cap() { return cap_; } + virtual void Close(); + virtual ~Buffered(); private: size_t cap_; @@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel { std::condition_variable empty_cond_var_; std::condition_variable full_cond_var_; std::deque channel_; + bool closed_; - Buffered(size_t cap) : cap_(cap) {} - virtual ~Buffered(); + Buffered(size_t cap) : cap_(cap), closed_(false) { + PADDLE_ENFORCE_GT(cap, 0); + } - void NotifyAllSenders(std::unique_lock*); + void NotifyAllParticipants(std::unique_lock*); }; template -void Buffered::Send(T* item) { +bool Buffered::Send(T* item) { + std::unique_lock lock(mu_); + full_cond_var_.wait(lock, + [this]() { return channel_.size() < cap_ || closed_; }); + bool ret = false; + if (!closed_) { + 
channel_.push_back(std::move(*item)); + lock.unlock(); + empty_cond_var_.notify_one(); + ret = true; + } + return ret; +} + +template +bool Buffered::Receive(T* item) { std::unique_lock lock(mu_); - full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; }); - channel_.push_back(std::move(*item)); - lock.unlock(); - empty_cond_var_.notify_one(); + empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); + bool ret = false; + if (!closed_) { + *item = std::move(channel_.front()); + channel_.pop_front(); + full_cond_var_.notify_one(); + ret = true; + } + return ret; } template -void Buffered::Receive(T* item) { +void Buffered::Close() { std::unique_lock lock(mu_); - empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); }); - *item = std::move(channel_.front()); - channel_.pop_front(); - NotifyAllSenders(&lock); + closed_ = true; + NotifyAllParticipants(&lock); } template Buffered::~Buffered() { std::unique_lock lock(mu_); + closed_ = true; channel_.clear(); - NotifyAllSenders(&lock); + NotifyAllParticipants(&lock); } template -void Buffered::NotifyAllSenders(std::unique_lock* lock) { +void Buffered::NotifyAllParticipants(std::unique_lock* lock) { lock->unlock(); - full_cond_var_.notify_one(); + full_cond_var_.notify_all(); + empty_cond_var_.notify_all(); } } // namespace details diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h index 7ecced1fba88fea781fc342091bc71e5aa496d3a..f86a894bb4a42e45edf6964e30620b68183faaa8 100644 --- a/paddle/framework/details/unbuffered_channel.h +++ b/paddle/framework/details/unbuffered_channel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include -#include #include #include "paddle/framework/channel.h" @@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel { friend void paddle::framework::CloseChannel(Channel*); public: - virtual void Send(T*); - virtual void Receive(T*); + virtual bool Send(T*); + virtual bool Receive(T*); virtual size_t Cap() { return 0; } + virtual void Close(); + virtual ~UnBuffered(); private: - UnBuffered() {} - virtual ~UnBuffered(); + std::mutex mu_ch_; + // Mutex for readers and writers who are waiting for other reader + // and writer to complete execution + std::recursive_mutex mu_read_, mu_write_; + // reader_found_ is set true when a reader is ready to accept data + // writer_found_ is set true when a writer is ready to send data + // A transaction occurs only when both are true + std::atomic reader_found_{false}, writer_found_{false}; + std::condition_variable cv_channel_; + std::condition_variable_any cv_reader_, cv_writer_; + T* item{nullptr}; + std::atomic closed_{false}; + + UnBuffered() : closed_(false) {} + + void NotifyAllParticipants(std::unique_lock*); }; +// This function implements the concept of how data should +// be sent from a writer to a reader. 
+template +bool UnBuffered::Send(T* data) { + // Prevent other writers from entering + std::unique_lock writer_lock(mu_write_); + writer_found_ = true; + std::unique_lock cv_lock(mu_write_); + // If writer comes first, it should wait till a reader arrives + cv_writer_.wait(cv_lock, + [this]() { return reader_found_ == true || closed_; }); + cv_reader_.notify_one(); + bool ret = false; + if (!closed_) { + std::unique_lock channel_lock(mu_ch_); + item = data; + channel_lock.unlock(); + cv_channel_.notify_one(); + channel_lock.lock(); + cv_channel_.wait(channel_lock, + [this]() { return item == nullptr || closed_; }); + ret = true; + } + writer_found_ = false; + return ret; +} + +// This function implements the concept of how +// data that was sent by a writer is read from a reader. +template +bool UnBuffered::Receive(T* data) { + // Prevent other readers from entering + std::unique_lock read_lock{mu_read_}; + reader_found_ = true; + std::unique_lock cv_lock{mu_read_}; + // If reader comes first, it should wait till a writer arrives + cv_reader_.wait(cv_lock, + [this]() { return writer_found_ == true || closed_; }); + cv_writer_.notify_one(); + bool ret = false; + if (!closed_) { + std::unique_lock lock_ch{mu_ch_}; + // Reader should wait for the writer to first write its data + cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); + if (!closed_) { + *data = std::move(*item); + item = nullptr; + lock_ch.unlock(); + ret = true; + } + cv_channel_.notify_one(); + } + reader_found_ = false; + return ret; +} + +// This function implements the sequence of events +// that take place once the channel is closed. template -void UnBuffered::Send(T* channel_element) {} +void UnBuffered::Close() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} +// This function implements the sequence of events +// that are executed once the object of an UnBuffered +// channel is destroyed. 
template -void UnBuffered::Receive(T*) {} +UnBuffered::~UnBuffered() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} +// This function notifies all the readers, writers and +// the channel condition variables. template -UnBuffered::~UnBuffered() {} +void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + cv_writer_.notify_all(); + cv_channel_.notify_all(); + cv_reader_.notify_all(); +} } // namespace details } // namespace framework diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index cbf3ec75265fa74aaacffee684b7b7d5f73b7c02..9a232b08434d299d10bb2acdb6e96295de875d56 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/platform/place.h" #include "paddle/platform/profiler.h" -DECLARE_bool(do_memory_benchmark); +DECLARE_bool(benchmark); DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. 
It will be " "extremely slow so please use this flag wisely."); @@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { -const std::string kFeedOpType = "feed"; -const std::string kFetchOpType = "fetch"; - Executor::Executor(const platform::Place& place) : place_(place) {} static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { @@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, op->Run(*local_scope, place_); VLOG(3) << op->DebugStringEx(local_scope); - if (FLAGS_do_memory_benchmark) { + if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); } @@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); } - if (FLAGS_do_memory_benchmark) { + if (FLAGS_benchmark) { VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "Memory used after deleting local scope: " << memory::memory_usage(place_); diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h index 9bc4a90c44828ecb7458d524f59609f01848cc5c..168f456675af508df86dd0520cdeb5d16d94ad31 100644 --- a/paddle/framework/feed_fetch_type.h +++ b/paddle/framework/feed_fetch_type.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include "paddle/framework/lod_tensor.h" @@ -20,5 +21,8 @@ namespace paddle { namespace framework { using FeedFetchType = LoDTensor; using FeedFetchList = std::vector; + +static const std::string kFeedOpType = "feed"; +static const std::string kFetchOpType = "fetch"; } // namespace framework } // namespace paddle diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 4ef82a541efaa35bcf831d5122570154f2fa2423..3f6ea121b3994979d89a7d5a8c20c59240a0c111 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include // for strdup #include +#include #include #include "paddle/framework/init.h" @@ -46,17 +47,23 @@ void InitDevices() { std::vector places; places.emplace_back(platform::CPUPlace()); + int count = 0; #ifdef PADDLE_WITH_CUDA - int count = platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - places.emplace_back(platform::CUDAPlace(i)); + try { + count = platform::GetCUDADeviceCount(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } #else LOG(WARNING) - << "'GPU' is not supported, Please re-compile with WITH_GPU option"; + << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif + for (int i = 0; i < count; ++i) { + places.emplace_back(platform::CUDAPlace(i)); + } + platform::DeviceContextPool::Init(places); } diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc index f837a965d3be7d40c20803ae4462b3bfd91bffd0..01e076dd8ea24831e3ed7c8a7f8fae6818a89335 100644 --- a/paddle/framework/init_test.cc +++ b/paddle/framework/init_test.cc @@ -20,7 +20,21 @@ TEST(InitDevices, CPU) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; +#ifndef PADDLE_WITH_CUDA InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); - 
ASSERT_GE(pool.size(), 1U); + ASSERT_EQ(pool.size(), 1U); +#endif +} + +TEST(InitDevices, CUDA) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_CUDA + int count = paddle::platform::GetCUDADeviceCount(); + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif } diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 53b0d0fe083579da4f0bb600f292765aa2aa0d8a..cb27de6991674247e6215ce64a2da5000fa78ed4 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -24,8 +24,6 @@ limitations under the License. */ #include #include -#include - namespace paddle { namespace framework { diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 9d1294fdeb9bd76bf944f7ec3687e3c5bb333241..d0ab640485baf6d76ee629ea420b603f42b031b4 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -18,11 +18,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include -#include #endif #include #include "paddle/framework/ddim.h" +#include "paddle/framework/mixed_vector.h" #include "paddle/framework/tensor.h" #include "paddle/framework/tensor_util.h" #include "paddle/platform/enforce.h" @@ -31,15 +31,6 @@ limitations under the License. */ namespace paddle { namespace framework { -#ifndef PADDLE_WITH_CUDA -template -using Vector = std::vector; -#else -template -using Vector = thrust::host_vector< - T, thrust::system::cuda::experimental::pinned_allocator>; -#endif - /* * LoD is short for Level of Details. 
* @@ -55,7 +46,15 @@ using Vector = thrust::host_vector< * 0 2 4 7 * 0 2 5 7 10 12 15 20 */ -using LoD = std::vector>; +struct LoD : public std::vector> { + using std::vector>::vector; + + void CopyFromCUDA() { + for (auto it = this->begin(); it != this->end(); ++it) { + it->CopyFromCUDA(); + } + } +}; std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoDTensor& t); @@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); */ class LoDTensor : public Tensor { public: - LoDTensor() {} + LoDTensor() : Tensor() {} + + /* Constructor with place should only be used in pybind */ + explicit LoDTensor(const platform::Place& place) : Tensor(place) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {} diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 4d172c43c7cceacb7d0dfaf1c4d3028717350268..3b63020e685436396071fa05cd7697630ae56c95 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -23,6 +23,17 @@ namespace paddle { namespace framework { +TEST(LoD, data) { + LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i); + } +} + TEST(LodExpand, test) { LoD lod{{0, 2}}; LoDTensor tensor; diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu index 1e253a2f6f35e827fb2e5db6270da03705b39514..d4c9f00bd9c00f3cae68858ca46c5320fc117405 100644 --- a/paddle/framework/lod_tensor_test.cu +++ b/paddle/framework/lod_tensor_test.cu @@ -14,6 +14,8 @@ #include #include +#include +#include "paddle/framework/init.h" #include "paddle/framework/lod_tensor.h" #include "paddle/platform/assert.h" @@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) { } } +TEST(Vector, Normal) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace 
paddle::memory; + + paddle::framework::InitDevices(); + + paddle::framework::Vector vec({1, 2, 3}); + size_t* ptr = vec.data(); + for (size_t i = 0; i < vec.size(); ++i) { + EXPECT_EQ(vec[i], *(ptr + i)); + } + + vec.clear(); + vec.CopyFromCUDA(); + + std::vector v = {1, 2, 3}; + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], vec[i]); + } +} + +TEST(LoD, data) { + paddle::framework::InitDevices(); + + paddle::framework::LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + test<<<1, 1>>>(v.cuda_data(), v.size()); + cudaDeviceSynchronize(); + + v.CopyFromCUDA(); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i * 2); + } +} + TEST(LoDTensor, LoDInGPU) { + paddle::framework::InitDevices(); + paddle::framework::LoDTensor lod_tensor; paddle::platform::CUDAPlace place(0); @@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) { auto lod = lod_tensor.lod(); - test<<<1, 8>>>(lod[0].data(), lod[0].size()); + test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size()); cudaDeviceSynchronize(); + lod.CopyFromCUDA(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..85caac8dcd9ede4fe997e2fd246d1421aa73c80a --- /dev/null +++ b/paddle/framework/mixed_vector.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/memory/memcpy.h" +#include "paddle/memory/memory.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace framework { + +/** + * @brief Vector support both cpu and gpu. + * host vector lifetime is same with Vector + * device vector is lazily malloc and modified. + */ + +template +class Vector : public std::vector { + public: + using std::vector::vector; + + Vector() {} + Vector(const std::vector &v) : std::vector(v) {} // NOLINT + + virtual ~Vector() { +#ifdef PADDLE_WITH_CUDA + if (cuda_ptr_ != nullptr) { + memory::Free(place_, cuda_ptr_); + } +#endif + } + + /* Get device vector */ + T *cuda_data() { + CopyToCUDA(); + PADDLE_ENFORCE_NOT_NULL( + cuda_ptr_, "No data or Insufficient CUDA memory to allocation"); + return static_cast(cuda_ptr_); + } + + /* Get host vector */ + T *data() { return std::vector::data(); } + const T *data() const { return std::vector::data(); } + + /* Synchronize host vector to device vector */ + void CopyToCUDA(); + /* Synchronize device vector to host vector */ + void CopyFromCUDA(); + /* Switch device vector location */ + void CopyToPeer(platform::Place); + + private: + void *cuda_ptr_ = nullptr; + size_t cuda_size_ = 0; // device vector numel + platform::CUDAPlace place_; +}; + +template +void Vector::CopyToCUDA() { +#ifdef PADDLE_WITH_CUDA + if (cuda_size_ < this->size()) { + if (cuda_ptr_ != nullptr) { + memory::Free(place_, cuda_ptr_); + } + cuda_ptr_ = + memory::Alloc(place_, this->size() * sizeof(T)); + } + cuda_size_ = this->size(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *ctx = pool.GetByPlace(place_); + memory::Copy(place_, cuda_ptr_, platform::CPUPlace(), + static_cast(this->data()), + this->size() * sizeof(T), 
ctx->stream()); + ctx->Wait(); +#endif +} + +template +void Vector::CopyFromCUDA() { +#ifdef PADDLE_WITH_CUDA + if (cuda_ptr_ == nullptr) { + LOG(WARNING) << "No uncommitted cuda data."; + return; + } + this->resize(cuda_size_); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *ctx = pool.GetByPlace(place_); + memory::Copy(platform::CPUPlace(), static_cast(this->data()), place_, + static_cast(cuda_ptr_), this->size() * sizeof(T), + ctx->stream()); + ctx->Wait(); +#endif +} + +template +void Vector::CopyToPeer(platform::Place peer_place) { +#ifdef PADDLE_WITH_CUDA + auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_); + void *peer_cuda_ptr = memory::Alloc( + boost::get(peer_place), this->size() * sizeof(T)); + memory::Copy(boost::get(peer_place), peer_cuda_ptr, + place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream()); + ctx->Wait(); + + memory::Free(place_, cuda_ptr_); + place_ = boost::get(peer_place); + cuda_ptr_ = peer_cuda_ptr; +#endif +} + +template class Vector; +template class Vector; +template class Vector; +template class Vector; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index f8df2cf97ad532f06cb1393b1a24cd789f8bde29..f554c77845087453f8c6e4d04522a8555e583ae6 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool HasOutputs(const std::string &name) const override; - DDim GetInputDim(const std::string &name) const override; - - void SetOutputDim(const std::string &name, const DDim &dim) override; - AttrReader Attrs() const override; const std::vector &Inputs( @@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { return true; } -DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const { - std::vector ddims = GetInputsDim(name); - auto length = 
ddims.size(); - PADDLE_ENFORCE_EQ(length, 1UL, - "Input(%s) should have 1 value, " - "but it has %d now", - name, length); - return ddims[0]; -} - -void CompileTimeInferShapeContext::SetOutputDim(const std::string &name, - const DDim &dim) { - SetOutputsDim(name, {dim}); -} - AttrReader CompileTimeInferShapeContext::Attrs() const { return AttrReader(op_.GetAttrMap()); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 831b1e2a1e10777d9e89364adcd4b1f367e86080..81fa8cf477423fc2a54c719c9a743729215513c3 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,9 +22,7 @@ limitations under the License. */ #include "paddle/framework/shape_inference.h" #include "paddle/framework/var_type.h" -DEFINE_bool(op_sync, false, - "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); +DECLARE_bool(benchmark); namespace paddle { namespace framework { @@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext { return true; } - DDim GetInputDim(const std::string& name) const override { - return GetDim(op_.Input(name)); - } - - void SetOutputDim(const std::string& name, const DDim& dim) override { - SetDim(op_.Output(name), dim); - } - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } const std::vector& Inputs( @@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope, ExecutionContext(*this, new_scope, *new_dev_ctx)); /*For profiling/benchmark only*/ - if (FLAGS_op_sync) { + if (FLAGS_benchmark) { new_dev_ctx->Wait(); } } diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index b2368e3a27abe6382b7460222e3fccce6f1beb08..15ea4035c6e6193105b621210a900e74d1466941 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -14,13 +14,11 @@ limitations under the License. 
*/ #include "paddle/framework/program_desc.h" #include "paddle/framework/block_desc.h" +#include "paddle/framework/feed_fetch_type.h" namespace paddle { namespace framework { -const std::string kFeedOpType = "feed"; -const std::string kFetchOpType = "fetch"; - BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { auto *b = desc_.add_blocks(); b->set_parent_idx(parent.ID()); diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h index b9741b31393a474e06fd156a2f3354844d53187c..8e958eab6ee08436ca73b13bac010e66c7df2b8b 100644 --- a/paddle/framework/program_desc.h +++ b/paddle/framework/program_desc.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/framework/block_desc.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/proto_desc.h" #include "paddle/platform/macros.h" diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index a67ff910093d93060d07d849f6e968e5f4ce21cd..af08b2ab816f63c05d4c65df9601c787e57994f5 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -20,9 +20,11 @@ limitations under the License. */ #include "paddle/framework/threadpool.h" #include "paddle/string/printf.h" -DEFINE_bool(do_memory_benchmark, false, +DEFINE_bool(benchmark, false, "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs"); + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); namespace paddle { namespace framework { @@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) { PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); // When making memory benchmark on Fluid, we have to delete scope sync. 
- if (FLAGS_do_memory_benchmark) { + if (FLAGS_benchmark) { delete scope; } else { Async([scope] { delete scope; }); diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index e53cc0cdabc623ae358f1a3e21823a2f38ec3c62..a0fa467291bb42c59b65f5efeabe9c2235e15b2a 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -18,10 +18,18 @@ limitations under the License. */ namespace paddle { namespace framework { -std::vector InferShapeContext::GetInputsDim( +DDim InferShapeContext::GetInputDim(const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); +} + +std::vector InferShapeContext::GetInputsDim( const std::string &name) const { - const std::vector &names = Inputs(name); - return GetDims(names); + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); } DDim InferShapeContext::GetInputsElementDim(const std::string &name, @@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name, return this->GetDim(names[idx]); } -void InferShapeContext::SetOutputsDim( - const std::string &name, const std::vector &dims) { +void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); +} + +void InferShapeContext::SetOutputsDim(const std::string &name, + const std::vector &dims) { auto &names = Outputs(name); SetDims(names, dims); } -std::vector InferShapeContext::GetDims( +std::vector InferShapeContext::GetDims( const std::vector &names) const { - std::vector ret; + std::vector ret; ret.reserve(names.size()); std::transform( names.begin(), names.end(), 
std::back_inserter(ret), [this](const std::string &name) { return this->GetDim(name); }); return ret; } - void InferShapeContext::SetDims(const std::vector &names, - const std::vector &dims) { + const std::vector &dims) { size_t length = names.size(); PADDLE_ENFORCE_EQ(length, dims.size()); for (size_t i = 0; i < length; ++i) { diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index f93319d8f2fd4c5d388bd57fd595a6a5edd51775..830f199ed1451538f12fc8dd34fb7b2bfc356a71 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -35,14 +35,13 @@ class InferShapeContext { virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; - virtual framework::DDim GetInputDim(const std::string &name) const = 0; + DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; + std::vector GetInputsDim(const std::string &name) const; DDim GetInputsElementDim(const std::string &name, int idx) const; - virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; - void SetOutputsDim(const std::string &name, - const std::vector &dims); + void SetOutputDim(const std::string &name, const DDim &dim); + void SetOutputsDim(const std::string &name, const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -57,15 +56,13 @@ class InferShapeContext { // Note: In while op, we need this to be public void SetDims(const std::vector &names, - const std::vector &dims); + const std::vector &dims); protected: - virtual framework::DDim GetDim(const std::string &name) const = 0; - virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; - - std::vector GetDims( - const std::vector &names) const; + virtual DDim GetDim(const std::string &name) const = 0; + virtual void SetDim(const std::string &name, const DDim &dim) = 0; + std::vector GetDims(const std::vector 
&names) const; std::vector GetVarTypes( const std::vector &names) const; diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 4aaa29d794c95592832a1fe990e2dce274eba9d5..f0ea709a5c37e769e3ffa1b2e9d1e39721979251 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -47,6 +47,11 @@ class Tensor { public: Tensor() : offset_(0) {} + /*! Constructor with place should only be used in pybind. */ + explicit Tensor(const platform::Place& place) : offset_(0) { + holder_->set_place(place); + } + /*! Return a pointer to mutable memory block. */ template inline T* data(); @@ -137,6 +142,7 @@ class Tensor { virtual std::type_index type() const = 0; virtual platform::Place place() const = 0; virtual void set_type(std::type_index type) = 0; + virtual void set_place(platform::Place place) = 0; }; template @@ -156,6 +162,7 @@ class Tensor { virtual void* ptr() const { return static_cast(ptr_.get()); } virtual std::type_index type() const { return type_; } virtual void set_type(std::type_index type) { type_ = type; } + virtual void set_place(platform::Place place) { place_ = place; } /*! the pointer of memory block. 
*/ std::unique_ptr> ptr_; diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index cbdbf5335d32d55a0221728758025c9d2cb3e7d1..a9876cec2aabf7d116443b685391ee9d20bc1370 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -178,19 +178,22 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + real* colData = NULL; bool needIm2col = isNeedIm2col(filter); TensorShape imShape = TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape; - real* colData = NULL; - size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; - size_t colWidth = outputHeight * outputWidth; - // Max col matrix height 256, Max col matrix width 1024 - size_t stepColHeight = std::min(colHeight, static_cast(256)); - size_t stepColWidth = std::min(colWidth, static_cast(2048)); + // Max col matrix width 4096, Max col matrix size 4M. + size_t outputHeightSteps = + std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); + size_t maxColWidth = outputHeightSteps * outputWidth; + size_t channelSteps = + std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, + (size_t)1), + inputChannels / groups_); + size_t maxColHeight = channelSteps * filterHeight * filterWidth; if (needIm2col) { colShape = TensorShape({inputChannels / groups_, @@ -199,7 +202,7 @@ public: outputHeight, outputWidth}); - resizeBuffer(stepColHeight * stepColWidth * sizeof(real)); + resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); colData = reinterpret_cast(memory_->getBuf()); } @@ -209,20 +212,24 @@ public: (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; - int nStride = colWidth; - int kStride = colHeight; + int nStride = outputHeight * outputWidth; + int kStride = inputChannels / groups_ * filterHeight * filterWidth; for (size_t i = 0; i < batchSize; i++) { + filterData = 
inputs[1].data(); for (size_t g = 0; g < groups_; g++) { if (needIm2col) { real beta_ = beta; - for (size_t colHeightStart = 0; colHeightStart < colHeight; - colHeightStart += stepColHeight) { - for (size_t colWidthStart = 0; colWidthStart < colWidth; - colWidthStart += stepColWidth) { - int N = std::min(colWidth - colWidthStart, stepColWidth); - int K = std::min(colHeight - colHeightStart, stepColHeight); + for (size_t ic = 0; ic < inputChannels / groups_; + ic += channelSteps) { + int channels = std::min(inputChannels / groups_ - ic, channelSteps); + for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { + int height = std::min(outputHeight - oh, outputHeightSteps); + + int M = outputChannels / groups_; + int N = height * outputWidth; + int K = channels * filterHeight * filterWidth; // im2col - im2col(inputData + g * inputOffset, + im2col(inputData, imShape, colData, colShape, @@ -232,13 +239,12 @@ public: paddingW(), dilationH(), dilationW(), - colHeightStart, - K, - colWidthStart, + channels, + oh, + height, N); // gemm - int M = outputChannels / groups_; BlasGemm::compute( false, false, @@ -246,12 +252,12 @@ public: N, K, 1.0f, - filterData + g * filterOffset + colHeightStart, + filterData + ic * filterHeight * filterWidth, kStride, colData, N, beta_, - outputData + g * outputOffset + colWidthStart, + outputData + oh * outputWidth, nStride); } beta_ = 1.0; @@ -266,17 +272,18 @@ public: N, K, 1.0f, - filterData + g * filterOffset, + filterData, K, - inputData + g * inputOffset, + inputData, N, beta, - outputData + g * outputOffset, + outputData, N); } + inputData += inputOffset; + outputData += outputOffset; + filterData += filterOffset; } - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; } memory_.reset(); diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 36a9bcf84e4b14965c83627821b71d1c7c0da1b2..915119e291caaa223249cf8e37078723621517b0 100644 --- 
a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -111,39 +111,42 @@ public: int paddingWidth, int dilationHeight, int dilationWidth, - int colHeightStart, - int colHeightSize, - int colWidthStart, - int colWidthSize) { + int inputChannels, + int colOffset, + int colOutputHeight, + int colWidth) { int inputHeight = imShape[1]; int inputWidth = imShape[2]; int filterHeight = colShape[1]; int filterWidth = colShape[2]; int outputWidth = colShape[4]; - for (int colh = 0; colh < colHeightSize; colh++) { - int wOffset = (colHeightStart + colh) % filterWidth; - int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; - int c_im = (colHeightStart + colh) / filterWidth / filterHeight; - - for (int colw = 0; colw < colWidthSize; colw++) { - int h = (colWidthStart + colw) / outputWidth; - int w = (colWidthStart + colw) % outputWidth; - - int imRowIdx = h * strideHeight + hOffset * dilationHeight; - int imColIdx = w * strideWidth + wOffset * dilationWidth; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[colh * colWidthSize + colw] = static_cast(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[colh * colWidthSize + colw] = - imData[imRowIdx * inputWidth + imColIdx]; + for (int ic = 0; ic < inputChannels; ic++) { + for (int oh = 0; oh < colOutputHeight; oh++) { + T* dstData = colData + oh * outputWidth; + for (int fh = 0; fh < filterHeight; fh++) { + for (int fw = 0; fw < filterWidth; fw++) { + int imRowIdx = (oh + colOffset) * strideHeight + + fh * dilationHeight - paddingHeight; + if (imRowIdx < 0 || imRowIdx >= inputHeight) { + memset(dstData, 0, outputWidth * sizeof(T)); + } else { + for (int ow = 0; ow < outputWidth; ow++) { + int imColIdx = + ow * strideWidth + fw * dilationWidth - paddingWidth; + if (imColIdx < 0 || imColIdx >= inputWidth) { + dstData[ow] = T(0); + } 
else { + dstData[ow] = imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + dstData += colWidth; + } } } + colData += filterHeight * filterWidth * colWidth; + imData += inputHeight * inputWidth; } } }; diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 3ba866dcdd845403d52f7a85adfef08cbb11c305..fe44a8bf79005efb87c56f6a79f46421129bab22 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() { padding, dilation, dilation, + channels, 0, - height, - 0, - width); + outputHeight, + outputHeight * outputWidth); autotest::TensorCheckEqual(*output1, *output2); } diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt index 3f587fa790d1d980a35224b7a42dac1845fab99c..e8e0ee210718bb266383c967699b15418b18ea08 100644 --- a/paddle/inference/CMakeLists.txt +++ b/paddle/inference/CMakeLists.txt @@ -1,4 +1,4 @@ -set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init) +set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) cc_library(paddle_fluid_api SRCS io.cc @@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS inference_lib framework_lib memory_lib platform_lib string_lib gflags_lib glog_lib protobuf_lib eigen3_lib) -add_executable(example example.cc) -if(APPLE) - set(OPTIONAL_LINK_FLAGS) - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup") - endif() - target_link_libraries(example - -Wl,-force_load paddle_fluid - ${OPTIONAL_LINK_FLAGS} - ${PTOOLS_LIB}) -else() - target_link_libraries(example - -Wl,--start-group -Wl,--whole-archive paddle_fluid - -Wl,--no-whole-archive -Wl,--end-group - ${PTOOLS_LIB}) +if(WITH_TESTING) + add_subdirectory(tests/book) endif() diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc deleted file mode 100644 index 
ac2aedd88b61cde18e8fb9c05d34dd62daf62ab7..0000000000000000000000000000000000000000 --- a/paddle/inference/example.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "gflags/gflags.h" -#include "paddle/framework/init.h" -#include "paddle/framework/lod_tensor.h" -#include "paddle/inference/io.h" - -DEFINE_string(dirname, "", "Directory of the inference model."); - -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_dirname.empty()) { - // Example: - // ./example --dirname=recognize_digits_mlp.inference.model - std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl; - exit(1); - } - - // 1. Define place, executor, scope - auto place = paddle::platform::CPUPlace(); - paddle::framework::InitDevices(); - auto* executor = new paddle::framework::Executor(place); - auto* scope = new paddle::framework::Scope(); - - std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; - - // 2. Initialize the inference program - auto inference_program = paddle::inference::Load(*executor, *scope, dirname); - - // 3. Optional: perform optimization on the inference_program - - // 4. 
Get the feed_target_names and fetch_target_names - const std::vector& feed_target_names = - inference_program->GetFeedTargetNames(); - const std::vector& fetch_target_names = - inference_program->GetFetchTargetNames(); - - // 5. Generate input - paddle::framework::LoDTensor input; - srand(time(0)); - float* input_ptr = - input.mutable_data({1, 784}, paddle::platform::CPUPlace()); - for (int i = 0; i < 784; ++i) { - input_ptr[i] = rand() / (static_cast(RAND_MAX)); - } - - std::vector feeds; - feeds.push_back(input); - std::vector fetchs; - - // Set up maps for feed and fetch targets - std::map feed_targets; - std::map fetch_targets; - - // set_feed_variable - for (size_t i = 0; i < feed_target_names.size(); ++i) { - feed_targets[feed_target_names[i]] = &feeds[i]; - } - - // get_fetch_variable - fetchs.resize(fetch_target_names.size()); - for (size_t i = 0; i < fetch_target_names.size(); ++i) { - fetch_targets[fetch_target_names[i]] = &fetchs[i]; - } - - // Run the inference program - executor->Run(*inference_program, scope, feed_targets, fetch_targets); - - // Get outputs - for (size_t i = 0; i < fetchs.size(); ++i) { - auto dims_i = fetchs[i].dims(); - std::cout << "dims_i:"; - for (int j = 0; j < dims_i.size(); ++j) { - std::cout << " " << dims_i[j]; - } - std::cout << std::endl; - std::cout << "result:"; - float* output_ptr = fetchs[i].data(); - for (int j = 0; j < paddle::framework::product(dims_i); ++j) { - std::cout << " " << output_ptr[j]; - } - std::cout << std::endl; - } - - delete scope; - delete executor; - - return 0; -} diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc index f6d901381e781f161689f05315d4e0fe63610f84..60ad7af1c0a469beb6a07bf057a8647fcb98cca8 100644 --- a/paddle/inference/io.cc +++ b/paddle/inference/io.cc @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/inference/io.h" + #include +#include "paddle/framework/block_desc.h" +#include "paddle/framework/feed_fetch_type.h" namespace paddle { namespace inference { -const std::string kFeedOpType = "feed"; - bool IsParameter(const framework::VarDesc* var, const framework::ProgramDesc& main_program) { if (var->Persistable()) { @@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var, for (size_t i = 0; i < main_program.Size(); ++i) { const framework::BlockDesc& block = main_program.Block(i); for (auto* op : block.AllOps()) { - if (op->Type() == kFeedOpType) { + if (op->Type() == framework::kFeedOpType) { continue; } for (auto input_argument_name : op->InputArgumentNames()) { @@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor, framework::BlockDesc* load_block = load_program->MutableBlock(0); for (auto* var : global_block.AllVars()) { if (IsParameter(var, main_program)) { - LOG(INFO) << "parameter's name: " << var->Name(); + VLOG(3) << "parameter's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->Shape()); diff --git a/paddle/inference/io.h b/paddle/inference/io.h index dccb700e9565b3482152cfcf399b2369edf01c7b..962b6c4e20d30de3cc28eae1c8c5c33b3ab5f6ac 100644 --- a/paddle/inference/io.h +++ b/paddle/inference/io.h @@ -17,18 +17,13 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/framework/block_desc.h" #include "paddle/framework/executor.h" #include "paddle/framework/program_desc.h" #include "paddle/framework/scope.h" -#include "paddle/framework/var_desc.h" namespace paddle { namespace inference { -bool IsParameter(const framework::VarDesc* var, - const framework::ProgramDesc& main_program); - void LoadPersistables(framework::Executor& executor, framework::Scope& scope, const std::string& dirname, diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e987eb0240301c58cfb74c9e995d3b525130125 --- /dev/null +++ b/paddle/inference/tests/book/CMakeLists.txt @@ -0,0 +1,7 @@ +set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) +cc_test(test_inference_recognize_digits_mlp + SRCS test_inference_recognize_digits.cc + DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model) +set_tests_properties(test_inference_recognize_digits_mlp + PROPERTIES DEPENDS test_recognize_digits) diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..26dc2aee04261d9a1fd29b4d75bfacc7870c09d8 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/inference/io.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +template +void TestInference(const std::string& dirname, + const std::vector& cpu_feeds, + std::vector& cpu_fetchs) { + // 1. Define place, executor and scope + auto place = Place(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. Initialize the inference_program and load all parameters from file + auto inference_program = paddle::inference::Load(executor, *scope, dirname); + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + // 4. Prepare inputs: set up maps for feed targets + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + + // 5. Define Tensor to get the outputs: set up maps for fetch targets + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + + // 6. 
Run the inference program + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + + delete scope; +} + +TEST(inference, recognize_digits) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + srand(time(0)); + float* input_ptr = + input.mutable_data({1, 28, 28}, paddle::platform::CPUPlace()); + for (int i = 0; i < 784; ++i) { + input_ptr[i] = rand() / (static_cast(RAND_MAX)); + } + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference( + dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference( + dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + EXPECT_EQ(output1.dims(), output2.dims()); + EXPECT_EQ(output1.numel(), output2.numel()); + + float err = 1E-3; + int count = 0; + for (int64_t i = 0; i < output1.numel(); ++i) { + if (fabs(output1.data()[i] - output2.data()[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different elements."; +#endif +} diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 1ec4336cabbc7d3073b7638b7484bf61e83a2dc5..cc86b12be08ba987f9682ebf3fda56c2f07fb576 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, CHECK_EQ(channels * outLength, maskMatP->getWidth()); } - /* initialize the data_ */ - for (size_t i = 
0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - outData[i * outStride + j] = -(real)FLT_MAX; - } - } - /* pool max one by one */ for (size_t n = 0; n < num; ++n) { // frame by frame if (!isContiguous()) { @@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); + int hend = hstart + sizeY; + hstart = hstart < 0 ? 0 : hstart; + hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; for (size_t pw = 0; pw < outputW; ++pw) { int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); + int wend = wstart + sizeX; + wstart = wstart < 0 ? 0 : wstart; + wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; if (maskData == NULL) { + real tmp = -(real)FLT_MAX; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - outData[ph * outputW + pw] = std::max( - outData[ph * outputW + pw], inputData[h * imgSizeW + w]); + tmp = tmp < inputData[h * imgSizeW + w] + ? 
inputData[h * imgSizeW + w] + : tmp; } } + outData[ph * outputW + pw] = tmp; } else { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 48cf5816cce4bb5ee8e66e72c5b1acea8535ab10..000c2089c176adf8d845a56a1f98528734f47ea1 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) + op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) else() - set(DEPS_OPS ${DEPS_OPS} send_op recv_op) + set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) endif() op_library(cond_op DEPS framework_proto tensor net_op) @@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor) # Regist multiple Kernel to pybind if (WITH_GPU) -op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col) + +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS + vol2col depthwise_conv) + op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc @@ -173,6 +178,8 @@ endif() # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) +op_library(save_combine_op DEPS lod_tensor) 
+op_library(load_combine_op DEPS lod_tensor) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) @@ -192,3 +199,4 @@ if(WITH_GPU) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 4e579387924a5b0499f29609bc6b1322030a3c0d..00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -82,7 +82,7 @@ struct SparseAdagradFunctor { math::scatter::MergeAdd merge_func; auto grad_merge = merge_func(context, grad); auto* grad_merge_data = grad_merge.mutable_value()->template data(); - auto& merge_rows = grad_merge.rows(); + framework::Vector merge_rows(grad_merge.rows()); // 2. m += g_m * g_m math::scatter::Mul sqare_func; auto grad_square = sqare_func(context, grad_merge, grad_merge); @@ -101,8 +101,8 @@ struct SparseAdagradFunctor { SparseAdagradFunctorKernel< T, 256><<(context) - .stream()>>>(grad_merge_data, grad_merge.rows().data(), - lr, param_data, moment_data, grad_width, + .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr, + param_data, moment_data, grad_width, epsilon); } }; diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 9cc34bdded780e61e8700eb4fa4a295c84fb48bc..bf536687d398b8342e6ae76a07c11e5fe47483e0 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), grad); auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - auto* rows = grad_merge.rows().data(); + int64_t* rows = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = grad_merge.mutable_rows()->cuda_data(); + } else 
{ + rows = grad_merge.mutable_rows()->data(); + } auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc index 83c8778fe4cec4d9d80de691e117a39fdd92f494..1e6fa2091de25218e2bdafeb740ce884234638a5 100644 --- a/paddle/operators/bipartite_match_op.cc +++ b/paddle/operators/bipartite_match_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("DistMat"), "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); auto dims = ctx->GetInputDim("DistMat"); PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); ctx->SetOutputDim("ColToRowMatchIndices", dims); - ctx->SetOutputDim("ColToRowMatchDis", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); } }; @@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* dist_mat = context.Input("DistMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); - auto* match_dist = context.Output("ColToRowMatchDis"); + auto* match_dist = context.Output("ColToRowMatchDist"); auto& dev_ctx = context.device_context(); @@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "Otherwise, it means 
B[j] is matched to row " "ColToRowMatchIndices[i][j] in i-th instance. The row number of " "i-th instance is saved in ColToRowMatchIndices[i][j]."); - AddOutput("ColToRowMatchDis", + AddOutput("ColToRowMatchDist", "(Tensor) A 2-D Tensor with shape [N, M] in float type. " "N is batch size. If ColToRowMatchIndices[i][j] is -1, " - "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchIndices[i][j] = d, and the row offsets of each " "instance are called LoD. Then " - "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); AddComment(R"DOC( This operator is a greedy bipartite matching algorithm, which is used to obtain the matching with the maximum distance based on the input diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..539813d4858b8faef386047f9ef64aa232aefca1 --- /dev/null +++ b/paddle/operators/box_coder_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/box_coder_op.h" + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBoxVar must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + + GetBoxCodeType(ctx->Attrs().Get("code_type")); + + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they 
are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance."); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the box if the input " + "is image feature map, they are close to the origin of the coordinate " + "system. [xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddOutput( + "OutputBox", + "(LoDTensor or Tensor) " + "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] " + "representing the result of N target boxes encoded/decoded with " + "M Prior boxes and variances."); + + AddComment(R"DOC( +Bounding Box Coder Operator. +Encode/Decode the target bounding box with the priorbox information. +The Encoding schema described below: +ox = (tx - px) / pw / pxv +oy = (ty - py) / ph / pyv +ow = log(abs(tw / pw)) / pwv +oh = log(abs(th / ph)) / phv +The Decoding schema described below: +ox = (pw * pxv * tx * + px) - tw / 2 +oy = (ph * pyv * ty * + py) - th / 2 +ow = exp(pwv * tw) * pw + tw / 2 +oh = exp(phv * th) * ph + th / 2 +where tx, ty, tw, th denote the target box's center coordinates, width and +height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) +center coordinates, width and height. 
pxv, pyv, pwv, phv denote the variance +of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, +width and height. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); +REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..98bd93457fafb49f2af5e1ff258fbfa9f9985600 --- /dev/null +++ b/paddle/operators/box_coder_op.cu @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/box_coder_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = + target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; + T target_box_height = + target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + + output[idx * len] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[col_idx * len]; + output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / + prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / + prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / + prior_box_var_data[col_idx * len + 3]; + } +} + +template +__global__ void DecodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = 
threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + target_box_data[row_idx * len + 2]) * + prior_box_width; + T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_data[row_idx * len + 3]) * + prior_box_height; + T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_data[row_idx * len] * + prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_data[row_idx * len + 1] * + prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = target_box_center_x + target_box_width / 2; + output[idx * len + 3] = target_box_center_y + target_box_height / 2; + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = 
prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T* prior_box_data = prior_box->data(); + const T* prior_box_var_data = prior_box_var->data(); + const T* target_box_data = target_box->data(); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..086251f6e066f082743f332ce72918c6e572ce19 --- /dev/null +++ b/paddle/operators/box_coder_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string& type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - 
prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } + } + void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[i * len] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[i * len + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[i * len + 2]) * + prior_box_width; + T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[i * len + 3]) * + prior_box_height; + + size_t offset = i * col * len + j * len; + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } + } + + void 
Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + T* output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index 9c655d6c0d8e5fe04ee6d85f7e9d9da68105230c..b275fd75b3512343825170fc38565dd27f7f1c75 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -54,7 +54,15 @@ class CompareOpKernel public: void Compute(const framework::ExecutionContext& context) const override { using T = typename Functor::ELEM_TYPE; - ElementwiseComputeEx(context); + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + z); } }; diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index d6882b275b22b9a2a2b6ff8cfb53a3462bbdbefe..cef7ddd5fe7e12a374fb9cc79211bd2eb97c6c52 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( 
namespace ops = paddle::operators; REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, ops::ConvOpGrad); + +// depthwise convolution op +REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); +// depthwise conv kernel +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d, + ops::GemmConvKernel, + ops::GemmConvKernel); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + REGISTER_OP_CPU_KERNEL( conv2d, ops::GemmConvKernel, ops::GemmConvKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 4f942444f3eb5584f07399b8d1b4d6a5087496d4..d0bd40ee95dab3b2589742b8a0c3a5de7918b5b9 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -16,6 +16,16 @@ limitations under the License. */ namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvKernel, + ops::DepthwiseConvKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad, + ops::DepthwiseConvGradKernel, + ops::DepthwiseConvGradKernel); + REGISTER_OP_CUDA_KERNEL( conv2d, ops::GemmConvKernel, ops::GemmConvKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 5a8933e7915960f9fcbe92ae73c4f37b3b69ecaf..3c1d0e9c1c4bb964bfaebc3bfed115548bd53f97 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/depthwise_conv.h" #include "paddle/operators/math/im2col.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/vol2col.h" @@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel { } } }; + +template +class DepthwiseConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_EQ( + output->dims()[1] % input->dims()[1], 0, + "The output channels must be a multiple of the input channels"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::DepthwiseConvFunctor depthwiseConv; + + auto& dev_ctx = context.template device_context(); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + } +}; + +template +class DepthwiseConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + 
math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, input_grad); + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, + filter_grad); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu index 45635f16745346b08f7e31db2f25905bdbc3aeeb..2a970cd9fa965b4126356eaa1519068f9c7a7f34 100644 --- a/paddle/operators/ctc_align_op.cu +++ b/paddle/operators/ctc_align_op.cu @@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { auto stream = ctx.cuda_device_context().stream(); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( - num_tokens, tokens, num_seq, input_lod[level].data(), blank, + num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank, merge_repeated, dev_out_lod0_ptr, output_data); // set output lod - thrust::host_vector host_out_lod0(dev_out_lod0.begin(), - dev_out_lod0.end()); + std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); framework::LoD out_lod; out_lod.push_back(host_out_lod0); output->set_lod(out_lod); diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 35cb18797ff66cb87a6658e73ce02b0bfae29baa..5274aa204e6629c9c5ea850c433e0948c89015bd 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { "'dropout_prob' must be between 0.0 and 1.0."); }); AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. 
NOTE: DO NOT set this flag to true in " + "training. Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(false); AddAttr("seed", "Dropout random seed.").SetDefault(0); AddComment(R"DOC( diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index c56930336e865079f1b96df0f35b0a051fe63a27..84d78445a4fa340ba3c066bb48b96b2a890db652 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); int size = framework::product(mask->dims()); - int seed = context.Attr("seed"); + + std::random_device rnd; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + thrust::counting_iterator index_sequence_begin(0); thrust::transform(index_sequence_begin, index_sequence_begin + size, thrust::device_ptr(mask_data), diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index c90b8d277eb78048c001d36a367287146b51c636..46e5dbc64ff9ad3d04a9c1c07f4226932f661baf 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel { if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); - int seed = context.Attr("seed"); + + // NOTE: fixed seed should only be used in unittest or for debug. + // Guarantee to use random seed in training. + std::random_device rnd; std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); engine.seed(seed); + std::uniform_real_distribution dist(0, 1); size_t size = framework::product(mask->dims()); for (size_t i = 0; i < size; ++i) { diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index a8389429f26c17ceab1db22175c90888546ead6f..c32288d6984f126f2374a13973541f4f663b25a4 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -28,7 +28,14 @@ template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -92,9 +99,19 @@ template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseAddBroadCastGradFunctor, - ElementwiseAddBroadCast2GradFunctor>(ctx); + ElementwiseAddBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index ef26cb6c914f50ded07cc9d0d8de3f49f2151129..07ebade31ff5b3d5c89156e28ff5fa0670a9a842 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -28,7 +28,14 @@ template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -111,9 +118,19 @@ template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseDivBroadCastGradFunctor, - ElementwiseDivBroadCast2GradFunctor>(ctx); + ElementwiseDivBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h index 255728e8e620665a7de225b228c19d6c510da1c8..717e45ab31db9b9a6629fb33e17654dbf986d8c5 100644 --- a/paddle/operators/elementwise_max_op.h +++ b/paddle/operators/elementwise_max_op.h @@ -28,7 +28,14 @@ template class ElementwiseMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + 
auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMaxBroadCastGradFunctor, - ElementwiseMaxBroadCast2GradFunctor>(ctx); + ElementwiseMaxBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h index e6627a0f1bb468c8e4661b83489cb964b72dddb0..0de9a91c52b0ab82cd62604de318ce68e56b767d 100644 --- a/paddle/operators/elementwise_min_op.h +++ b/paddle/operators/elementwise_min_op.h @@ -28,7 +28,14 @@ template class ElementwiseMinKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMinGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMinBroadCastGradFunctor, - ElementwiseMinBroadCast2GradFunctor>(ctx); + ElementwiseMinBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 
4b86b00b5a095ae898f9ce0c17cde2cc91060ba9..ae7a71e0244dfb8ad3e55683ac081f92bc36bea5 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -27,7 +27,14 @@ template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -110,9 +117,19 @@ template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, ElementwiseMulBroadCastGradFunctor, - ElementwiseMulBroadCast2GradFunctor>(ctx); + ElementwiseMulBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index d749b8e8757d0d433be05876779ccc22b95ca80b..213fe1f5a818873e8b666464cb112637261c598c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV); template -void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); +void ElementwiseGradCompute(const 
framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, int axis, + framework::Tensor* dx, framework::Tensor* dy) { auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { dx->mutable_data(ctx.GetPlace()); } @@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { x_dims = framework::make_ddim(extended_dims); } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); int pre, n, post; @@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { template -void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); +void ElementwiseComputeEx(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, int axis, + framework::Tensor* z) { TransformFunctor functor( x, y, z, ctx.template device_context(), Functor()); @@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { x_dims = framework::make_ddim(extended_dims); } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); diff --git a/paddle/operators/elementwise_pow_op.cc b/paddle/operators/elementwise_pow_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5293cc7dd34ccee860c50e964516da9b4d42d29c --- /dev/null +++ b/paddle/operators/elementwise_pow_op.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/elementwise_pow_op.h" +#include "paddle/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwisePowOpMaker : public ElementwiseOpMaker { + public: + ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Pow", "Out = X ^ Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp, + ops::ElementwisePowOpMaker); +REGISTER_OP_CPU_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/operators/elementwise_pow_op.cu b/paddle/operators/elementwise_pow_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..643c978e635bc8e9671b47774c2eac5b713f59c2 --- /dev/null +++ b/paddle/operators/elementwise_pow_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/elementwise_pow_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h new file mode 100644 index 0000000000000000000000000000000000000000..874fd3f09f2afaccfbfca75799cc3448f7393b03 --- /dev/null +++ b/paddle/operators/elementwise_pow_op.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct PowFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); } +}; + +template +class ElementwisePowKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index a2aca793026189ec87e00b52d7c351689f870400..c2749a8e6ba689233dab4f3c72de10bf01f39fab 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -27,7 +27,14 @@ template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseComputeEx, DeviceContext, T>(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); } }; @@ -93,9 +100,19 @@ template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); ElementwiseGradCompute, 
ElementwiseSubBroadCastGradFunctor, - ElementwiseSubBroadCast2GradFunctor>(ctx); + ElementwiseSubBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); } }; diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index d738e1850ca4f658f4fca5c9bf643c44f676cce9..789d01e0022b5c36957f295265a9dc42649b310f 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -52,7 +52,11 @@ class FeedOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::Copy(feed_item, place, dev_ctx, out_item); + if (platform::is_same_place(feed_item.place(), place)) { + out_item->ShareDataWith(feed_item); + } else { + framework::Copy(feed_item, place, dev_ctx, out_item); + } out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index b1957fb9ce6add8628cb206abf2c569d3f615c85..a08bd4233b02d021aaa64bafe4b855f11a60d338 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -30,11 +30,12 @@ using Tensor = framework::Tensor; template inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, const size_t* index, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index, *dst, indexed_src); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); } template @@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel { gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); Tensor ordered_h0; - const size_t* order = batch_gate->lod()[2].data(); + + framework::Vector order(batch_gate->lod()[2]); + if (h0) { // Since the batch computing for GRU reorders the input sequences // according to their length. 
The initialized cell state also needs @@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel { zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); Tensor ordered_h0, ordered_h0_grad; - const size_t* order = batch_gate->lod()[2].data(); + + framework::Vector order(batch_gate->lod()[2]); + if (h0) { ReorderInitState(dev_ctx, *h0, order, &ordered_h0, true); diff --git a/paddle/operators/label_smooth_op.cc b/paddle/operators/label_smooth_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c89082f44b360cbd171eccb212674040b8688a46 --- /dev/null +++ b/paddle/operators/label_smooth_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/label_smooth_op.h" + +namespace paddle { +namespace operators { + +class LabelSmoothOp : public framework::OperatorWithKernel { + public: + LabelSmoothOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LabelSmoothOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LabelSmoothOp should not be null."); + auto in_dims = ctx->GetInputDim("X"); + if (ctx->HasInput("PriorDist")) { + auto noise_dims = ctx->GetInputDim("PriorDist"); + auto noise_numel = paddle::framework::product(noise_dims); + PADDLE_ENFORCE( + in_dims[1] == noise_numel, + "The number of elements in Input(PriorDist) must be equal to the " + "dimension of each label."); + } + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", in_dims); + } +}; + +class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) The input labels of LabelSmooth operator. This " + "input can be batched labels in one-hot encoding or output from " + "softmax, with shape [N x K], where N is the batch size and K is " + "the number of classes"); + AddInput("PriorDist", + "(Tensor, optional)" + "The prior distribution to be added to the smoothed label. It is " + "fixed during training and the number of elements should be equal " + "to the dimension K of each label. Default is uniform " + "distribution and each element will be set to 1/K if not provided " + "in input.") + .AsDispensable(); + AddOutput("Out", + "(loDTensor) The smoothed label of LabelSmooth operator. 
It has" + "the same shape and LoD with the Input(LoDTensor)."); + AddAttr("epsilon", + "(float, default 0.0f)" + "The smoothing parameter of LabelSmooth operator.") + .SetDefault(0.0f); + AddComment(R"DOC( +LabelSmooth Operator. + +Label smoothing is a mechanism to regularize the classifier layer. In machine +learning, optimizing the log-likelihood of the correct label directly may +cause two problems. First, it may result in overfitting: if the model learns +to assign full probability to the ground-truth label for each training example, +it is not guaranteed to generalize. Second, it encourages the differences +between the largest logit and all others to become large, reducing the ability +of the model to adapt. Label smoothing is proposed to encourage the model to +be less confident, which replaces the ground-truth label $y$ with the weighted +sum of itself and some fixed distribution $\mu$, i.e. + +$$ + \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu, +$$ + +where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and +$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for +$\mu$. This change in the ground-truth label is called label-smoothing +regularization or LSR. + +See more details about label smoothing in https://arxiv.org/abs/1512.00567. 
+ +)DOC"); + } +}; + +class LabelSmoothGradOp : public framework::OperatorWithKernel { + public: + LabelSmoothGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, + label_smooth_grad, ops::LabelSmoothGradOp); +REGISTER_OP_CPU_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CPU_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/operators/label_smooth_op.cu b/paddle/operators/label_smooth_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a0cec12bc58a56e4b0c3bd6fbc6c4754ef81fa4 --- /dev/null +++ b/paddle/operators/label_smooth_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/label_smooth_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CUDA_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/operators/label_smooth_op.h b/paddle/operators/label_smooth_op.h new file mode 100644 index 0000000000000000000000000000000000000000..87bc9f793e3b4e249142710243c45d51f3a913b2 --- /dev/null +++ b/paddle/operators/label_smooth_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LabelSmoothKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* in_t = ctx.Input("X"); + auto* dist_t = ctx.Input("PriorDist"); + auto label_dim = in_t->dims()[1]; + out_t->mutable_data(ctx.GetPlace()); + + auto epsilon = ctx.Attr("epsilon"); + auto out = framework::EigenVector::Flatten(*out_t); + auto in = framework::EigenVector::Flatten(*in_t); + auto& dev = *ctx.template device_context().eigen_device(); + if (dist_t) { + auto dist = framework::EigenVector::Flatten(*dist_t); + out.device(dev) = + static_cast(1 - epsilon) * in + + epsilon * dist.broadcast(Eigen::DSizes(in_t->numel())); + } else { + out.device(dev) = static_cast(1 - epsilon) * in + + static_cast(epsilon / label_dim); + } + } +}; + +template +class LabelSmoothGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* d_in_t = ctx.Output(framework::GradVarName("X")); + d_in_t->mutable_data(ctx.GetPlace()); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto d_in = framework::EigenVector::Flatten(*d_in_t); + + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c6d2ae4d05becaeed34d66cad398cc90f9d3ece --- /dev/null +++ b/paddle/operators/layer_norm_op.cc @@ -0,0 +1,370 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/layer_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenMatrixMapRowMajor = Eigen::Map< + Eigen::Matrix>; +template +using ConstEigenMatrixMapRowMajor = Eigen::Map< + const Eigen::Matrix>; + +class LayerNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mean"), + "Output(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Variance"), + "Output(Variance) of LayerNormOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); + PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), + "'begin_norm_axis' must be less than the rank of X."); + + auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + if (ctx->HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); 
+ } + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); + } + + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + ctx->SetOutputDim("Mean", {left}); + ctx->SetOutputDim("Variance", {left}); + ctx->ShareLoD("X", "Y"); + } +}; + +class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor."); + AddInput("Scale", + "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); + AddInput("Bias", + "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); + AddOutput("Y", "(LoDTensor) Result after normalization."); + AddOutput("Mean", "(Tensor) Mean of the current mini batch.") + .AsIntermediate(); + AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + .AsIntermediate(); + + AddAttr("epsilon", + "(float, default 1e-5) Constant for " + "numerical stability") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("begin_norm_axis", + "(int default:1), the " + "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "normalized. `begin_norm_axis` splits the tensor(`X`) to a " + "matrix [N,H].") + .SetDefault(1) + .AddCustomChecker([](const int &begin_norm_axis) { + PADDLE_ENFORCE_GT(begin_norm_axis, 0, + "'begin_norm_axis' should be greater than zero."); + }); + + AddComment(R"DOC( +Layer Normalization. + +Layer Norm has been implemented as discussed in the paper: +https://arxiv.org/abs/1607.06450 +... 
+)DOC"); + } +}; + +template +class LayerNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + auto *output = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + output->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + + auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + + auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); + auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); + auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); + + auto squre = [](T ele) { return ele * ele; }; + auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; + + mean_map = input_map.rowwise().mean(); + var_map = (input_map - mean_map.replicate(1, right)) + .unaryExpr(squre) + .rowwise() + .mean() + .unaryExpr(add_epslion); + + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + // TODO(zcd): Some thinking about output_map, is it appropriate that + // `output_map` and `input_map` point to the same memory. 
+ auto inv_std = var_map.unaryExpr(inv_std_func); + if (scale && bias) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + + bias_map.replicate(left, 1); + } else if (scale) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)); + } else if (bias) { + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + + bias_map.replicate(left, 1); + } else { + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)); + } + } +}; + +class LayerNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) of LayerNormOp should not be null."); + + // check output + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), + 
ctx->GetInputDim("Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +template +class LayerNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *mean = ctx.Input("Mean"); + const auto *var = ctx.Input("Variance"); + const auto *scale = ctx.Input("Scale"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + + const auto &x_dims = x->dims(); + + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); + auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); + auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); + d_bias_map = d_y_map.colwise().sum(); + } + if (d_scale) 
{ + d_scale->mutable_data(ctx.GetPlace()); + auto d_scale_map = + EigenMatrixMapRowMajor(d_scale->data(), 1, right); + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + // There are two equation to compute d_scale. One uses "Y" and the other + // does not use "Y" + d_scale_map = + ((x_map - mean_map.replicate(1, right)) + .cwiseProduct( + var_map.unaryExpr(inv_std_func).replicate(1, right)) + .cwiseProduct(d_y_map)) + .colwise() + .sum(); + } + + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); + auto triple_product_func = [](T ele) { return ele * ele * ele; }; + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + // TODO(zcd): these code can be refined + if (d_scale) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + // dy_dx + auto dx_end = var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)); + // dy_dmean_dx + auto dx_mean = (T(-1.0) / right) * + var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)) + .rowwise() + .sum() + .replicate(1, right); + // dy_var_dx + auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = var_map.unaryExpr(inv_std_func) + .unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); + + d_x_map = dx_end + dx_mean + dx_var; + } else { + // dy_dx + auto dx_end = var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map); + // dy_dmean_dx + auto dx_mean = (T(-1.0) / right) * + var_map.unaryExpr(inv_std_func) + .replicate(1, right) + .cwiseProduct(d_y_map) + .rowwise() + .sum() + .replicate(1, right); + // dy_var_dx + 
auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = var_map.unaryExpr(inv_std_func) + .unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); + + d_x_map = dx_end + dx_mean + dx_var; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, + layer_norm_grad, ops::LayerNormGradOp); +REGISTER_OP_CPU_KERNEL( + layer_norm, + ops::LayerNormKernel); +REGISTER_OP_CPU_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bca35b91e6f52d35dee14aac9d080b52914942e3 --- /dev/null +++ b/paddle/operators/layer_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LayerNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class LayerNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/listen_and_serv_op.cc b/paddle/operators/listen_and_serv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..099f6b23736adcc2a6e9c27dca297178687ae785 --- /dev/null +++ b/paddle/operators/listen_and_serv_op.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include + +#include "paddle/framework/executor.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/proto_desc.h" +#include "paddle/operators/detail/grpc_server.h" +#include "paddle/operators/detail/sendrecvop_utils.h" +#include "paddle/operators/detail/simple_block_queue.h" +#include "paddle/string/printf.h" + +namespace paddle { +namespace operators { + +constexpr char kOptimizeBlock[] = "OptimizeBlock"; + +void RunServer(std::shared_ptr service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; +} + +static void CreateTensorFromMessageType(framework::Variable *var, + sendrecv::VarType var_type) { + if (var_type == sendrecv::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else { + PADDLE_THROW( + "VariableMessage type %d is not in " + "[LoDTensor, SelectedRows]", + var_type); + } +} + +class ListenAndServOp : public framework::OperatorBase { + public: + ListenAndServOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + std::string endpoint = Attr("endpoint"); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + server_thread_.reset(new std::thread(RunServer, rpc_service_)); + } + } + + void Stop() override { + detail::MessageWithName term_msg; + term_msg.first = LISTEN_TERMINATE_MESSAGE; + rpc_service_->Push(term_msg); + rpc_service_->ShutDown(); + server_thread_->join(); + } + + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++); + } + 
+ void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Scope &recv_scope = scope.NewScope(); + + // FIXME(Yancey1989): initialize rpc server with lazy mode. + rpc_service_->SetScope(&recv_scope); + rpc_service_->SetDevCtx(&dev_ctx); + auto param_list = Attr>("ParamList"); + auto grad_list = Attr>("GradList"); + auto fan_in = Attr("Fanin"); + + auto *block = Attr(kOptimizeBlock); + auto *program = block->Program(); + framework::Executor executor(dev_place); + + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + bool exit_flag = false; + while (!exit_flag) { + // Get from multiple trainers, we don't care about the order in which + // the gradients arrives, just add suffix 0~n and merge the gradient. + rpc_service_->SetCond(0); + size_t recv_var_cnt = 0; + int batch_barrier = 0; + while (batch_barrier != fan_in) { + const detail::MessageWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { + LOG(INFO) << "received terminate message and exit"; + exit_flag = true; + break; + } else if (grad_var_name == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "recv batch barrier message"; + batch_barrier++; + continue; + } else { + // receive a variable + recv_var_cnt++; + auto it = + std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad has no paired param:" << grad_var_name; + } + VLOG(3) << "received grad: " << grad_var_name + << " updating param: " << param_var_name; + + if (fan_in > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + auto *var = recv_scope.FindVar(grad_var_name); + if (var == nullptr) { + LOG(ERROR) << "Can not find server side var: " 
<< grad_var_name; + PADDLE_THROW("Can not find server side var"); + } + detail::DeserializeFromMessage(v.second, dev_ctx, var); + } + } + VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier."; + // TODO(Yancey1989): merge SelectedRows variables here + if (exit_flag) { + rpc_service_->ShutDown(); + } + + try { + executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->SetCond(1); + rpc_service_->WaitClientGet(recv_var_cnt); + grads_counter_.clear(); + } // while(true) + } + + protected: + std::shared_ptr rpc_service_; + std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; +}; + +class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC( +ListenAndServ operator + +This operator will start a RPC server which can receive variables +from send_op and send back variables to recv_op. 
+)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr(kOptimizeBlock, + "BlockID to run on server side."); + AddAttr>( + "ParamList", "type list of string", + "grad->param name mapping to find which parameters to optimize.") + .SetDefault({}); + AddAttr>( + "GradList", "type list of string", + "grad->param name mapping to find which parameters to optimize.") + .SetDefault({}); + AddAttr("Fanin", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp, + ops::ListenAndServOpMaker); diff --git a/paddle/operators/load_combine_op.cc b/paddle/operators/load_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4be793d7bf1f346c011842c57fb5b5179a697d6 --- /dev/null +++ b/paddle/operators/load_combine_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadCombineOp : public framework::OperatorBase { + public: + LoadCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + + auto out_var_names = Outputs("Out"); + PADDLE_ENFORCE_GT( + static_cast(out_var_names.size()), 0, + "The number of output variables should be greater than 0."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < out_var_names.size(); i++) { + auto *out_var = scope.FindVar(out_var_names[i]); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_names[i]); + + auto *tensor = out_var->GetMutable(); + + // Error checking + PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", + filename); + + // Get data from fin to tensor + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } + } +}; + +class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput( + "Out", + "(vector) The output 
LoDTensors that will be read from the input file.") + .AsDuplicable(); + AddAttr("file_path", + "(string) " + "LoDTensors will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +LoadCombine Operator. + +LoadCombine operator loads LoDTensor variables from a file. The file should +contain one or more LoDTensors serialized using the SaveCombine operator. The +LoadCombine operator applies a deserialization strategy to appropriately load +the LodTensors, and this strategy complements the serialization strategy used +in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled +with the SaveCombine operator, and can only deserialize one or more LoDTensors +that were saved using the SaveCombine operator. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load_combine, ops::LoadCombineOp, + ops::LoadCombineOpProtoMaker); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index d97390fa1c53fa0bdf16ab34cb209b994621f83c..07372808bbf078bd2e9b0bb5782b95a046253f46 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { new_rows.resize(ids_dim[0]); auto gpu_place = boost::get(context.GetPlace()); - memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, - ids_dim[0] * sizeof(int64_t), stream); + memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place, + ids_data, ids_dim[0] * sizeof(int64_t), stream); d_table->set_rows(new_rows); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index c57ee414dc5b3417549c8ac3a7fd57a9c8f452df..72e95b75e29c88c5944607ceaa40435bac7a745c 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -27,11 +27,12 @@ using Tensor = framework::Tensor; template inline void 
ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, const size_t* index, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index, *dst, indexed_src); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); } template @@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel { } lstm_value.prev_state_value = nullptr; Tensor ordered_c0; - const size_t* order = batch_gate->lod()[2].data(); + + framework::Vector order(batch_gate->lod()[2]); + if (cell_t0) { // Since the batch computing for LSTM reorders the input sequence // according to their length. The initialized cell state also needs @@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel { // ordered_h0_g/c0_g is the reordered gradient of hidden/cell // initialization. Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - const size_t* order = batch_gate->lod()[2].data(); + framework::Vector order(batch_gate->lod()[2]); + if (c0) { ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h index ee82d5c10a5421b181e525f49a263d4808ede62f..e064a155dfadd8104fa80727a962cb2e24ade29f 100644 --- a/paddle/operators/lstmp_op.h +++ b/paddle/operators/lstmp_op.h @@ -34,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix; template inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, const size_t* index, + const framework::Tensor& src, + framework::Vector index, framework::Tensor* dst, bool indexed_src) { math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); @@ -109,7 +110,9 @@ class LSTMPKernel : public framework::OpKernel { } lstmp_value.prev_state_value = nullptr; Tensor ordered_c0; - const size_t* order = batch_gate->lod()[2].data(); + + framework::Vector 
order(batch_gate->lod()[2]); + if (cell_t0) { // Since the batch computing for LSTMP reorders the input sequence // according to their length. The initialized cell state also needs @@ -275,7 +278,9 @@ class LSTMPGradKernel : public framework::OpKernel { // ordered_h0_g/c0_g is the reordered gradient of hidden/cell // initialization. Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - const size_t* order = batch_gate->lod()[2].data(); + + framework::Vector order(batch_gate->lod()[2]); + if (c0) { ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 28c5aec1996ad04a6cb551ac68c14b613d16858e..768106fadf355ea6fb148491e232dc0ef1453a75 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_GPU) nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) + nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu new file mode 100644 index 0000000000000000000000000000000000000000..b212e78208355866516211d276cb8046623babd7 --- /dev/null +++ b/paddle/operators/math/depthwise_conv.cu @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/depthwise_conv.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +// A Cuda kernel to compute the depthwise convolution forward pass +// in NCHW format. +template +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int 
h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; + } + } + output_data[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 
0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; + } + } + } + input_grad_data[index] += value; + } +} + +// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +template +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width 
+ w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); + } + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class DepthwiseConvFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* filter_data = filter.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); + } +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int 
output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* filter_data = filter.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter_grad->dims()[2]; + const int ksize_width = filter_grad->dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* 
output_grad_data = output_grad.data(); + T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); + } +}; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/depthwise_conv.h b/paddle/operators/math/depthwise_conv.h new file mode 100644 index 0000000000000000000000000000000000000000..4708920bb42db90d84fda0c6a1039991cb79e80d --- /dev/null +++ b/paddle/operators/math/depthwise_conv.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index 0ee456f9bc61436bd0f2f8ef20dd1654e7e56d56..acdd87cb3550bc5f3891aed6fefd4301a3395f9f 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -31,7 +31,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ(in1_height, input2.height()); output->set_height(in1_height); - auto& in1_rows = input1.rows(); + framework::Vector in1_rows(input1.rows()); auto& in2_rows = input2.rows(); std::vector out_rows; out_rows.reserve(in1_rows.size() + in2_rows.size()); @@ -108,7 +108,7 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); auto& in1_value = input1.value(); 
- auto& in1_rows = input1.rows(); + framework::Vector in1_rows(input1.rows()); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); @@ -126,7 +126,7 @@ struct SelectedRowsAddTensor { dim3 grid(1, in1_rows.size()); SelectedRowsAddTensorKernel< T, block_size><<>>( - in1_data, in1_rows.data(), out_data, in1_row_numel); + in1_data, in1_rows.cuda_data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -146,7 +146,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - auto& in1_rows = input1.rows(); + framework::Vector in1_rows(input1.rows()); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); @@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); + framework::Vector in1_rows(input1.rows()); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); @@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor { dim3 grid(1, in1_rows.size()); SelectedRowsAddToTensorKernel< T, block_size><<>>( - in1_data, in1_rows.data(), in2_data, in1_row_numel); + in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel); } }; @@ -257,7 +257,7 @@ struct MergeAdd { framework::SelectedRows operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input) { framework::SelectedRows out; - auto input_rows = input.rows(); + framework::Vector input_rows(input.rows()); std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); @@ -283,9 +283,9 @@ struct MergeAdd { MergeAddKernel< T, 256><<(context) - .stream()>>>(input_data, input.rows().data(), out_data, - out.rows().data(), out.rows().size(), - 
input_width); + .stream()>>>(input_data, input_rows.cuda_data(), out_data, + out.mutable_rows()->cuda_data(), + out.rows().size(), input_width); return out; } }; @@ -370,8 +370,8 @@ struct UpdateToTensor { dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(1, in1_rows.size()); UpdateToTensorKernel<<< - grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op, - in2_data, in1_row_numel); + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), + op, in2_data, in1_row_numel); } }; } // namespace scatter diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index e459a42ca251a9fc79f745f48a118ce898a0f77e..17abce1c2f809f75edb2c5dc46709094c2ce10c3 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -23,8 +23,10 @@ template class CopyMatrixRowsFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index) { + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + size_t* index = index_lod.data(); auto src_dims = src.dims(); auto dst_dims = dst.dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index 452ae8951000872b706f7e4227a62dbf98109e7e..f27631271a42b4d64abef00d7f119b85e32edda4 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -42,8 +42,10 @@ template class CopyMatrixRowsFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index) { + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + size_t* index = index_lod.cuda_data(); auto src_dims = src.dims(); auto dst_dims = 
dst.dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2, diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index a5c43a2c7d4d729c35a20a27de2a23141e6019bc..6db0427b4174a09dd254d771e8d3d215cc6571a9 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor { // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. void operator()(const DeviceContext& context, const framework::Tensor& src, - const size_t* index, framework::Tensor& dst, + framework::Vector index_lod, framework::Tensor& dst, bool is_src_index); }; @@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor { PADDLE_ENFORCE_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])); CopyMatrixRowsFunctor to_batch; - to_batch(context, lod_tensor, lods[1].data(), batch, true); + to_batch(context, lod_tensor, lods[1], batch, true); return; } @@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor { batch.set_lod(batch_lods); CopyMatrixRowsFunctor to_batch; - to_batch(context, lod_tensor, seq2batch_idx, batch, true); + to_batch(context, lod_tensor, batch_lods[1], batch, true); } }; @@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor { PADDLE_ENFORCE_EQ(in_lod[1].size(), static_cast(lod_tensor.dims()[0])); CopyMatrixRowsFunctor to_seq; - size_t* index = in_lod[1].data(); - to_seq(context, batch, index, lod_tensor, false); + to_seq(context, batch, in_lod[1], lod_tensor, false); } }; diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu index a38df26f59569c4fd54a1ba5691b2cd5f3245344..65c9cfe4a0ec14d220ad237baa71703a783ed0fa 100644 --- a/paddle/operators/math/sequence_padding.cu +++ b/paddle/operators/math/sequence_padding.cu @@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor { T* padding_data = padding.data(); if (norm_by_times) { SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), abs_offset_lod[level].data(), - 
sequence_width, max_sequence_length, num_sequences); + padding_data, const_cast(seq_data), + abs_offset_lod[level].cuda_data(), sequence_width, + max_sequence_length, num_sequences); } else { SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), abs_offset_lod[level].data(), - sequence_width, max_sequence_length, num_sequences); + padding_data, const_cast(seq_data), + abs_offset_lod[level].cuda_data(), sequence_width, + max_sequence_length, num_sequences); } } }; @@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor { T* seq_data = seq.data(); if (norm_by_times) { SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, abs_offset_lod[level].data(), - sequence_width, max_sequence_length, num_sequences); + const_cast(padding_data), seq_data, + abs_offset_lod[level].cuda_data(), sequence_width, + max_sequence_length, num_sequences); } else { SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, abs_offset_lod[level].data(), - sequence_width, max_sequence_length, num_sequences); + const_cast(padding_data), seq_data, + abs_offset_lod[level].cuda_data(), sequence_width, + max_sequence_length, num_sequences); } } }; diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 4c9e6b375ce7251747b9cd443d86cca0858c84ef..f66534a6812a66c737445ea96914a393077d7d65 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -73,7 +73,7 @@ class MaxSeqPoolFunctor { dim3 grid(num_seq, 1); auto stream = context.stream(); KeMaxSequencePool<<>>( - in_data, starts.data(), out_data, max_index, num_seq, dim); + in_data, starts.cuda_data(), out_data, max_index, num_seq, dim); } }; diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu index ceaabd8e0fd81c927fbd4333c0aa7954b8da8513..fd4e28f6113729cd1fa9dc179bd9b601d29b8a7f 100644 --- a/paddle/operators/math/sequence_scale.cu +++ b/paddle/operators/math/sequence_scale.cu @@ -46,7 
+46,7 @@ class ScaleLoDTensorFunctor { SequenceScaleKernel<<< num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, abs_offset_lod[level].data(), scales, seq_width); + seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width); } }; diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..051cc24706d69ec4f38524af1dd510bf079c74c7 --- /dev/null +++ b/paddle/operators/mine_hard_examples_op.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum MiningType { kNone = 0, kMaxNegative, kHardExample }; + +template +bool SortScoreDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_dist, + const float neg_dist_threshold) { + if (mining_type == MiningType::kMaxNegative) { + return match_idx == -1 && match_dist < neg_dist_threshold; + } else if (mining_type == MiningType::kHardExample) { + return true; + } else { + return false; + } +} + +inline MiningType GetMiningType(std::string str) { + if (str == "max_negative") { + return MiningType::kMaxNegative; + } else if (str == "hard_example") { + return MiningType::kHardExample; + } else { + return MiningType::kNone; + } +} + +template +class MineHardExamplesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); + float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); + T neg_dist_threshold = + static_cast(ctx.Attr("neg_dist_threshold")); + int sample_size = ctx.Attr("sample_size"); + MiningType mining_type = + GetMiningType(ctx.Attr("mining_type")); + + auto out_neg_indices = ctx.Output("NegIndices"); + auto out_match_indices = + ctx.Output("UpdatedMatchIndices"); + + framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices); + + int batch_size = in_matched_indices->dims()[0]; + int prior_num = in_matched_indices->dims()[1]; + + auto match_indices = framework::EigenMatrix::From(*in_matched_indices); + + auto match_indices_et = + framework::EigenMatrix::From(*out_match_indices); + + auto match_dist = 
framework::EigenMatrix::From(*in_match_dist); + + const T* cls_loss = in_cls_loss->data(); + const T* loc_loss = nullptr; + if (in_loc_loss) { + loc_loss = in_loc_loss->data(); + } + + std::vector> all_neg_indices; + std::vector batch_starts = {0}; + for (int n = 0; n < batch_size; ++n) { + std::vector> loss_idx; + int neg_sel = 0; + for (int m = 0; m < prior_num; ++m) { + if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m), + neg_dist_threshold)) { + T loss = cls_loss[n * prior_num + m]; + if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { + loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; + } + loss_idx.push_back(std::make_pair(loss, m)); + ++neg_sel; + } + } + + if (mining_type == MiningType::kMaxNegative) { + int num_pos = 0; + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) != -1) ++num_pos; + } + neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel); + } else if (mining_type == MiningType::kHardExample) { + neg_sel = std::min(sample_size, neg_sel); + } + + std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); + std::set sel_indices; + std::vector neg_indices; + std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel, + std::inserter(sel_indices, sel_indices.begin()), + [](std::pair& l) -> int { + return static_cast(l.second); + }); + + if (mining_type == MiningType::kHardExample) { + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) > -1) { + if (sel_indices.find(m) == sel_indices.end()) { + match_indices_et(n, m) = -1; + } + } else { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + } + } + } + } else { + neg_indices.resize(sel_indices.size()); + std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); + } + + all_neg_indices.push_back(neg_indices); + batch_starts.push_back(batch_starts.back() + neg_indices.size()); + } + + framework::LoD out_neg_indices_lod; + 
out_neg_indices_lod.emplace_back(batch_starts); + int neg_offset = 0; + auto neg_data = out_neg_indices->mutable_data( + framework::make_ddim({static_cast(batch_starts.back()), 1}), + ctx.GetPlace()); + + for (auto neg_indices : all_neg_indices) { + std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); + neg_offset += neg_indices.size(); + } + out_neg_indices->set_lod(out_neg_indices_lod); + return; + } +}; + +class MineHardExamplesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ClsLoss"), + "Input(ClsLoss) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchDist"), + "Input(MatchDist) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegIndices"), + "Output(NegIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"), + "Output(UpdatedMatchIndices) of MineHardExamplesOp should " + "not be null."); + + auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); + auto idx_dims = ctx->GetInputDim("MatchIndices"); + auto dis_dims = ctx->GetInputDim("MatchDist"); + + PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL, + "The shape of ClsLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL, + "The shape of MatchIndices is [N, Np]."); + PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL, + "The shape of MatchDist is [N, Np]."); + + if (ctx->HasInput("LocLoss")) { + auto loc_loss_dims = ctx->GetInputDim("LocLoss"); + PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL, + "The shape of LocLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0], + "Batch size of ClsLoss and LocLoss must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], 
loc_loss_dims[1], + "Prior box number of ClsLoss and LocLoss must be the same."); + } + + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], idx_dims[0], + "Batch size of ClsLoss and MatchIndices must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchIndices must be the same."); + + PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0], + "Batch size of ClsLoss and MatchDist must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], dis_dims[1], + "Prior box number of ClsLoss and MatchDist must be the same."); + + auto mining_type = + GetMiningType(ctx->Attrs().Get("mining_type")); + + PADDLE_ENFORCE_NE(mining_type, MiningType::kNone, + "mining_type must be hard_example or max_negative"); + + if (mining_type == MiningType::kMaxNegative) { + auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); + auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); + PADDLE_ENFORCE_GT( + neg_pos_ratio, 0.0f, + "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_GT( + neg_dist_threshold, 0.0f, + "neg_dist_threshold must greater than zero in max_negative mode"); + } else if (mining_type == MiningType::kHardExample) { + auto sample_size = ctx->Attrs().Get("sample_size"); + PADDLE_ENFORCE_GT( + sample_size, 0, + "sample_size must greater than zero in hard_example mode"); + } + + ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("ClsLoss")->type()), + ctx.device_context()); + } +}; + +class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "ClsLoss", + "(Tensor, default Tensor), The classification loss with shape " + "[N, Np], N is the batch size and 
Np is the number of prior box."); + AddInput("LocLoss", + "(Tensor, optional, default Tensor), The localization loss " + "with shape [N, Np], N is the batch size and Np is the number of " + "prior box.") + .AsDispensable(); + AddInput("MatchIndices", + "(Tensor, Tensor), Matched indices with shape [N, Np], N is " + "the batch size and Np is the number of prior box. " + "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " + "instance does not match any entity, otherwise means it is " + "matched to row."); + AddInput("MatchDist", + "(Tensor, default Tensor) Matched distance with shape [N, " + "Np], N is the batch size and Np is the number of prior box."); + AddAttr("neg_pos_ratio", + "(float) The ratio of the negative box to the positive " + "box. Use only when mining_type is max_negative.") + .SetDefault(1.0); + AddAttr("neg_dist_threshold", + "(float) The negative overlap upper bound for the unmatched " + "predictions. Use only when mining_type is max_negative.") + .SetDefault(0.5); + AddAttr("sample_size", + "(int) The max sample size of negative box. Use only when " + "mining_type is hard_example.") + .SetDefault(0); + AddAttr("mining_type", + "(string) The mining algorithm name, the value is " + "hard_example or max_negative.") + .SetDefault("max_negative") + .InEnum({"hard_example", "max_negative"}); + + AddOutput( + "NegIndices", + "(LoDTensor) The output of negative example indices. a LoDTensor " + "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " + "and each element is the prior box index. " + "For example, the batch size is 2, the lod is [[0, 1, 2]], " + "the sample 0's box 1(MatchIndices[0][1]) is selected, " + "and sample 1's box 0 is selected. The output NegIndices is " + "[[1], [0]]."); + + AddOutput("UpdatedMatchIndices", + "(Tensor) The output of updated MatchIndices, a tensor with " + "shape [N, Np]. Only update when mining_type is " + "hard_example. 
The input MatchIndices elements will be update to " + "-1 when it is not in the candidate high loss list of negative " + "examples."); + + AddComment(R"DOC( +Mine hard examples Operator. +This operator implements hard example mining to select a subset of negative box indices. +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an MatchDist > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of +MatchIndices elements with value -1. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker); + +REGISTER_OP_CPU_KERNEL( + mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a65fe69f1534112923c72e17969f808b9310735 --- /dev/null +++ b/paddle/operators/multiclass_nms_op.cc @@ -0,0 +1,384 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +constexpr int64_t kOutputDim = 6; +constexpr int64_t kBBoxSize = 4; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + + PADDLE_ENFORCE_EQ(box_dims.size(), 2, + "The rank of Input(BBoxes) must be 2."); + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3."); + PADDLE_ENFORCE_EQ(box_dims[1], 4, + "The 2nd dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2], + "The 1st dimensiong of Input(BBoxes) must be equal to " + "3rd dimension of Input(Scores), which represents the " + "predicted bboxes."); + + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. 
+ ctx->SetOutputDim("Out", {box_dims[0], 6}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Scores")->type()), + ctx.device_context()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < sorted_indices->size()) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices) const { + // The total boxes for each instance. 
+ int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (int k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + std::map>& indices, + int& num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, + &(indices[c])); + num_det += indices[c].size(); + } + + num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> 
score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (int j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (int j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(indices); + num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + std::map>& selected_indices, + Tensor* outs) const { + int predict_dim = scores.dims()[1]; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (int j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + odata[count * kOutputDim] = label; // label + odata[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + + int64_t batch_size = score_dims[0]; + int64_t class_num = score_dims[1]; + int64_t predict_dim = 
score_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1}, ctx.GetPlace()); + od[0] = -1; + } else { + outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, *boxes, all_indices[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("BBoxes", + "(Tensor) A 2-D Tensor with shape [M, 4] represents the " + "predicted locations of M bounding bboxes. Each bounding box " + "has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax]."); + AddInput("Scores", + "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. For each category " + "there are total M scores which corresponding M bounding boxes. " + " Please note, M is equal to the 1st dimension of BBoxes. "); + AddAttr( + "background_label", + "(int64_t, defalut: 0) " + "The index of background label, the background label will be ignored. 
" + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences aftern the filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, defalut: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], No is the total " + "number of detections in this mini-batch. For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to do multi-class non maximum suppression (NMS) on a batched +of boxes and scores. + +In the NMS step, this operator greedily selects a subset of detection bounding +boxes that have high scores larger than score_threshold, if providing this +threshold, then selects the largest nms_top_k confidences scores if nms_top_k +is larger than -1. Then this operator pruns away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. + +Aftern NMS step, at most keep_top_k number of total bboxes are to be kept +per image if keep_top_k is larger than -1. + +This operator support multi-class and batched inputs. It applying NMS +independently for each class. 
The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bbox for this image. If there is no detected boxes +for all images, all the elements in LoD are 0, and the Out only contains one +value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 49e1eb3402482e7ff12d9b2b640f7271a80cf6d9..ba71094219f37eb7a3c2df68be986cec7afbf7ab 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -12,187 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include -#include -#include - -#include "paddle/framework/executor.h" +#include "paddle/framework/data_type.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" -#include "paddle/framework/proto_desc.h" -#include "paddle/operators/detail/grpc_server.h" -#include "paddle/operators/detail/sendrecvop_utils.h" -#include "paddle/operators/detail/simple_block_queue.h" -#include "paddle/string/printf.h" + +#include +#include "paddle/operators/detail/grpc_client.h" namespace paddle { namespace operators { -constexpr char kOptimizeBlock[] = "OptimizeBlock"; - -void RunServer(std::shared_ptr service) { - service->RunSyncUpdate(); - VLOG(4) << "RunServer thread end"; -} - -static void CreateTensorFromMessageType(framework::Variable *var, - sendrecv::VarType var_type) { - if (var_type == sendrecv::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else { - PADDLE_THROW( - "VariableMessage type %d is not in " - "[LoDTensor, SelectedRows]", - var_type); - } -} - class RecvOp : public framework::OperatorBase { public: - RecvOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) { - if (!rpc_service_) { - std::string endpoint = Attr("endpoint"); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); - server_thread_.reset(new std::thread(RunServer, rpc_service_)); - } - } - - void Stop() override { - detail::MessageWithName term_msg; - term_msg.first = LISTEN_TERMINATE_MESSAGE; - rpc_service_->Push(term_msg); - rpc_service_->ShutDown(); - server_thread_->join(); - } - - std::string GetGradVarNameForTrainer(const std::string &varname) const { - if (grads_counter_.find(varname) == grads_counter_.end()) { - grads_counter_[varname] = 0; + RecvOp(const 
std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i]; + client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); } - return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++); + PADDLE_ENFORCE(client_.Wait()); } - void Run(const framework::Scope &scope, - const platform::Place &dev_place) const override { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - framework::Scope &recv_scope = scope.NewScope(); - - // FIXME(Yancey1989): initialize rpc server with laze mode. - rpc_service_->SetScope(&recv_scope); - rpc_service_->SetDevCtx(&dev_ctx); - auto param_list = Attr>("ParamList"); - auto grad_list = Attr>("GradList"); - auto fan_in = Attr("Fanin"); - - auto *block = Attr(kOptimizeBlock); - auto *program = block->Program(); - framework::Executor executor(dev_place); - - // TODO(typhoonzero): change this to a while_op for every cluster-batch. - bool exit_flag = false; - while (!exit_flag) { - // Get from multiple trainers, we don't care about the order in which - // the gradients arrives, just add suffix 0~n and merge the gradient. 
- rpc_service_->SetCond(0); - size_t recv_var_cnt = 0; - int batch_barrier = 0; - while (batch_barrier != fan_in) { - const detail::MessageWithName &v = rpc_service_->Get(); - auto grad_var_name = v.first; - if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { - LOG(INFO) << "received terminate message and exit"; - exit_flag = true; - break; - } else if (grad_var_name == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "recv batch barrier message"; - batch_barrier++; - continue; - } else { - // receive a variable - recv_var_cnt++; - auto it = - std::find(grad_list.begin(), grad_list.end(), grad_var_name); - std::string param_var_name; - if (it != grad_list.end()) { - param_var_name = param_list[it - grad_list.begin()]; - } else { - LOG(ERROR) << "grad has no paired param:" << grad_var_name; - } - VLOG(3) << "received grad: " << grad_var_name - << " updating param: " << param_var_name; - - if (fan_in > 1) { - grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); - } - auto *var = recv_scope.FindVar(grad_var_name); - if (var == nullptr) { - LOG(ERROR) << "Can not find server side var: " << grad_var_name; - PADDLE_THROW("Can not find server side var"); - } - detail::DeserializeFromMessage(v.second, dev_ctx, var); - } - } - VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier."; - // TODO(Yancey1989): merge SelectedRows variables here - if (exit_flag) { - break; - } - - try { - executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ - false /*create_local_scope*/, false /*create_vars*/); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - rpc_service_->SetCond(1); - rpc_service_->WaitClientGet(recv_var_cnt); - grads_counter_.clear(); - } // while(true) - } - - protected: - std::shared_ptr rpc_service_; - std::shared_ptr server_thread_; - mutable std::unordered_map grads_counter_; + private: + mutable detail::RPCClient client_; }; class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: - 
RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) + RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); AddComment(R"DOC( Recv operator -This operator will recieve tensor from send_op +This operator can get variables from server side. )DOC"); - AddAttr("endpoint", - "(string, default 127.0.0.1:6164)" - "IP address to listen on.") - .SetDefault("127.0.0.1:6164") - .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); - AddAttr( - kOptimizeBlock, "Serialized ProgramDesc string for recv to run."); - AddAttr>( - "ParamList", "type list of string", - "grad->param name mapping to find which parameters to optimize.") - .SetDefault({}); - AddAttr>( - "GradList", "type list of string", - "grad->param name mapping to find which parameters to optimize.") + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") .SetDefault({}); - AddAttr("Fanin", "type int", - "Number of trainers in the current cluster job") - .SetDefault(1); } }; diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index 41f2c5b9de91ade15b4010f56377675cfd1b611c..b3825212e1ac41b13a2f4cad2c128da39c5f6e71 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -307,7 +307,7 @@ class RowConvKernel int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.data(); + size_t *idx = batch_indices.cuda_data(); auto stream = context.cuda_device_context().stream(); if (future_context <= 32) { @@ -345,7 +345,7 @@ class RowConvGradKernel int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.data(); + size_t *idx = batch_indices.cuda_data(); auto &device_ctx = 
context.cuda_device_context(); math::SetConstant zero; diff --git a/paddle/operators/save_combine_op.cc b/paddle/operators/save_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bffa2908bc42d73332f22fa3706d24ab49cd4b38 --- /dev/null +++ b/paddle/operators/save_combine_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(sidgoyal78): These function are needed by other files (save_op), move +// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). 
+constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveCombineOp : public framework::OperatorBase { + public: + SaveCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto inp_var_names = Inputs("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); 
+ PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = var->Get(); + // Serialize tensor + framework::SerializeToStream(fout, tensor, dev_ctx); + } + fout.close(); + } +}; + +class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); + AddComment(R"DOC( +SaveCombine operator + +This operator will serialize and write a list of input LoDTensor variables +to a file on disk. +)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if it exists.") + .SetDefault(true); + AddAttr( + "file_path", + "(string)" + "The \"file_path\" where the LoDTensor variables will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, + ops::SaveCombineOpProtoMaker); diff --git a/paddle/operators/save_load_combine_op_test.cc b/paddle/operators/save_load_combine_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3ddc4a6c55d72e4e444869a1ebcd7662c892317 --- /dev/null +++ b/paddle/operators/save_load_combine_op_test.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/framework/op_registry.h" + +USE_NO_KERNEL_OP(save_combine); +USE_NO_KERNEL_OP(load_combine); + +int* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, + std::string var_name, + paddle::platform::CPUPlace& place, + paddle::framework::Scope& scope, + paddle::framework::LoD& expect_lod) { + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({x, y}); + expect_lod.resize(1); + for (size_t i = 0; i < lod_info.size(); i++) { + expect_lod[0].push_back(lod_info[i]); + } + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + return expect; +} + +paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad( + const std::string out_var_name, paddle::framework::Scope& scope) { + auto load_var = scope.Var(out_var_name); + auto target = load_var->GetMutable(); + return target; +} + +int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target, + paddle::framework::Scope& scope, + paddle::framework::LoD& actual_lod) { + int* actual = target->data(); + actual_lod = target->lod(); + return actual; +} + +void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod, + paddle::framework::LoD actual_lod, const int& numel) { + for (int64_t i = 0; i < numel; ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < 
expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} + +// Here, we create 4 LoDTensors and use save_combine_op to first save these +// in a single file. Then, we use load_combine_op to load these sequentially +TEST(SaveLoadCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + std::vector lod1 = {0, 1, 2, 3, 10}; + int numel1 = 100; + paddle::framework::LoD expect_lod1; + int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope, + expect_lod1); + + std::vector lod2 = {0, 2, 5, 10}; + int numel2 = 200; + paddle::framework::LoD expect_lod2; + int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope, + expect_lod2); + + std::vector lod3 = {0, 2, 3, 20}; + int numel3 = 4000; + paddle::framework::LoD expect_lod3; + int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place, + scope, expect_lod3); + + std::vector lod4 = {0, 1, 20}; + int numel4 = 1000; + paddle::framework::LoD expect_lod4; + int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope, + expect_lod4); + + // Set attributes + std::string filename = "check_tensor.ls"; + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string(filename)}); + + // Run the save_combine_op + auto save_combine_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", + {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs); + save_combine_op->Run(scope, place); + + // Set up output vars + auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope); + auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope); + auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope); + auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope); + + // Run the load_combine_op + auto load_combine_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, + {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs); + 
load_combine_op->Run(scope, place); + + paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4; + int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1); + int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2); + int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3); + int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4); + + CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); + CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); + CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3); + CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4); +} + +// Test with original SaveLoadTest +TEST(SaveLoadTestWithCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("check_t.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + int* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); 
++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc index 40103d864fb58804b39ca5f3c63e802a430ce886..d829d5da174b73613da9dcfcd308a5b05e12bce9 100644 --- a/paddle/operators/save_load_op_test.cc +++ b/paddle/operators/save_load_op_test.cc @@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) { auto var = scope.Var("test_var"); auto tensor = var->GetMutable(); - tensor->Resize({10, 10}); + tensor->Resize({3, 10}); paddle::framework::LoD expect_lod; expect_lod.resize(1); expect_lod[0].push_back(0); diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index bb719dc2a8a577bc042a2a70f7169b7d70f83684..ee0f268b0e4dfa23bf878d71404d47553183a977 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -42,28 +42,34 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + for (size_t i = 0; i < ins.size(); i++) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } - PADDLE_ENFORCE(client_.Wait()); + PADDLE_ENFORCE(rpc_client->Wait()); for (auto& ep : endpoints) { VLOG(3) << "batch barrier, ep: " << ep; - client_.AsyncSendBatchBarrier(ep); + rpc_client->AsyncSendBatchBarrier(ep); } - PADDLE_ENFORCE(client_.Wait()); - - for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + 
PADDLE_ENFORCE(rpc_client->Wait()); + + if (outs.size() > 0) { + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); } - - PADDLE_ENFORCE(client_.Wait()); } - - private: - mutable detail::RPCClient client_; }; class SendOpMaker : public framework::OpProtoAndCheckerMaker { @@ -73,11 +79,16 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); AddOutput("Out", "(Tensor) Output tensor to be received from server") .AsDuplicable(); + AddOutput("RPCClient", + "(RPCClient) The RPC client object which is" + "initialized at most once."); AddComment(R"DOC( Send operator This operator will send tensor to recv_op at the parameter server. )DOC"); + // TODO(typhoonzero): remove this attr generate de-duplicated vector from + // epmap when initializing. AddAttr>("endpoints", "(string vector, default 127.0.0.1:6164)" "Server endpoints to send variables to.") diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index 045a0f5434f339bab345d14881ed05450ce6588d..31527a906d56da54d2571910de627757d708a996 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/string/printf.h" USE_NO_KERNEL_OP(send); -USE_NO_KERNEL_OP(recv); +USE_NO_KERNEL_OP(listen_and_serv); USE_OP(sum); namespace f = paddle::framework; @@ -33,7 +33,7 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; // global for simplicity. 
-std::unique_ptr recv_op; +std::unique_ptr listen_and_serv_op; void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { p::CPUDeviceContext ctx(place); @@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) { InitTensorsInScope(scope, place); } - // sub program run in recv_op, for simple test we use sum + // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; f::BlockDesc *block = program.MutableBlock(0); // X for server side tensors, RX for received tensers, must be of same shape. @@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) { attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); attrs.insert({"OptimizeBlock", block}); - recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs); - recv_op->Run(scope, place); + listen_and_serv_op = + f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs); + listen_and_serv_op->Run(scope, place); } TEST(SendRecvOp, CPUDense) { @@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) { for (int64_t i = 0; i < target->numel(); ++i) { EXPECT_EQ(expected[i] * 2, actual[i]); } - recv_op->Stop(); + listen_and_serv_op->Stop(); server_thread.join(); - recv_op.reset(nullptr); + listen_and_serv_op.reset(nullptr); } TEST(SendRecvOp, CPUSparse) { @@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) { EXPECT_EQ(expect_value->mutable_data(place)[i], actual->mutable_data(place)[i]); } - recv_op->Stop(); + listen_and_serv_op->Stop(); server_thread.join(); - recv_op.reset(); + listen_and_serv_op.reset(); } diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu index f1e3b96acd0259de2b3ca1348834bd17e1e174a2..a5311f15f0c607c880a6f12c0bef10b2dd8c8a79 100644 --- a/paddle/operators/sequence_erase_op.cu +++ b/paddle/operators/sequence_erase_op.cu @@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, PADDLE_CUDA_NUM_THREADS, 0, 
stream>>>( num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); - // Set LoD for output - thrust::host_vector out_lod0 = dev_out_lod; + std::vector out_lod0(dev_out_lod.begin(), dev_out_lod.end()); framework::LoD out_lod; out_lod.push_back(out_lod0); out->set_lod(out_lod); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 42f8f8b2f072f9d204dfadcd732926b5c98dc617..29f5aa3542c26c76a1b80da61ec6752019216131 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in_height, out_dims[0]); auto& in_value = grad->value(); - auto& in_rows = grad->rows(); + framework::Vector in_rows(grad->rows()); int64_t in_row_numel = in_value.numel() / in_rows.size(); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); @@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { dim3 grid(1, in_rows.size()); SparseSGDFunctorKernel< T, 256><<>>( - in_data, in_rows.data(), learning_rate->data(), out_data, + in_data, in_rows.cuda_data(), learning_rate->data(), out_data, in_row_numel); } else { diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index 48201b344de0d3bd2b121a12389876dad095f10d..3d8102c3ae20c8b714cd48b4fc78dc18a0cf89a7 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel { } } } else if (out_var->IsType()) { - PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now"); + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto &in_sel0 = in_vars[0]->Get(); + auto &rows = in_sel0.rows(); +#ifdef PADDLE_WITH_CUDA + std::vector rows_in_cpu; + rows_in_cpu.reserve(rows.size()); + for (auto item : rows) { + rows_in_cpu.push_back(item); + } + in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height())); +#else + in0.reset(new framework::SelectedRows(rows, 
in_sel0.height())); +#endif + in0->mutable_value()->ShareDataWith(in_sel0.value()); + } + + auto get_selected_row = [&](size_t i) -> const SelectedRows & { + if (i == 0 && in0) { + return *in0.get(); + } else { + return in_vars[i]->Get(); + } + }; + auto *out = context.Output("Out"); out->mutable_rows()->clear(); auto *out_value = out->mutable_value(); @@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel { // Runtime InferShape size_t first_dim = 0; for (int i = 0; i < N; i++) { - first_dim += in_vars[i]->Get().rows().size(); + auto &sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); } - auto in_dim = in_vars[0]->Get().value().dims(); - auto in_dim_vec = framework::vectorize(in_dim); - in_dim_vec[0] = static_cast(first_dim); + auto in_dim = + framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); - out_value->Resize(framework::make_ddim(in_dim_vec)); + out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { - PADDLE_ENFORCE_EQ(out->height(), - in_vars[i]->Get().height()); - functor(context.template device_context(), - in_vars[i]->Get(), offset, out); - offset += in_vars[i]->Get().value().numel(); + auto &sel_row = get_selected_row(i); + + PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); + functor(context.template device_context(), sel_row, + offset, out); + offset += sel_row.value().numel(); } } else if (out_var->IsType()) { auto &out_array = *out_var->GetMutable(); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 2fdd25dbbe68659f8a0a9da13a87148ed259127a..a744ebd61595403ee495a2e2c9e84181422e92ff 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase { auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); + 
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); while (cond.data()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); @@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase { sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } + dev_ctx.Wait(); + const_cast(scope).DeleteScope(&cur_scope); } } }; diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 2a8afc940393baaaa939471f50f2d5c63edd6a84..6df087d154cc104955c6399050c9cb2bce8d36e1 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -233,7 +233,7 @@ void ParseEvents(std::vector>& events, }; break; default: - sorted_domain = "event end time"; + sorted_domain = "event first end time"; } std::vector> events_table; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 490397afdd4de0cc1aafde746d31b1d800eded3b..a880d9bdbc63aacc1f2cdbc0d7da001a59c7b372 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) { .def( "__init__", [](LoDTensor &instance, const std::vector> &lod) { -#ifndef PADDLE_WITH_CUDA - new (&instance) LoDTensor(lod); -#else - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - new (&instance) LoDTensor(new_lod); -#endif + LoD new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + new (&instance) LoDTensor(new_lod); }) .def("__init__", 
[](LoDTensor &instance) { new (&instance) LoDTensor(); }) .def("set_lod", [](LoDTensor &self, const std::vector> &lod) { -#ifndef PADDLE_WITH_CUDA - self.set_lod(lod); -#else LoD new_lod; new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); self.set_lod(new_lod); -#endif }) .def("lod", [](LoDTensor &self) -> std::vector> { -#ifndef PADDLE_WITH_CUDA - return self.lod(); -#else - auto lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod), - [](Vector item) -> - std::vector { - std::vector v; - v.reserve(item.size()); - std::copy(item.begin(), item.end(), std::back_inserter(v)); - return v; - }); - return new_lod; -#endif + auto lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; }); py::class_(m, "SelectedRows") diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index f0620498cfa6775ce2949cc02fa9f6c9529dec2e..65c46745556bc5ea91fdd4e33060f2535422e8e8 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | ------ | -------- | ----------- | | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | -| `WITH_TESTING` | ON | Build unit tests binaries. | +| `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. 
| diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index fbae37b2ca063e32cb12ded0da901d93438bc9a2..59f3af03986793a4185bc5933d26888e89d555f7 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -32,7 +32,7 @@ function cmake_gen() { cat < new_argv; std::string gflags_env; - new_argv.push_back(argv[0]); + for (int i = 0; i < argc; ++i) { + new_argv.push_back(argv[i]); + } #ifdef PADDLE_WITH_CUDA new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); + strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory," + "warpctc_dir")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 787416aed1acf81138df06110317614dfe77fb48..3ee58393c72c0b6f9bec96be51ad3946752a35dd 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -26,6 +26,7 @@ import initializer import layers import nets import optimizer +import learning_rate_decay import backward import regularizer from param_attr import ParamAttr @@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler import clip from memory_optimization_transpiler import memory_optimize +import profiler Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ - 'io', - 'initializer', - 'layers', - 'nets', - 'optimizer', - 'backward', - 'regularizer', - 'LoDTensor', - 'CPUPlace', - 'CUDAPlace', - 'Tensor', + 'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay', + 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr' - 'DataFeeder', - 'clip', - 'SimpleDistributeTranspiler', - 
'DistributeTranspiler', - 'memory_optimize', + 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler', + 'memory_optimize', 'profiler' ] @@ -87,10 +77,10 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark' + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir' ] if core.is_compiled_with_cuda(): - read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync'] + read_env_flags += ['fraction_of_gpu_memory_to_use'] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py new file mode 100644 index 0000000000000000000000000000000000000000..db1808c64745ac153962c050b08993450dd93c06 --- /dev/null +++ b/python/paddle/v2/fluid/debuger.py @@ -0,0 +1,265 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import re +from graphviz import GraphPreviewGenerator +import proto.framework_pb2 as framework_pb2 + +_vartype2str_ = [ + "UNK", + "LoDTensor", + "SelectedRows", + "FeedMinibatch", + "FetchList", + "StepScopes", + "LodRankTable", + "LoDTensorArray", + "PlaceList", +] +_dtype2str_ = [ + "bool", + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", +] + + +def repr_data_type(type): + return _dtype2str_[type] + + +def repr_tensor(proto): + return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)], + str(proto.dims)) + + +reprtpl = "{ttype} {name} ({reprs})" + + +def repr_lodtensor(proto): + if not proto.lod_tensor: return + level = proto.lod_tensor.lod_level + reprs = repr_tensor(proto.lod_tensor.tensor) + return reprtpl.format( + ttype="LoDTensor" if level > 0 else "Tensor", + name=proto.name, + reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs) + + +def repr_selected_rows(proto): + if not proto.selected_rows: return + return reprtpl.format( + ttype="SelectedRows", + name=proto.name, + reprs=repr_tensor(proto.selected_rows)) + + +def repr_tensor_array(proto): + if not proto.tensor_array: return + return reprtpl.format( + ttype="TensorArray", + name=proto.name, + reprs="level=%d, %s" % (proto.tensor_array.lod_level, + repr_tensor(proto.lod_tensor))) + + +type_handlers = [ + repr_lodtensor, + repr_selected_rows, + repr_tensor_array, +] + + +def repr_var(vardesc): + for handler in type_handlers: + res = handler(vardesc) + if res: + return res + + +def pprint_program_codes(program_desc): + reprs = [] + for block_idx in range(program_desc.num_blocks()): + block_desc = program_desc.block(block_idx) + block_repr = pprint_block_codes(block_desc) + reprs.append(block_repr) + return '\n'.join(reprs) + + +def pprint_block_codes(block_desc, show_backward=False): + def is_op_backward(op_desc): + if op_desc.type.endswith('_grad'): return True + + def is_var_backward(var): + if "@GRAD" in var.parameter: return True + 
for arg in var.arguments: + if "@GRAD" in arg: return True + + for var in op_desc.inputs: + if is_var_backward(var): return True + for var in op_desc.outputs: + if is_var_backward(var): return True + return False + + def is_var_backward(var_desc): + return "@GRAD" in var_desc.name + + if type(block_desc) is not framework_pb2.BlockDesc: + block_desc = framework_pb2.BlockDesc.FromString( + block_desc.serialize_to_string()) + var_reprs = [] + op_reprs = [] + for var in block_desc.vars: + if not show_backward and is_var_backward(var): + continue + var_reprs.append(repr_var(var)) + + for op in block_desc.ops: + if not show_backward and is_op_backward(op): continue + op_reprs.append(repr_op(op)) + + tpl = "// block-{idx} parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n" + return tpl.format( + idx=block_desc.idx, + pidx=block_desc.parent_idx, + vars='\n'.join(var_reprs), + ops='\n'.join(op_reprs), ) + + +def repr_attr(desc): + tpl = "{key}={value}" + valgetter = [ + lambda attr: attr.i, + lambda attr: attr.f, + lambda attr: attr.s, + lambda attr: attr.ints, + lambda attr: attr.floats, + lambda attr: attr.strings, + lambda attr: attr.b, + lambda attr: attr.bools, + lambda attr: attr.block_idx, + lambda attr: attr.l, + ] + key = desc.name + value = valgetter[desc.type](desc) + if key == "dtype": + value = repr_data_type(value) + return tpl.format(key=key, value=str(value)), (key, value) + + +def _repr_op_fill_constant(optype, inputs, outputs, attrs): + if optype == "fill_constant": + return "{output} = {data} [shape={shape}]".format( + output=','.join(outputs), + data=attrs['value'], + shape=str(attrs['shape'])) + + +op_repr_handlers = [_repr_op_fill_constant, ] + + +def repr_op(opdesc): + optype = None + attrs = [] + attr_dict = {} + is_target = None + inputs = [] + outputs = [] + + tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]" + args2value = lambda args: args[0] if len(args) == 1 else str(list(args)) + for var in opdesc.inputs: + key = 
var.parameter + value = args2value(var.arguments) + inputs.append("%s=%s" % (key, value)) + for var in opdesc.outputs: + value = args2value(var.arguments) + outputs.append(value) + for attr in opdesc.attrs: + attr_repr, attr_pair = repr_attr(attr) + attrs.append(attr_repr) + attr_dict[attr_pair[0]] = attr_pair[1] + + is_target = opdesc.is_target + + for handler in op_repr_handlers: + res = handler(opdesc.type, inputs, outputs, attr_dict) + if res: return res + + return tpl.format( + outputs=', '.join(outputs), + optype=opdesc.type, + inputs=', '.join(inputs), + attrs="{%s}" % ','.join(attrs), + is_target=", is_target" if is_target else "") + + +def draw_block_graphviz(block, highlights=None, path="./temp.dot"): + ''' + Generate a debug graph for block. + Args: + block(Block): a block. + ''' + graph = GraphPreviewGenerator("some graph") + # collect parameters and args + protostr = block.desc.serialize_to_string() + desc = framework_pb2.BlockDesc.FromString(str(protostr)) + + def need_highlight(name): + if highlights is None: return False + for pattern in highlights: + assert type(pattern) is str + if re.match(pattern, name): + return True + return False + + # draw parameters and args + vars = {} + for var in desc.vars: + shape = [str(i) for i in var.lod_tensor.tensor.dims] + if not shape: + shape = ['null'] + # create var + if var.persistable: + varn = graph.add_param( + var.name, var.type, shape, highlight=need_highlight(var.name)) + else: + varn = graph.add_arg(var.name, highlight=need_highlight(var.name)) + vars[var.name] = varn + + def add_op_link_var(op, var, op2var=False): + for arg in var.arguments: + if arg not in vars: + # add missing variables as argument + vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg)) + varn = vars[arg] + highlight = need_highlight(op.description) or need_highlight( + varn.description) + if op2var: + graph.add_edge(op, varn, highlight=highlight) + else: + graph.add_edge(varn, op, highlight=highlight) + + for op in 
desc.ops: + opn = graph.add_op(op.type, highlight=need_highlight(op.type)) + for var in op.inputs: + add_op_link_var(opn, var, False) + for var in op.outputs: + add_op_link_var(opn, var, True) + + graph(path, show=True) diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index 77f80442e06cb18402bb1b8b97aa9119c7473f54..121b407cae41fa477843b7252ebacc9053d5f7aa 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -153,11 +153,18 @@ class DistributeTranspiler: self.param_grad_ep_mapping[ep]["params"].append(param) self.param_grad_ep_mapping[ep]["grads"].append(grad) + rpc_client_var = program.global_block().create_var( + name="RPC_CLIENT_VAR", + psersistable=True, + dtype='float32', # dtype and shape is not used in fact + shape=[0]) + # create send_op send_op = program.global_block().append_op( type="send", inputs={"X": send_inputs}, - outputs={"Out": send_outputs}, + outputs={"Out": send_outputs, + "RPCClient": rpc_client_var}, attrs={"endpoints": pserver_endpoints, "epmap": eplist}) # step4 @@ -471,9 +478,9 @@ class DistributeTranspiler: else: self._append_pserver_non_opt_ops(optimize_sub_program, pserver_program, opt_op) - # Append the recv op + # Append the listen_and_serv op pserver_program.global_block().append_op( - type="recv", + type="listen_and_serv", inputs={}, outputs={}, attrs={ diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 8bf545e2ecc3939b00ba25d003a6b3887a54f860..69cbebe41e57309bb0993148833836b715e417ce 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -451,9 +451,8 @@ class Operator(object): if not given == need: raise ValueError(("Incorrect setting for output(s) of " "operator \"%s\". 
Need: [%s] Given: [%s]") % - (type, ", ".join(str(e) - for e in need), ", ".join( - str(e) for e in given))) + (type, ", ".join(str(e) for e in need), + ", ".join(str(e) for e in given))) for out_proto in proto.outputs: out_args = outputs[out_proto.name] @@ -489,7 +488,8 @@ class Operator(object): no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv', 'parallel_do' + 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', + 'load_combine' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) diff --git a/python/paddle/v2/fluid/graphviz.py b/python/paddle/v2/fluid/graphviz.py new file mode 100644 index 0000000000000000000000000000000000000000..5881119c39231282b5654cd60720a1d8a7877896 --- /dev/null +++ b/python/paddle/v2/fluid/graphviz.py @@ -0,0 +1,272 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import random +import subprocess +import logging + + +def crepr(v): + if type(v) is str or type(v) is unicode: + return '"%s"' % v + return str(v) + + +class Rank(object): + def __init__(self, kind, name, priority): + ''' + kind: str + name: str + priority: int + ''' + self.kind = kind + self.name = name + self.priority = priority + self.nodes = [] + + def __str__(self): + if not self.nodes: + return '' + + return '{' + 'rank={};'.format(self.kind) + \ + ','.join([node.name for node in self.nodes]) + '}' + + +class Graph(object): + rank_counter = 0 + + def __init__(self, title, **attrs): + self.title = title + self.attrs = attrs + self.nodes = [] + self.edges = [] + self.rank_groups = {} + + def code(self): + return self.__str__() + + def rank_group(self, kind, priority): + name = "rankgroup-%d" % Graph.rank_counter + Graph.rank_counter += 1 + rank = Rank(kind, name, priority) + self.rank_groups[name] = rank + return name + + def node(self, label, prefix, description="", **attrs): + node = Node(label, prefix, description, **attrs) + + if 'rank' in attrs: + rank = self.rank_groups[attrs['rank']] + del attrs['rank'] + rank.nodes.append(node) + self.nodes.append(node) + return node + + def edge(self, source, target, **attrs): + edge = Edge(source, target, **attrs) + self.edges.append(edge) + return edge + + def compile(self, dot_path): + file = open(dot_path, 'w') + file.write(self.__str__()) + image_path = os.path.join( + os.path.dirname(__file__), dot_path[:-3] + "pdf") + cmd = ["dot", "-Tpdf", dot_path, "-o", image_path] + subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + logging.warning("write block debug graph to {}".format(image_path)) + return image_path + + def show(self, dot_path): + image = self.compile(dot_path) + cmd = ["open", image] + subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + def _rank_repr(self): + ranks = sorted( + 
self.rank_groups.items(), + cmp=lambda a, b: a[1].priority > b[1].priority) + repr = [] + for x in ranks: + repr.append(str(x[1])) + return '\n'.join(repr) + '\n' + + def __str__(self): + reprs = [ + 'digraph G {', + 'title = {}'.format(crepr(self.title)), + ] + + for attr in self.attrs: + reprs.append("{key}={value};".format( + key=attr, value=crepr(self.attrs[attr]))) + + reprs.append(self._rank_repr()) + + random.shuffle(self.nodes) + reprs += [str(node) for node in self.nodes] + + for x in self.edges: + reprs.append(str(x)) + + reprs.append('}') + return '\n'.join(reprs) + + +class Node(object): + counter = 1 + + def __init__(self, label, prefix, description="", **attrs): + self.label = label + self.name = "%s_%d" % (prefix, Node.counter) + self.description = description + self.attrs = attrs + Node.counter += 1 + + def __str__(self): + reprs = '{name} [label={label} {extra} ];'.format( + name=self.name, + label=self.label, + extra=',' + ','.join("%s=%s" % (key, crepr(value)) + for key, value in self.attrs.items()) + if self.attrs else "") + return reprs + + +class Edge(object): + def __init__(self, source, target, **attrs): + ''' + Link source to target. + :param source: Node + :param target: Node + :param graph: Graph + :param attrs: dic + ''' + self.source = source + self.target = target + self.attrs = attrs + + def __str__(self): + repr = "{source} -> {target} {extra}".format( + source=self.source.name, + target=self.target.name, + extra="" if not self.attrs else + "[" + ','.join("{}={}".format(attr[0], crepr(attr[1])) + for attr in self.attrs.items()) + "]") + return repr + + +class GraphPreviewGenerator(object): + ''' + Generate a graph image for ONNX proto. 
+ ''' + + def __init__(self, title): + # init graphviz graph + self.graph = Graph( + title, + layout="dot", + concentrate="true", + rankdir="TB", ) + + self.op_rank = self.graph.rank_group('same', 2) + self.param_rank = self.graph.rank_group('same', 1) + self.arg_rank = self.graph.rank_group('same', 0) + + def __call__(self, path='temp.dot', show=False): + if not show: + self.graph.compile(path) + else: + self.graph.show(path) + + def add_param(self, name, data_type, shape, highlight=False): + label = '\n'.join([ + '<', + ' ', + ' ', + ' ', + ' ', + ' ' + ' ', + ' ', + ' ' + ' ', + '
', + ' ', + name, + ' ', + '
', + str(data_type), + '
', + '[%s]' % 'x'.join(shape), + '
>', + ]) + return self.graph.node( + label, + prefix="param", + description=name, + shape="none", + style="rounded,filled,bold", + width="1.3", + color="#148b97" if not highlight else "orange", + fontcolor="#ffffff", + fontname="Arial") + + def add_op(self, opType, **kwargs): + highlight = False + if 'highlight' in kwargs: + highlight = kwargs['highlight'] + del kwargs['highlight'] + return self.graph.node( + "<%s>" % opType, + prefix="op", + description=opType, + shape="box", + style="rounded, filled, bold", + color="#303A3A" if not highlight else "orange", + fontname="Arial", + fontcolor="#ffffff", + width="1.3", + height="0.84", ) + + def add_arg(self, name, highlight=False): + return self.graph.node( + crepr(name), + prefix="arg", + description=name, + shape="box", + style="rounded,filled,bold", + fontname="Arial", + fontcolor="#999999", + color="#dddddd" if not highlight else "orange") + + def add_edge(self, source, target, **kwargs): + highlight = False + if 'highlight' in kwargs: + highlight = kwargs['highlight'] + del kwargs['highlight'] + return self.graph.edge( + source, + target, + color="#00000" if not highlight else "orange", + **kwargs) diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index d56ec45c538b580f5520bc060b4b339bb1be0539..613dc20b6ea5533d126a73b7ec47796b3f812db5 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -46,6 +46,9 @@ def is_parameter(var): def is_persistable(var): + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST: + return False return var.persistable @@ -60,7 +63,12 @@ def _clone_var_in_block_(block, var): persistable=True) -def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): +def save_vars(executor, + dirname, + main_program=None, + vars=None, + predicate=None, + save_file_name=None): """ Save variables to directory by executor. 
@@ -69,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): :param main_program: program. If vars is None, then filter all variables in this program which fit `predicate`. Default default_main_program. :param predicate: The Predicate describes a callable that returns a variable - as a bool. If it returns true, the variables will be saved. - :param vars: variables need to be saved. If specify vars, program & predicate + as a bool. If it returns true, the corresponding input variable will be saved. + :param vars: variables need to be saved. If vars is specified, program & predicate will be ignored + :param save_file_name: The name of a single file that all vars are saved to. + If it is None, save variables to separate files. + :return: None """ if vars is None: @@ -83,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): save_vars( executor, dirname=dirname, - vars=filter(predicate, main_program.list_vars())) + vars=filter(predicate, main_program.list_vars()), + save_file_name=save_file_name) else: save_program = Program() save_block = save_program.global_block() + + save_var_map = {} for each_var in vars: new_var = _clone_var_in_block_(save_block, each_var) + if save_file_name is None: + save_block.append_op( + type='save', + inputs={'X': [new_var]}, + outputs={}, + attrs={'file_path': os.path.join(dirname, new_var.name)}) + else: + save_var_map[new_var.name] = new_var + + if save_file_name is not None: + save_var_list = [] + for name in sorted(save_var_map.keys()): + save_var_list.append(save_var_map[name]) + save_block.append_op( - type='save', - inputs={'X': [new_var]}, + type='save_combine', + inputs={'X': save_var_list}, outputs={}, - attrs={'file_path': os.path.join(dirname, new_var.name)}) + attrs={'file_path': os.path.join(dirname, save_file_name)}) + executor.run(save_program) -def save_params(executor, dirname, main_program=None): +def save_params(executor, dirname, 
main_program=None, save_file_name=None): """ Save all parameters to directory with executor. """ @@ -106,10 +135,12 @@ def save_params(executor, dirname, main_program=None): dirname=dirname, main_program=main_program, vars=None, - predicate=is_parameter) + predicate=is_parameter, + save_file_name=save_file_name) -def save_persistables(executor, dirname, main_program=None): +def save_persistables(executor, dirname, main_program=None, + save_file_name=None): """ Save all persistables to directory with executor. """ @@ -118,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None): dirname=dirname, main_program=main_program, vars=None, - predicate=is_persistable) + predicate=is_persistable, + save_file_name=save_file_name) -def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): +def load_vars(executor, + dirname, + main_program=None, + vars=None, + predicate=None, + load_file_name=None): """ Load variables from directory by executor. - :param executor: executor that save variable + :param executor: executor that load variable :param dirname: directory path :param main_program: program. If vars is None, then filter all variables in this program which fit `predicate`. Default default_main_program(). :param predicate: The Predicate describes a callable that returns a variable - as a bool. If it returns true, the variables will be loaded. - :param vars: variables need to be loaded. If specify vars, program & + as a bool. If it returns true, the corresponding input variable will be loaded. + :param vars: variables need to be loaded. If vars is specified, program & predicate will be ignored + :param load_file_name: The name of the single file that all vars are loaded from. + If it is None, load variables from separate files. 
+ :return: None """ if vars is None: @@ -144,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): load_vars( executor, dirname=dirname, - vars=filter(predicate, main_program.list_vars())) + vars=filter(predicate, main_program.list_vars()), + load_file_name=load_file_name) else: load_prog = Program() load_block = load_prog.global_block() + + load_var_map = {} for each_var in vars: assert isinstance(each_var, Variable) new_var = _clone_var_in_block_(load_block, each_var) + if load_file_name is None: + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [new_var]}, + attrs={'file_path': os.path.join(dirname, new_var.name)}) + else: + load_var_map[new_var.name] = new_var + + if load_file_name is not None: + load_var_list = [] + for name in sorted(load_var_map.keys()): + load_var_list.append(load_var_map[name]) + load_block.append_op( - type='load', + type='load_combine', inputs={}, - outputs={"Out": [new_var]}, - attrs={'file_path': os.path.join(dirname, new_var.name)}) + outputs={"Out": load_var_list}, + attrs={'file_path': os.path.join(dirname, load_file_name)}) executor.run(load_prog) -def load_params(executor, dirname, main_program=None): +def load_params(executor, dirname, main_program=None, load_file_name=None): """ load all parameters from directory by executor. """ @@ -168,10 +225,12 @@ def load_params(executor, dirname, main_program=None): executor, dirname=dirname, main_program=main_program, - predicate=is_parameter) + predicate=is_parameter, + load_file_name=load_file_name) -def load_persistables(executor, dirname, main_program=None): +def load_persistables(executor, dirname, main_program=None, + load_file_name=None): """ load all persistables from directory by executor. 
""" @@ -179,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None): executor, dirname=dirname, main_program=main_program, - predicate=is_persistable) + predicate=is_persistable, + load_file_name=load_file_name) def get_inference_program(target_vars, main_program=None): @@ -238,7 +298,8 @@ def save_inference_model(dirname, feeded_var_names, target_vars, executor, - main_program=None): + main_program=None, + save_file_name=None): """ Build a model especially for inference, and save it to directory by the executor. @@ -249,6 +310,8 @@ def save_inference_model(dirname, :param executor: executor that save inference model :param main_program: original program, which will be pruned to build the inference model. Default default_main_program(). + :param save_file_name: The name of a single file that all parameters are saved to. + If it is None, save parameters to separate files. :return: None """ @@ -283,25 +346,7 @@ def save_inference_model(dirname, with open(model_file_name, "wb") as f: f.write(inference_program.desc.serialize_to_string()) - save_params(executor, dirname, main_program) - - -def load_persistables_if_exist(executor, dirname, main_program=None): - filenames = next(os.walk(dirname))[2] - filenames = set(filenames) - - def _is_presistable_and_exist_(var): - if not is_persistable(var): - return False - else: - return var.name in filenames - - load_vars( - executor, - dirname, - main_program=main_program, - vars=None, - predicate=_is_presistable_and_exist_) + save_persistables(executor, dirname, inference_program, save_file_name) def get_feed_targets_names(program): @@ -322,13 +367,15 @@ def get_fetch_targets_names(program): return fetch_targets_names -def load_inference_model(dirname, executor): +def load_inference_model(dirname, executor, load_file_name=None): """ Load inference model from a directory :param dirname: directory path :param executor: executor that load inference model - + :param load_file_name: The name of the single file that 
all parameters are loaded from. + If it is None, load parameters from separate files. + :return: [program, feed_target_names, fetch_targets] program: program especially for inference. feed_target_names: Names of variables that need to feed data @@ -342,7 +389,7 @@ def load_inference_model(dirname, executor): program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) - load_persistables_if_exist(executor, dirname, program) + load_persistables(executor, dirname, program, load_file_name) feed_target_names = get_feed_targets_names(program) fetch_target_names = get_fetch_targets_names(program) @@ -359,6 +406,7 @@ def get_parameter_value(para, executor): :param executor: executor for retrieving the value :param para: the given parameter + :return: the LoDTensor for the parameter """ assert is_parameter(para) @@ -377,6 +425,7 @@ def get_parameter_value_by_name(name, executor, program=None): :param name: the name of the parameter :param program: the program where the variable is found Default default_main_program(). 
+ :return: the LoDTensor for the variable """ if program is None: diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py index b7b2cf2296cc8868dd0b5eb6cd6d58b9ae795d5d..85e44a0e5149bd36f2787d9f2d516dbe4abdbb2e 100644 --- a/python/paddle/v2/fluid/layers/io.py +++ b/python/paddle/v2/fluid/layers/io.py @@ -108,7 +108,7 @@ class ListenAndServ(object): """ def __init__(self, endpoint, fan_in=1, optimizer_mode=True): - self.helper = LayerHelper("recv") + self.helper = LayerHelper("listen_and_serv") self.inputs = [] self.outputs = [] self.endpoint = endpoint @@ -158,7 +158,7 @@ class ListenAndServ(object): param_names = [p.name for p in params] grad_names = [g.name for g in grads] parent_block.append_op( - type='recv', + type='listen_and_serv', inputs={}, outputs={}, attrs={ @@ -196,3 +196,31 @@ def Send(endpoints, send_vars, get_vars): outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) + + +def Recv(endpoints, get_vars): + """ + Recv layer + + Args: + endpoints: comma seperated IP:PORT pairs in the order + of send_vars to send + send_vars: vars to send + get_vars: vars to get from server after send completes. + + Send variables to the server side, and get vars from server + side when server have finished running server side program. 
+ """ + assert (type(send_vars) == list) + assert (type(get_vars) == list) + + epmap = endpoints.split(",") + endpoints = list(set(epmap)) + + helper = LayerHelper("Recv", **locals()) + helper.append_op( + type="recv", + inputs={"X": get_vars}, + outputs={"Out": get_vars}, + attrs={"endpoints": endpoints, + "epmap": epmap}) diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py index f359e70126f7601b75261e795b5a37bdc241112e..79a130a3eb148e6c5a8fa3cdf174780b354c23c9 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/v2/fluid/layers/math_op_patch.py @@ -145,7 +145,9 @@ def monkey_patch_variable(): # a*b == b*a. Do not need to reverse explicitly ("__rmul__", "elementwise_mul", False), ("__div__", "elementwise_div", False), - ("__rdiv__", "elementwise_div", True)): + ("__rdiv__", "elementwise_div", True), + ("__pow__", "elementwise_pow", False), + ("__rpow__", "elementwise_pow", True)): setattr(Variable, method_name, _elemwise_method_creator_(method_name, op_type, reverse)) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index d11dccfd22124d58d8634c01a00527c373b92f00..a79479f469a0c489edf2676bc5d07066bb480664 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -847,7 +847,35 @@ def cos_sim(X, Y, **kwargs): return out -def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): +def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs): + """ + Computes dropout. + + Drop or keep each element of `x` independently. Dropout is a regularization + technique for reducing overfitting by preventing neuron co-adaption during + training. The dropout operator randomly set (according to the given dropout + probability) the outputs of some units to zero, while others are remain + unchanged. + + Args: + x(variable): The input tensor. + dropout_prob(float): Probability of setting units to zero. 
+ is_test(bool): A flag indicating whether it is in test phase or not. + seed(int): A Python integer used to create random seeds. If this + parameter is set to None, a random seed is used. + NOTE: If an integer seed is given, always the same output + units will be dropped. DO NOT use a fixed seed in training. + + Returns: + Variable: A tensor variable. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + dropped = fluid.layers.dropout(x=x, dropout_prob=0.5) + """ + helper = LayerHelper('dropout', **kwargs) out = helper.create_tmp_variable(dtype=x.dtype) mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) @@ -856,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): inputs={'X': [x]}, outputs={'Out': [out], 'Mask': [mask]}, - attrs={'dropout_prob': dropout_prob, - 'is_test': is_test, - 'seed': seed}) + attrs={ + 'dropout_prob': dropout_prob, + 'is_test': is_test, + 'fix_seed': seed is not None, + 'seed': seed if seed is not None else 0 + }) return out @@ -1200,10 +1231,17 @@ def conv2d(input, """ if stride is None: stride = [1, 1] - helper = LayerHelper('conv2d', **locals()) - dtype = helper.input_dtype() num_channels = input.shape[1] + + l_type = 'conv2d' + if (num_channels == groups and num_filters % num_channels == 0 and + not use_cudnn): + l_type = 'depthwise_conv2d' + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + if groups is None: num_filter_channels = num_channels else: @@ -1236,7 +1274,7 @@ def conv2d(input, pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type='conv2d', + type=l_type, inputs={ 'Input': input, 'Filter': filter_param, @@ -1447,7 +1485,9 @@ def batch_norm(input, param_attr=None, bias_attr=None, data_layout='NCHW', - name=None): + name=None, + moving_mean_name=None, + moving_variance_name=None): """ This function helps create an operator to implement the BatchNorm layer using the configurations
from the input parameters. @@ -1477,6 +1517,7 @@ def batch_norm(input, attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) mean = helper.create_global_variable( + name=moving_mean_name, dtype=input.dtype, shape=param_shape, persistable=True, @@ -1484,6 +1525,7 @@ def batch_norm(input, helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) variance = helper.create_global_variable( + name=moving_variance_name, dtype=input.dtype, shape=param_shape, persistable=True, diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index 022a94cad440f13383a927233195bb008a688843..c701e79ad266d996038c6868718106664e1009b5 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -56,8 +56,10 @@ __all__ = [ 'elementwise_mul', 'elementwise_max', 'elementwise_min', + 'elementwise_pow', 'clip', 'clip_by_norm', + 'softmax', 'sequence_softmax', ] + __activations__ diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index 6e7d09459c07c77a8579300a1c67ae36dc3d2ba2..8460af2a08a54c5f241553757296ed2bf02f0167 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable +from ..initializer import Constant from ..core import DataType import numpy __all__ = [ 'create_tensor', 'create_parameter', + 'create_global_var', 'cast', 'concat', 'sums', @@ -58,13 +60,22 @@ def create_parameter(shape, Returns: Parameter: the created parameter """ - helper = LayerHelper("create_parameter") + helper = LayerHelper("create_parameter", **locals()) if attr is None: attr = ParamAttr() return helper.create_parameter(attr, shape, dtype, is_bias, default_initializer) +def create_global_var(shape, value, dtype, persistable=False, name=None): + helper = 
LayerHelper("global_var", **locals()) + var = helper.create_global_variable( + dtype=dtype, shape=shape, persistable=persistable, name=name) + helper.set_variable_initializer( + var, initializer=Constant(value=float(value))) + return var + + def cast(x, dtype): """ This function takes in the input with input_dtype @@ -284,7 +295,7 @@ def fill_constant_batch_size_like(input, return out -def ones(shape, dtype): +def ones(shape, dtype, force_cpu=False): """ **ones** @@ -308,7 +319,7 @@ def ones(shape, dtype): return fill_constant(value=1.0, **locals()) -def zeros(shape, dtype): +def zeros(shape, dtype, force_cpu=False): """ **zeros** diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py new file mode 100644 index 0000000000000000000000000000000000000000..96b3e9a0d73cede5d6e36308a53ab8927a95a6da --- /dev/null +++ b/python/paddle/v2/fluid/learning_rate_decay.py @@ -0,0 +1,125 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import layers +from framework import Variable + +__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay'] +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. 
+""" + + +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + """Applies exponential decay to the learning rate. + + ```python + decayed_learning_rate = learning_rate * + decay_rate ^ (global_step / decay_steps) + ``` + Args: + learning_rate: A scalar float32 value or a Variable. This + will be the initial learning rate during training + global_step: A Variable that records the training step. + decay_steps: A Python `int32` number. + decay_rate: A Python `float` number. + staircase: Boolean. If set true, decay the learning rate every decay_steps. + + Returns: + The decayed learning rate + """ + if not isinstance(global_step, Variable): + raise ValueError("global_step is required for exponential_decay.") + + # update learning_rate + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + return learning_rate * (decay_rate**div_res) + + +def natural_exp_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + """Applies natural exponential decay to the initial learning rate. + + ```python + if not staircase: + decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) + else: + decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps)) + ``` + Args: + learning_rate: A scalar float32 value or a Variable. This + will be the initial learning rate during training + global_step: A Variable that records the training step. + decay_steps: A Python `int32` number. + decay_rate: A Python `float` number. + staircase: Boolean. If set true, decay the learning rate every decay_steps.
+ + Returns: + The decayed learning rate + """ + if not isinstance(global_step, Variable): + raise ValueError("global_step is required for natural_exp_decay.") + + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + return learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + + +def inverse_time_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + """Applies inverse time decay to the initial learning rate. + + ```python + if staircase: + decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_steps)) + else: + decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_steps) + ``` + Args: + learning_rate: A scalar float32 value or a Variable. This + will be the initial learning rate during training + global_step: A Variable that records the training step. + decay_steps: A Python `int32` number. + decay_rate: A Python `float` number. + staircase: Boolean. If set true, decay the learning rate every decay_steps.
+ + Returns: + The decayed learning rate + """ + if not isinstance(global_step, Variable): + raise ValueError("global_step is required for inverse_time_decay.") + + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + + return learning_rate / (1 + decay_rate * div_res) diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py index 956c5b66da28fd8e74d4fd12f249688daa72d8ac..2b00923f5e85e6ba8fcdedebf5bbbc29403472c6 100644 --- a/python/paddle/v2/fluid/memory_optimization_transpiler.py +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -31,7 +31,7 @@ dtype_to_size = { class ControlFlowGraph(object): - def __init__(self, Program, ops, forward_num): + def __init__(self, Program, ops, forward_num, skip_opt): self._program = Program self._ops = ops self._forward_num = forward_num @@ -41,6 +41,7 @@ class ControlFlowGraph(object): self._defs = defaultdict(set) self._live_in = defaultdict(set) self._live_out = defaultdict(set) + self._skip_opt = skip_opt def _add_connections(self, connections): for node1, node2 in connections: @@ -130,6 +131,10 @@ class ControlFlowGraph(object): block_desc, x, is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: return False + if x in self._skip_opt: + return False + if not self._find_var(block_desc, x, is_forward).shape(): + return False return True self._build_graph() @@ -140,6 +145,7 @@ class ControlFlowGraph(object): if op.type() == "while" or op.type() == "while_grad": continue block_desc = op.block() + self.current_block_desc = block_desc is_forward = i < self._forward_num if self.pool: defs_can_optimize = filter( @@ -197,28 +203,32 @@ def get_cfgs(input_program): block_desc = pdesc.block(0) op_size = block_desc.op_size() # Get global block ops - ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size)) + ops_list.append( + ([block_desc.op(i) for i in range(op_size)], op_size, set())) 
while_sub_block_ids = [] while_grad_sub_block_ids = [] - while_pair = [] + while_op_output = set() + while_block_id_pair = [] for i in range(op_size): op = block_desc.op(i) if op.type() == "while": while_sub_block_ids.append(op.attr("sub_block").id) + while_op_output.update(op.output_arg_names()) elif op.type() == "while_grad": while_grad_sub_block_ids.append(op.attr("sub_block").id) + while_op_output.update(op.output_arg_names()) # Find while/while_grad block pair for grad_id in while_grad_sub_block_ids: parent_id = pdesc.block(grad_id).parent if parent_id in while_sub_block_ids: - while_pair.append((parent_id, grad_id)) + while_block_id_pair.append((parent_id, grad_id)) while_sub_block_ids.remove(parent_id) # Get while/while_grad block ops - for parent_id, grad_id in while_pair: + for parent_id, grad_id in while_block_id_pair: while_block_ops = [] while_block = pdesc.block(parent_id) while_block_op_size = while_block.op_size() @@ -230,7 +240,7 @@ def get_cfgs(input_program): for i in range(while_grad_block_op_size): while_block_ops.append(while_grad_block.op(i)) - ops_list.append((while_block_ops, while_block_op_size)) + ops_list.append((while_block_ops, while_block_op_size, while_op_output)) # Process rest while block ops for parent_id in while_sub_block_ids: @@ -242,7 +252,7 @@ def get_cfgs(input_program): ops_list.append((while_block_ops, while_block_op_size)) - cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list] + cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list] return cfgs diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 0c3533b892176edd5dfd111fdd771cc17d468168..7844a4e2df1ce3989e48082f6472292560fbf1ee 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -15,6 +15,7 @@ from collections import defaultdict import framework +import layers from backward import append_backward from framework import unique_name, program_guard from 
initializer import Constant @@ -33,9 +34,11 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, global_step=None, regularization=None): + def __init__(self, learning_rate, global_step=None, regularization=None): + assert learning_rate is not None self._global_step = global_step self.regularization = regularization + self._global_learning_rate = learning_rate # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -43,6 +46,28 @@ class Optimizer(object): self._accumulators = defaultdict(lambda: dict()) self.helper = None + def _create_global_learning_rate(self): + if isinstance(self._global_learning_rate, float): + self._global_learning_rate = layers.create_global_var( + name=unique_name("learning_rate"), + shape=[1], + value=float(self._global_learning_rate), + dtype='float32', + persistable=True) + + if not isinstance(self._global_learning_rate, framework.Variable): + raise ValueError("learning rate should be a Variable, " + "actual type is %s", + type(self._global_learning_rate)) + + @property + def global_learning_rate(self): + """ + get global decayed learning rate + :return: + """ + return self._global_learning_rate + def _append_optimize_op(self, block, param_and_grad): """ append optimize operator to block and return all the added optimize_op """ @@ -52,17 +77,7 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - param_lr_shape = [1] - param_lr_var = self.helper.create_global_variable( - name=unique_name("learning_rate"), - dtype='float32', - shape=param_lr_shape, - lod_level=1, - persistable=True) - param_lr = param_lr * self._learning_rate - self.helper.set_variable_initializer( - var=param_lr_var, initializer=Constant(param_lr)) - return param_lr_var + return 
self._global_learning_rate * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -163,7 +178,7 @@ class Optimizer(object): optimization. This will include parameter update ops, global step update ops and any other custom ops required by subclasses to manage their internal state. - :param startup_program: + :param startup_program: """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that @@ -178,6 +193,7 @@ class Optimizer(object): self.helper = LayerHelper(self.__class__.__name__) self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) + self._create_global_learning_rate() optimize_ops = [] for param_and_grad in parameters_and_grads: @@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer): def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(**kwargs) + super(SGDOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "sgd" - self._learning_rate = learning_rate def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer): def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(**kwargs) + super(MomentumOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "momentum" - self._learning_rate = learning_rate self._momentum = momentum self._use_nesterov = bool(use_nesterov) @@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer): def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(**kwargs) + super(AdagradOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = 
"adagrad" - self._learning_rate = learning_rate self._epsilon = epsilon def _create_accumulators(self, block, parameters): @@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(**kwargs) + super(AdamOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "adam" - self._learning_rate = learning_rate self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon @@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__(**kwargs) + super(AdamaxOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "adamax" - self._learning_rate = learning_rate self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon @@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer): assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(**kwargs) + super(DecayedAdagradOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "decayed_adagrad" - self._learning_rate = learning_rate self._decay = decay self._epsilon = epsilon diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index 51c1c8aa705513825b46fb936c6c99090c50fb7d..d33a4c52a8873b1e376eb2077014130bdcad2e12 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid.core as core +import core from contextlib import contextmanager import os -__all__ = ['CudaProfiler'] +__all__ = ['cuda_profiler', 'reset_profiler', 'profiler'] NVPROF_CONFIG = [ "gpustarttimestamp", @@ -103,10 +103,10 @@ def profiler(state, sorted_key=None): core.enable_profiler(prof_state) yield - if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']: - raise ValueError("The state must be in 'calls', 'total', " - "'max', 'min', 'ave'") sorted_key = 'default' if sorted_key is None else sorted_key + if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']: + raise ValueError("The sorted_key must be None or in 'calls', 'total', " + "'max', 'min' and 'ave'") key_map = { 'default': core.EventSortingKey.kDefault, 'calls': core.EventSortingKey.kCalls, diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt index 628ce60b406d880d961d705a6abd2b5236fb1c8c..26a80abcb5839e80b5a22f9415315519ce3042e8 100644 --- a/python/paddle/v2/fluid/tests/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/CMakeLists.txt @@ -5,9 +5,11 @@ if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) endif(NOT WITH_DISTRIBUTE) +list(REMOVE_ITEM TEST_OPS test_warpctc_op) foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() +py_test(test_warpctc_op SRCS test_warpctc_op.py ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR}) add_subdirectory(book) add_subdirectory(book_distribute) diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f0b574b9396706a1d68393482296360362dca750 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/.gitignore @@ -0,0 +1 @@ +recognize_digits_*.inference.model diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt index 
dda02c03fd531445c1b33b39a6ded10921991d9c..673c965b662a022739f8d489c331f4de9455a926 100644 --- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt @@ -1,34 +1,6 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits) -py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet) -py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg) -py_test(test_recognize_digits_mlp_cpu - SRCS test_recognize_digits.py - ARGS mlp) -py_test(test_recognize_digits_mlp_cuda - SRCS test_recognize_digits.py - ARGS mlp --use_cuda) -py_test(test_recognize_digits_conv_cpu - SRCS test_recognize_digits.py - ARGS conv) -py_test(test_recognize_digits_conv_cuda - SRCS test_recognize_digits.py - ARGS conv --use_cuda) -py_test(test_recognize_digits_mlp_cpu_parallel - SRCS test_recognize_digits.py - ARGS mlp --parallel) -py_test(test_recognize_digits_mlp_cuda_parallel - SRCS test_recognize_digits.py - ARGS mlp --use_cuda --parallel) -py_test(test_recognize_digits_conv_cpu_parallel - SRCS test_recognize_digits.py - ARGS conv --parallel) -py_test(test_recognize_digits_conv_cuda_parallel - SRCS test_recognize_digits.py - ARGS conv --use_cuda --parallel) - # default test foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 0b954c60b6bc2d721c0373243e747056f8f572cf..27f34b17339db31ef3c07555db946fa76d6f1922 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -12,44 +12,74 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid +import contextlib +import unittest -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = fluid.layers.fc(input=x, size=1, act=None) +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return -y = fluid.layers.data(name='y', shape=[1], dtype='float32') + x = fluid.layers.data(name='x', shape=[13], dtype='float32') -cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(x=cost) + y_predict = fluid.layers.fc(input=x, size=1, act=None) -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -sgd_optimizer.minimize(avg_cost) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') -BATCH_SIZE = 20 + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(x=cost) -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) + BATCH_SIZE = 20 -exe.run(fluid.default_startup_program()) + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) -PASS_NUM = 100 -for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print(avg_loss_value) - if avg_loss_value[0] < 10.0: - exit(0) # if avg cost less than 10.0, we think our code is good. 
-exit(1) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print(avg_loss_value) + if avg_loss_value[0] < 10.0: + return + raise AssertionError("Fit a line cost is too large, {0:2.2}".format( + avg_loss_value[0])) + + +class TestFitALine(unittest.TestCase): + def test_cpu(self): + with self.program_scope_guard(): + main(use_cuda=False) + + def test_cuda(self): + with self.program_scope_guard(): + main(use_cuda=True) + + @contextlib.contextmanager + def program_scope_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index 30582a21d0a5eeab125f3a2764b45b51aa4f94b6..a4168d16db06f904faed811fdda3f0fe52f0b27b 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -14,10 +14,10 @@ from __future__ import print_function -import sys - import paddle.v2 as paddle import paddle.v2.fluid as fluid +import unittest +import contextlib def resnet_cifar10(input, depth=32): @@ -89,56 +89,89 @@ def vgg16_bn_drop(input): return fc2 -classdim = 10 -data_shape = [3, 32, 32] - -images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') -label = fluid.layers.data(name='label', shape=[1], 
dtype='int64') - -net_type = "vgg" -if len(sys.argv) >= 2: - net_type = sys.argv[1] - -if net_type == "vgg": - print("train vgg net") - net = vgg16_bn_drop(images) -elif net_type == "resnet": - print("train resnet") - net = resnet_cifar10(images, 32) -else: - raise ValueError("%s network is not supported" % net_type) - -predict = fluid.layers.fc(input=net, size=classdim, act='softmax') -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) - -optimizer = fluid.optimizer.Adam(learning_rate=0.001) -opts = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -BATCH_SIZE = 128 -PASS_NUM = 1 - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) -exe.run(fluid.default_startup_program()) - -for pass_id in range(PASS_NUM): - accuracy.reset(exe) - for data in train_reader(): - loss, acc = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( - pass_acc)) - # this model is slow, so if we can train two mini batch, we think it works properly. 
- exit(0) -exit(1) +def main(net_type, use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + classdim = 10 + data_shape = [3, 32, 32] + + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if net_type == "vgg": + print("train vgg net") + net = vgg16_bn_drop(images) + elif net_type == "resnet": + print("train resnet") + net = resnet_cifar10(images, 32) + else: + raise ValueError("%s network is not supported" % net_type) + + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_cost) + + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + + BATCH_SIZE = 128 + PASS_NUM = 1 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) + exe.run(fluid.default_startup_program()) + + loss = 0.0 + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( + pass_acc)) + return + + raise AssertionError( + "Image classification loss is too large, {0:2.2}".format(loss)) + + +class TestImageClassification(unittest.TestCase): + def test_vgg_cuda(self): + with self.scope_prog_guard(): + main('vgg', use_cuda=True) + + def test_resnet_cuda(self): + with self.scope_prog_guard(): + main('resnet', use_cuda=True) + + def test_vgg_cpu(self): + with self.scope_prog_guard(): + 
main('vgg', use_cuda=False) + + def test_resnet_cpu(self): + with self.scope_prog_guard(): + main('resnet', use_cuda=False) + + @contextlib.contextmanager + def scope_prog_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 1a342bf1fbbc0e5f4e3c7d440424b66c4b9f732f..f85768de99adb8b5005b23278ad807a24c5bff65 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -175,7 +175,7 @@ def main(): paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) - #place = fluid.CPUPlace() + # place = fluid.CPUPlace() place = fluid.CUDAPlace(0) feeder = fluid.DataFeeder( feed_list=[ diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 82b760d693560dae1ab1fa39afdc186f60423e65..5716ddd3dda90958ad1008679e018542c4fb73d7 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as pd from paddle.v2.fluid.executor import Executor +import unittest dict_size = 30000 source_dict_dim = target_dict_dim = dict_size -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) hidden_dim = 32 word_dim = 16 -IS_SPARSE = True batch_size = 2 max_length = 8 topk_size = 50 @@ -34,10 +33,8 @@ beam_size = 2 decoder_size = hidden_dim -place = core.CPUPlace() - -def encoder(): +def encoder(is_sparse): # encoder src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) @@ -45,7 +42,7 @@ def encoder(): input=src_word_id, size=[dict_size, word_dim], dtype='float32', - is_sparse=IS_SPARSE, + is_sparse=is_sparse, param_attr=fluid.ParamAttr(name='vemb')) fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') @@ -54,7 +51,7 @@ def encoder(): return encoder_out -def decoder_train(context): +def decoder_train(context, is_sparse): # decoder trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) @@ -62,7 +59,7 @@ def decoder_train(context): input=trg_language_word, size=[dict_size, word_dim], dtype='float32', - is_sparse=IS_SPARSE, + is_sparse=is_sparse, param_attr=fluid.ParamAttr(name='vemb')) rnn = pd.DynamicRNN() @@ -82,10 +79,10 @@ def decoder_train(context): return rnn() -def decoder_decode(context): +def decoder_decode(context, is_sparse): init_state = context array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) - counter = pd.zeros(shape=[1], dtype='int64') + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = pd.create_array('float32') @@ -117,7 +114,7 @@ def decoder_decode(context): input=pre_ids, size=[dict_size, word_dim], dtype='float32', - is_sparse=IS_SPARSE) + is_sparse=is_sparse) # use 
rnn unit to update rnn current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], @@ -150,7 +147,7 @@ def decoder_decode(context): def set_init_lod(data, lod, place): - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(data, place) res.set_lod(lod) return res @@ -165,15 +162,19 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res -def train_main(): - context = encoder() - rnn_out = decoder_train(context) +def train_main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder(is_sparse) + rnn_out = decoder_train(context, is_sparse) label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) cost = pd.cross_entropy(input=rnn_out, label=label) @@ -212,9 +213,13 @@ def train_main(): batch_id += 1 -def decode_main(): - context = encoder() - translation_ids, translation_scores = decoder_decode(context) +def decode_main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder(is_sparse) + translation_ids, translation_scores = decoder_decode(context, is_sparse) exe = Executor(place) exe.run(framework.default_startup_program()) @@ -250,6 +255,60 @@ def decode_main(): break +class TestMachineTranslation(unittest.TestCase): + pass + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +def inject_test_train(use_cuda, is_sparse): + f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 
'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + train_main(use_cuda, is_sparse) + + setattr(TestMachineTranslation, f_name, f) + + +def inject_test_decode(use_cuda, is_sparse, decorator=None): + f_name = 'test_{0}_{1}_decode'.format('cuda' + if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + decode_main(use_cuda, is_sparse) + + if decorator is not None: + f = decorator(f) + + setattr(TestMachineTranslation, f_name, f) + + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + inject_test_train(_use_cuda_, _is_sparse_) + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + + _decorator_ = None + if _use_cuda_: + _decorator_ = unittest.skip( + reason='Beam Search does not support CUDA!') + + inject_test_decode( + is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_) + if __name__ == '__main__': - # train_main() - decode_main() + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py index ac7ef4046f9ff55c2cbfc28b50784b9bffb80d53..b8f55c813b6984a5a1d266acc1c159c45c23b665 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid import paddle.v2 as paddle import sys import numpy +import unittest def parse_arg(): @@ -45,8 +46,9 @@ BATCH_SIZE = 64 def loss_net(hidden, label): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - return fluid.layers.mean(x=loss), fluid.layers.accuracy( - input=prediction, label=label) + avg_loss = fluid.layers.mean(x=loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + return prediction, avg_loss, acc def mlp(img, label): @@ -73,25 +75,25 @@ def conv_net(img, label): return 
loss_net(conv_pool_2, label) -def main(): - args = parse_arg() - print("recognize digits with args: {0}".format(" ".join(sys.argv[1:]))) - +def train(nn_type, use_cuda, parallel, save_dirname): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - if args.nn_type == 'mlp': + if nn_type == 'mlp': net_conf = mlp else: net_conf = conv_net - if args.parallel: + if parallel: places = fluid.layers.get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): img_ = pd.read_input(img) label_ = pd.read_input(label) - for o in net_conf(img_, label_): + prediction, avg_loss, acc = net_conf(img_, label_) + for o in [avg_loss, acc]: pd.write_output(o) avg_loss, acc = pd() @@ -99,14 +101,14 @@ def main(): avg_loss = fluid.layers.mean(x=avg_loss) acc = fluid.layers.mean(x=acc) else: - avg_loss, acc = net_conf(img, label) + prediction, avg_loss, acc = net_conf(img, label) test_program = fluid.default_main_program().clone() optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer.minimize(avg_loss) - place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -137,13 +139,85 @@ def main(): acc_val = numpy.array(acc_set).mean() avg_loss_val = numpy.array(avg_loss_set).mean() if float(acc_val) > 0.85: # test acc > 85% - exit(0) + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ["img"], + [prediction], exe) + return else: print( 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. 
format(pass_id, batch_id + 1, float(avg_loss_val), float(acc_val))) + raise AssertionError("Loss of recognize digits is too large") + + +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension of conv should be 4-D or 5-D. + tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32") + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + print("infer results: ", results[0]) + + +def main(use_cuda, parallel, nn_type): + if not use_cuda and not parallel: + save_dirname = "recognize_digits_" + nn_type + ".inference.model" + else: + save_dirname = None + + train( + nn_type=nn_type, + use_cuda=use_cuda, + parallel=parallel, + save_dirname=save_dirname) + infer(use_cuda=use_cuda, save_dirname=save_dirname) + + +class TestRecognizeDigits(unittest.TestCase): + pass + + +def inject_test_method(use_cuda, parallel, nn_type): + def __impl__(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + main(use_cuda, parallel, nn_type) + + fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda' + if use_cuda else 'cpu', 'parallel' + if parallel else 'normal') + + setattr(TestRecognizeDigits, fn, __impl__) + + +def inject_all_tests(): + for use_cuda 
in (False, True): + for parallel in (False, True): + for nn_type in ('mlp', 'conv'): + inject_test_method(use_cuda, parallel, nn_type) + +inject_all_tests() if __name__ == '__main__': - main() + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py similarity index 52% rename from python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py rename to python/paddle/v2/fluid/tests/book/test_understand_sentiment.py index 529223eba8af6d968b490068f34559880312515d..2ba9077a26202b1c16cc480823115f7ad55c2c67 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np -import paddle.v2 as paddle +import unittest import paddle.v2.fluid as fluid +import paddle.v2 as paddle +import contextlib + + +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, accuracy def stacked_lstm_net(data, @@ -51,63 +78,77 @@ def stacked_lstm_net(data, avg_cost = fluid.layers.mean(x=cost) adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer.minimize(avg_cost) - accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) - return avg_cost, accuracy, accuracy.metrics[0] - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - BATCH_SIZE = 100 - PASS_NUM = 5 + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, accuracy - word_dict = paddle.dataset.imdb.word_dict() - print "load word dict successfully" + +def main(word_dict, net_method, use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + 
return + + BATCH_SIZE = 128 + PASS_NUM = 5 dict_dim = len(word_dict) class_dim = 2 data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, accuracy, acc_out = stacked_lstm_net( + cost, acc_out = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=BATCH_SIZE) - place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): - accuracy.reset(exe) for data in train_data(): cost_val, acc_val = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[cost, acc_out]) - pass_acc = accuracy.eval(exe) - print("cost=" + str(cost_val) + " acc=" + str(acc_val) + - " pass_acc=" + str(pass_acc)) - if cost_val < 1.0 and acc_val > 0.8: - exit(0) - exit(1) + print("cost=" + str(cost_val) + " acc=" + str(acc_val)) + if cost_val < 0.4 and acc_val > 0.8: + return + raise AssertionError("Cost is too large for {0}".format( + net_method.__name__)) + + +class TestUnderstandSentiment(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.word_dict = paddle.dataset.imdb.word_dict() + + @contextlib.contextmanager + def new_program_scope(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + def test_conv_cpu(self): + with self.new_program_scope(): + main(self.word_dict, net_method=convolution_net, use_cuda=False) + + def test_stacked_lstm_cpu(self): + with self.new_program_scope(): + main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False) + + def test_conv_gpu(self): + with self.new_program_scope(): + 
main(self.word_dict, net_method=convolution_net, use_cuda=True) + + def test_stacked_lstm_gpu(self): + with self.new_program_scope(): + main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True) if __name__ == '__main__': - main() + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py deleted file mode 100644 index df27399dd215a579d7e3f8a1659180a06b1e7f64..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid - - -def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, - hid_dim=32): - emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=3, - act="tanh", - pool_type="sqrt") - conv_4 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=4, - act="tanh", - pool_type="sqrt") - prediction = fluid.layers.fc(input=[conv_3, conv_4], - size=class_dim, - act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) - adam_optimizer.minimize(avg_cost) - accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) - return avg_cost, accuracy, accuracy.metrics[0] - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - BATCH_SIZE = 100 - PASS_NUM = 5 - - word_dict = paddle.dataset.imdb.word_dict() - dict_dim = len(word_dict) - class_dim = 2 - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, accuracy, acc_out = convolution_net( - data, label, input_dim=dict_dim, class_dim=class_dim) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - 
- exe.run(fluid.default_startup_program()) - - for pass_id in xrange(PASS_NUM): - accuracy.reset(exe) - for data in train_data(): - cost_val, acc_val = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[cost, acc_out]) - pass_acc = accuracy.eval(exe) - print("cost=" + str(cost_val) + " acc=" + str(acc_val) + - " pass_acc=" + str(pass_acc)) - if cost_val < 1.0 and pass_acc > 0.8: - exit(0) - exit(1) - - -if __name__ == '__main__': - main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py deleted file mode 100644 index 117f74c59ad5bf6bb67711801cd7b9a41f39f1f8..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -from paddle.v2.fluid.layer_helper import LayerHelper - - -def lstm(x, c_pre_init, hidden_dim, forget_bias=None): - """ - This function helps create an operator for the LSTM (Long Short Term - Memory) cell that can be used inside an RNN. 
- """ - helper = LayerHelper('lstm_unit', **locals()) - rnn = fluid.layers.StaticRNN() - with rnn.step(): - c_pre = rnn.memory(init=c_pre_init) - x_t = rnn.step_input(x) - - before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1) - after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4) - - dtype = x.dtype - c = helper.create_tmp_variable(dtype) - h = helper.create_tmp_variable(dtype) - - helper.append_op( - type='lstm_unit', - inputs={"X": after_fc, - "C_prev": c_pre}, - outputs={"C": c, - "H": h}, - attrs={"forget_bias": forget_bias}) - - rnn.update_memory(c_pre, c) - rnn.output(h) - - return rnn() - - -def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): - data = fluid.layers.data( - name="words", - shape=[seq_len * batch_size, 1], - append_batch_size=False, - dtype="int64", - lod_level=1) - label = fluid.layers.data( - name="label", - shape=[batch_size, 1], - append_batch_size=False, - dtype="int64") - - emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) - emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim]) - emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2]) - - c_pre_init = fluid.layers.fill_constant( - dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0) - c_pre_init.stop_gradient = False - layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) - layer_1_out = fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2]) - - prediction = fluid.layers.fc(input=layer_1_out, - size=class_dim, - act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - - avg_cost = fluid.layers.mean(x=cost) - adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) - adam_optimizer.minimize(avg_cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = 
np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def chop_data(data, chop_len=80, batch_size=50): - data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len] - - return data[:batch_size] - - -def prepare_feed_data(data, place): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([len(label), 1]) - tensor_label = fluid.LoDTensor() - tensor_label.set(label, place) - - return tensor_words, tensor_label - - -def main(): - BATCH_SIZE = 100 - PASS_NUM = 5 - - word_dict = paddle.dataset.imdb.word_dict() - print "load word dict successfully" - dict_dim = len(word_dict) - class_dim = 2 - - cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - exe.run(fluid.default_startup_program()) - - for pass_id in xrange(PASS_NUM): - for data in train_data(): - chopped_data = chop_data(data) - tensor_words, tensor_label = prepare_feed_data(chopped_data, place) - - outs = exe.run(fluid.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) - - print("cost=" + str(cost_val) + " acc=" + str(acc_val)) - if acc_val > 0.7: - exit(0) - exit(1) - - -if __name__ == '__main__': - main() diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 8cf54846fe5dba2742ce69e34e0788e124a1a85d..766ba9681d1bb816170e0458f540b32511c02933 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ 
b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -12,76 +12,145 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid +import unittest +import os -PASS_NUM = 100 -EMBED_SIZE = 32 -HIDDEN_SIZE = 256 -N = 5 -BATCH_SIZE = 32 -IS_SPARSE = True - -word_dict = paddle.dataset.imikolov.build_dict() -dict_size = len(word_dict) - -first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') -second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') -third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') -forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') -next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - -embed_first = fluid.layers.embedding( - input=first_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') -embed_second = fluid.layers.embedding( - input=second_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') -embed_third = fluid.layers.embedding( - input=third_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') -embed_forth = fluid.layers.embedding( - input=forth_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - -concat_embed = fluid.layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], axis=1) -hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') -predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') -cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) -avg_cost = fluid.layers.mean(x=cost) -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -sgd_optimizer.minimize(avg_cost) - -train_reader = paddle.batch( - 
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder( - feed_list=[first_word, second_word, third_word, forth_word, next_word], - place=place) - -exe.run(fluid.default_startup_program()) - -for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_cost_np = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - if avg_cost_np[0] < 5.0: - exit(0) # if avg cost less than 10.0, we think our code is good. -exit(1) + +def main(use_cuda, is_sparse, parallel): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + PASS_NUM = 100 + EMBED_SIZE = 32 + HIDDEN_SIZE = 256 + N = 5 + BATCH_SIZE = 32 + IS_SPARSE = is_sparse + + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + 
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + + if not parallel: + avg_cost = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + avg_cost = __network__( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, next_word + ])) + pd.write_output(avg_cost) + + avg_cost = fluid.layers.mean(x=pd()) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) + + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_cost_np = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + if avg_cost_np[0] < 5.0: + return + raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) + + +FULL_TEST = os.getenv('FULL_TEST', + '0').lower() in ['true', '1', 't', 'y', 'yes', 'on'] +SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster" + + +class W2VTest(unittest.TestCase): + pass + + +def inject_test_method(use_cuda, is_sparse, parallel): + fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse" + if is_sparse else "dense", "parallel" + if parallel else "normal") + + def __impl__(*args, **kwargs): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with 
fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel) + + if use_cuda and is_sparse and parallel: + fn = __impl__ + else: + # skip the other test when on CI server + fn = unittest.skipUnless( + condition=FULL_TEST, reason=SKIP_REASON)(__impl__) + + setattr(W2VTest, fn_name, fn) + + +for use_cuda in (False, True): + for is_sparse in (False, True): + for parallel in (False, True): + inject_test_method(use_cuda, is_sparse, parallel) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py index 74138298978c7c18936f53761b313887f07aea81..4943bbb3388c3a476596b2fd4dd28605ee7be9e0 100644 --- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py +++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py @@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod): return match_indices, match_dist -class TestBipartiteMatchOpForWithLoD(OpTest): +class TestBipartiteMatchOpWithLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' lod = [[0, 5, 11, 23]] @@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest): self.inputs = {'DistMat': (dist, lod)} self.outputs = { 'ColToRowMatchIndices': (match_indices), - 'ColToRowMatchDis': (match_dist), + 'ColToRowMatchDist': (match_dist), } def test_check_output(self): @@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.inputs = {'DistMat': dist} self.outputs = { 'ColToRowMatchIndices': match_indices, - 'ColToRowMatchDis': match_dist, + 'ColToRowMatchDist': match_dist, } def test_check_output(self): diff --git a/python/paddle/v2/fluid/tests/test_box_coder_op.py b/python/paddle/v2/fluid/tests/test_box_coder_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0dc18476fd5dce7cd293f6cb85f419be7d88ec95 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py @@ -0,0 +1,127 
@@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): + prior_box_x = ( + (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) + prior_box_y = ( + (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0]) + prior_box_width = ( + (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0]) + prior_box_height = ( + (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) + prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], + prior_box_var.shape[1]) + + if (code_type == "EncodeCenterSize"): + target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( + target_box.shape[0], 1) + target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape( + target_box.shape[0], 1) + target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape( + target_box.shape[0], 1) + target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( + target_box.shape[0], 1) + + output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ + prior_box_var[:,:,0] + output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \ + prior_box_var[:,:,1] + output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \ + prior_box_var[:,:,2] + output_box[:,:,3] 
= np.log(np.fabs(target_box_height / prior_box_height)) / \ + prior_box_var[:,:,3] + + elif (code_type == "DecodeCenterSize"): + target_box = target_box.reshape(target_box.shape[0], 1, + target_box.shape[1]) + target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \ + prior_box_width + prior_box_x + target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \ + prior_box_height + prior_box_y + target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \ + prior_box_width + target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \ + prior_box_height + output_box[:, :, 0] = target_box_x - target_box_width / 2 + output_box[:, :, 1] = target_box_y - target_box_height / 2 + output_box[:, :, 2] = target_box_x + target_box_width / 2 + output_box[:, :, 3] = target_box_y + target_box_height / 2 + + +def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): + n = target_box.shape[0] + m = prior_box.shape[0] + output_box = np.zeros((n, m, 4), dtype=np.float32) + for i in range(len(lod) - 1): + box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var, + output_box[lod[i]:lod[i + 1], :, :], code_type) + return output_box + + +class TestBoxCoderOp(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[0, 20]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.random.random((10, 4)).astype('float32') + target_box = np.random.random((20, 4)).astype('float32') + code_type = "DecodeCenterSize" + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': target_box, + } + self.attrs = {'code_type': 'decode_center_size'} + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithLoD(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = 
[[0, 4, 12, 20]] + prior_box = np.random.random((10, 4)).astype('float32') + prior_box_var = np.random.random((10, 4)).astype('float32') + target_box = np.random.random((20, 4)).astype('float32') + code_type = "EncodeCenterSize" + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': (target_box, lod), + } + self.attrs = {'code_type': 'encode_center_size'} + self.outputs = {'OutputBox': output_box} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 24de74d730eedbccb4837598bd6d2eb92da59e0d..7512ea333e37d5f4f0102531d8d13f8c2a744b8d 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1): self.op_type = "conv2d" +class TestDepthwiseConv(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + # cudnn v5 does not support dilation conv. 
# class TestCUDNNWithDilation(TestWithDilation): # def init_op_type(self): diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py index 107b9567dc4a8539532c2fff40df437cc72cc163..b0c55df9f58834688846c5362113464996eb286a 100644 --- a/python/paddle/v2/fluid/tests/test_dropout_op.py +++ b/python/paddle/v2/fluid/tests/test_dropout_op.py @@ -21,7 +21,7 @@ class TestDropoutOp(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 0.0, 'is_test': False} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} self.outputs = { 'Out': self.inputs['X'], 'Mask': np.ones((32, 64)).astype('float32') @@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 1.0, 'is_test': False} + self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False} self.outputs = { 'Out': np.zeros((32, 64)).astype('float32'), 'Mask': np.zeros((32, 64)).astype('float32') @@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} - self.attrs = {'dropout_prob': 0.0, 'is_test': False} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} self.outputs = { 'Out': self.inputs['X'], 'Mask': np.ones((32, 64, 2)).astype('float32') @@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 0.35, 'is_test': True} + self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True} self.outputs = { 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) } diff --git a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py 
b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e31749df9baf10215fcd0cca3c1097f00c163ec7 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import numpy as np +from op_test import OpTest + + +class TestElementwisePowOp(OpTest): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + +class TestElementwisePowOp_scalar(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype('float32'), + 'Y': np.random.rand(1).astype('float32') + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_label_smooth_op.py b/python/paddle/v2/fluid/tests/test_label_smooth_op.py new file mode 100644 index 0000000000000000000000000000000000000000..19a4df57446c0c83b415909df3e0246bf2716881 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_label_smooth_op.py @@ -0,0 +1,55 @@ +# Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestLabelSmoothOp(OpTest): + def config(self): + self.op_type = "label_smooth" + self.epsilon = 0.1 + batch_size, self.label_dim = 5, 10 + self.label = np.zeros((batch_size, self.label_dim)).astype("float64") + nonzero_index = np.random.randint(self.label_dim, size=(batch_size)) + self.label[np.arange(batch_size), nonzero_index] = 1 + + def setUp(self): + self.config() + smoothed_label = (1 - self.epsilon + ) * self.label + self.epsilon / self.label_dim + self.inputs = {'X': self.label} + self.attrs = {'epsilon': self.epsilon} + self.outputs = {'Out': smoothed_label} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp): + def setUp(self): + self.config() + dist = np.random.random((1, self.label_dim)) + smoothed_label = (1 - self.epsilon) * self.label + self.epsilon * dist + self.inputs = {'X': self.label, 'PriorDist': dist} + self.attrs = {'epsilon': self.epsilon} + self.outputs = {'Out': smoothed_label} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..68cf8673cd46677065588f652482cd0df08b3450 
--- /dev/null +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -0,0 +1,252 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import numpy as np + +from operator import mul +from op_test import OpTest +import paddle.v2.fluid.core as core +from paddle.v2.fluid.op import Operator +from paddle.v2.fluid.framework import grad_var_name + + +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + x.shape = [N, D] + + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = scale.reshape([1, D]) * np.divide( + (x - mean.reshape([N, 1])), + (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D]) + + x.shape, output.shape = x_shape, x_shape + return output, mean, var + + +def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): + x_shape = x.shape + scale_shape = scale.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + x.shape, grad_y.shape = [N, D], [N, D] + var.shape, mean.shape = [N, 1], [N, 1] + scale.shape = [1, D] + + # d_bias + d_bias = np.sum(grad_y, axis=0).reshape([1, D]) + # d_scale + d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, + axis=0).reshape([1, D]) + # dx + dx_end = scale * np.sqrt(1.0 / var) * grad_y + 
d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape( + [N, 1]) # the second part equals to zero. + d_mean = 1.0 / D * d_mean_0 + d_std = np.sum( + -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * ( + 1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = dx_end + d_mean + d_std + + grad_y.shape = x_shape + x.shape = x_shape + scale.shape = scale_shape + return grad_x, d_scale, d_bias + + +def get_backward_op(scope, op, no_grad_set): + backward_op = core.Operator.backward(op, no_grad_set) + for input in backward_op.input_vars(): + var = scope.var(input) + var.get_tensor() + for output in backward_op.output_vars(): + var = scope.var(output) + var.get_tensor() + return backward_op + + +def create_or_get_tensor(scope, var_name, var, place): + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) + tensor.set_lod([[]]) + tensor.set_dims(var.shape) + tensor.set(var, place) + return tensor + + +def set_output_grad(scope, outputs, place, feed_dict=None): + def __set_tensor__(name, data=None): + out_tensor = scope.find_var(name).get_tensor() + grad_tensor = scope.var(grad_var_name(name)).get_tensor() + out_dtype = out_tensor.dtype() + if data is None: + if out_dtype == core.DataType.FP64: + data = np.ones(out_tensor.shape(), dtype=np.float64) + elif out_dtype == core.DataType.FP32: + data = np.ones(out_tensor.shape(), dtype=np.float32) + else: + raise ValueError("Not supported data type " + str(out_dtype)) + grad_tensor.set(data, place) + + for output in outputs: + data = None + if output in feed_dict: + data = feed_dict[output] + __set_tensor__(output, data) + + +class TestLayerNormdOp(OpTest): + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue( + np.allclose( + np.array(tensor).reshape(np_array.shape), np_array, atol=atol), + msg) + + def __assert_grad_close(self, + tensor, + np_array, + name, + place, + max_relative_error=0.02): + a = 
np.array(tensor).reshape(np_array.shape) + b = np_array + abs_a = np.abs(a) + abs_a[abs_a < 1e-5] = 1 + + diff_mat = np.abs(a - b) / abs_a + max_diff = np.max(diff_mat) + + def err_msg(): + offset = np.argmax(diff_mat > max_relative_error) + return ("%s Variable %s max gradient diff %f over limit %f, " + "the first error element is %d, %f, %f") % ( + "Gradient Check On %s" % str(place), name, max_diff, + max_relative_error, offset, a.flatten()[offset], + b.flatten()[offset]) + + self.assertLessEqual(max_diff, max_relative_error, err_msg()) + + def check_forward_backward(self, shape, begin_norm_axis): + def test_with_place(place, shape, begin_norm_axis=1): + # setUp + assert begin_norm_axis > 0 and begin_norm_axis < len( + shape), 'begin_norm_axis must be between 0 and len(shape)-1.' + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] + np.random.random(123) + x_val = np.random.random_sample(x_shape).astype(np.float32) + scale_val = np.random.random_sample(scale_shape).astype(np.float32) + bias_val = np.random.random_sample(scale_shape).astype(np.float32) + y_grad = np.random.random_sample(x_shape).astype(np.float32) + + # run forward + y_out, saved_mean, var_ref = _reference_layer_norm_naive( + x_val, scale_val, bias_val, epsilon, begin_norm_axis) + naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref} + + # get gradient + x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad( + x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis) + naive_grad = { + "X": x_grad_ref, + "Scale": scale_grad_ref, + "Bias": bias_grad_ref + } + + scope = core.Scope() + + # create input + input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val} + for i_name in input_map: + create_or_get_tensor(scope, i_name, input_map[i_name], place) + + # create output + output_map = {"Y": None, "Mean": None, "Variance": None} + output_tensor = {} + for o_name in output_map: + 
output_tensor[o_name] = create_or_get_tensor( + scope, o_name, output_map[o_name], place) + + layer_norm_op = Operator( + "layer_norm", + # inputs + X="X", + Scale="Scale", + Bias="Bias", + # outputs + Y="Y", + Mean="Mean", + Variance="Variance", + # attrs + epsilon=epsilon, + begin_norm_axis=begin_norm_axis) + + layer_norm_op.run(scope, place) + + # check forward result + atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4 + for o_tensor in output_tensor: + self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor], + o_tensor, atol) + + # run backward + layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set()) + set_output_grad( + scope, ["Y", "Mean", "Variance"], + place, + feed_dict={"Y": y_grad}) + layer_norm_op_grad.run(scope, place) + + # get output + grad_tensor = {} + for o_name in naive_grad: + grad_tensor[o_name] = x_ = create_or_get_tensor( + scope, grad_var_name(o_name), None, place) + + # check gradient output + for o_grad in naive_grad: + self.__assert_grad_close(grad_tensor[o_grad], + naive_grad[o_grad], o_grad + "@GRAD", + place) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(core.CUDAPlace(0)) + + for place in places: + test_with_place(place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + + def test_check_forward_backward_with_scale(self): + pass # TODO(zcd) + + def test_check_forward_backward_with_bias(self): + pass # TODO(zcd) + + def test_check_forward_backward(self): + pass # TODO(zcd) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 3f54e28defb76d3430a82e791578e20b84833f16..aea43c2517a02c72c1ee3307afdd3b21910f0064 100644 --- 
a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -223,6 +223,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_softmax(x=seq)) print(str(program)) + def test_softmax(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[10], dtype='float32') + hid = layers.fc(input=data, size=20) + self.assertIsNotNone(layers.softmax(x=hid)) + print(str(program)) + def test_get_places(self): program = Program() with program_guard(program): diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py new file mode 100644 index 0000000000000000000000000000000000000000..dc348cf2d21693290095900f8ab63c29923b4673 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py @@ -0,0 +1,110 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import math +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.learning_rate_decay as lr_decay + + +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + exponent = float(global_step) / float(decay_steps) + if staircase: + exponent = math.floor(exponent) + return learning_rate * decay_rate**exponent + + +def natural_exp_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + exponent = float(global_step) / float(decay_steps) + if staircase: + exponent = math.floor(exponent) + return learning_rate * math.exp(-1 * decay_rate * exponent) + + +def inverse_time_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + temp = float(global_step) / float(decay_steps) + if staircase: + temp = math.floor(temp) + return learning_rate / (1 + decay_rate * temp) + + +class TestLearningRateDecay(unittest.TestCase): + def check_decay(self, python_decay_fn, fluid_decay_fn, staircase): + init_lr = 1.0 + decay_steps = 5 + decay_rate = 0.5 + + global_step = layers.create_global_var( + shape=[1], value=0.0, dtype='float32', persistable=True) + + decayed_lr = fluid_decay_fn( + learning_rate=init_lr, + global_step=global_step, + decay_steps=decay_steps, + decay_rate=decay_rate, + staircase=staircase) + layers.increment(global_step, 1.0) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + for step in range(10): + step_val, lr_val = exe.run(fluid.default_main_program(), + feed=[], + fetch_list=[global_step, decayed_lr]) + python_decayed_lr = python_decay_fn( + learning_rate=init_lr, + global_step=step, + decay_steps=decay_steps, + decay_rate=decay_rate, + staircase=staircase) + self.assertAlmostEqual(python_decayed_lr, lr_val[0]) + + def test_decay(self): + decay_fns = [ + (exponential_decay, 
lr_decay.exponential_decay, True), + (exponential_decay, lr_decay.exponential_decay, False), + (natural_exp_decay, lr_decay.natural_exp_decay, True), + (natural_exp_decay, lr_decay.natural_exp_decay, False), + (inverse_time_decay, lr_decay.inverse_time_decay, True), + (inverse_time_decay, lr_decay.inverse_time_decay, False), + ] + + for py_decay_fn, fluid_decay_fn, staircase in decay_fns: + print("decay_fn=" + str(py_decay_fn) + " staircase=" + str( + staircase)) + main_program = framework.Program() + startup_program = framework.Program() + with framework.program_guard(main_program, startup_program): + self.check_decay(py_decay_fn, fluid_decay_fn, staircase) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py new file mode 100755 index 0000000000000000000000000000000000000000..c27573c3d69037bc48e0b6a90636b3f027f15a41 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestMineHardExamplesOp(OpTest): + def set_data(self): + self.init_test_data() + self.inputs = { + 'ClsLoss': self.cls_loss, + 'LocLoss': self.loc_loss, + 'MatchIndices': self.match_indices, + 'MatchDist': self.match_dis + } + + self.attrs = { + 'neg_pos_ratio': self.neg_pos_ratio, + 'neg_overlap': self.neg_overlap, + 'sample_size': self.sample_size, + 'mining_type': self.mining_type + } + + self.outputs = { + 'NegIndices': (self.neg_indices, self.neg_indices_lod), + 'UpdatedMatchIndices': self.updated_match_indices + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + return + + def setUp(self): + self.op_type = "mine_hard_examples" + self.set_data() + + def init_test_data(self): + self.neg_pos_ratio = 1.0 + self.neg_overlap = 0.5 + self.sample_size = 0 + self.mining_type = "max_negative" + self.cls_loss = np.array([[0.1, 0.1, 0.3], + [0.3, 0.1, 0.1]]).astype('float32') + + self.loc_loss = np.array([[0.1, 0.2, 0.3], + [0.3, 0.4, 0.1]]).astype('float32') + + self.match_dis = np.array([[0.2, 0.4, 0.8], + [0.1, 0.9, 0.3]]).astype('float32') + + self.match_indices = np.array([[0, -1, -1], + [-1, 0, -1]]).astype('int32') + + self.updated_match_indices = self.match_indices + + self.neg_indices_lod = [[0, 1, 2]] + self.neg_indices = np.array([[1], [0]]).astype('int32') + + +class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp): + def init_test_data(self): + super(TestMineHardExamplesOpHardExample, self).init_test_data() + self.mining_type = "hard_example" + self.sample_size = 2 + + self.cls_loss = np.array([[0.5, 0.1, 0.3], + [0.3, 0.1, 0.1]]).astype('float32') + + self.loc_loss = np.array([[0.2, 0.2, 0.3], + [0.3, 0.1, 0.2]]).astype('float32') + + self.match_indices = np.array([[0, -1, -1], + [-1, 0, -1]]).astype('int32') + + self.updated_match_indices = np.array([[0, -1, -1], + [-1, -1, -1]]).astype('int32') + + 
self.neg_indices_lod = [[0, 1, 3]] + self.neg_indices = np.array([[2], [0], [2]]).astype('int32') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3b80d2359b083d30f9a5a7b8cc18aaf1ca5146c1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py @@ -0,0 +1,226 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+import unittest +import numpy as np +import copy +from op_test import OpTest + + +def iou(box_a, box_b): + """Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + + area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a) + area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + + box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) + box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + score_threshold: (float) The confidence thresh for filtering low + confidence boxes. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + top_k: (int) The maximum number of box preds to consider. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. 
+ """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + selected_indices = np.argwhere(all_scores > score_threshold) + selected_indices = selected_indices.flatten() + all_scores = all_scores[selected_indices] + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + if top_k > -1 and top_k < sorted_indices.shape[0]: + sorted_indices = sorted_indices[:top_k] + sorted_scores = sorted_scores[:top_k] + + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou(boxes[idx], boxes[kept_idx]) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + +def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k): + class_num = scores.shape[0] + priorbox_num = scores.shape[1] + + selected_indices = {} + num_det = 0 + for c in range(class_num): + if c == background: continue + indices = nms(boxes, scores[c], score_threshold, nms_threshold, + nms_top_k) + selected_indices[c] = indices + num_det += len(indices) + + if keep_top_k > -1 and num_det > keep_top_k: + score_index = [] + for c, indices in selected_indices.iteritems(): + for idx in indices: + score_index.append((scores[c][idx], c, idx)) + + sorted_score_index = sorted( + score_index, key=lambda tup: tup[0], reverse=True) + sorted_score_index = sorted_score_index[:keep_top_k] + selected_indices = {} + + for _, c, _ in sorted_score_index: + selected_indices[c] = [] + for s, c, idx in sorted_score_index: + selected_indices[c].append(idx) + num_det = keep_top_k + + return selected_indices, num_det + + +def batched_multiclass_nms(boxes, scores, 
background, score_threshold, + nms_threshold, nms_top_k, keep_top_k): + batch_size = scores.shape[0] + + det_outs = [] + lod = [0] + for n in range(batch_size): + nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + lod.append(lod[-1] + nmsed_num) + if nmsed_num == 0: continue + + for c, indices in nmsed_outs.iteritems(): + for idx in indices: + xmin, ymin, xmax, ymax = boxes[idx][:] + det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + + return det_outs, lod + + +class TestMulticlassNMSOp(OpTest): + def set_argument(self): + self.score_threshold = 0.01 + + def setUp(self): + self.set_argument() + N = 7 + M = 1200 + C = 21 + BOX_SIZE = 4 + + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + + scores = np.random.random((N * M, C)).astype('float32') + + def softmax(x): + shiftx = x - np.max(x).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + scores = np.apply_along_axis(softmax, 1, scores) + scores = np.reshape(scores, (N, M, C)) + scores = np.transpose(scores, (0, 2, 1)) + + boxes = np.random.random((M, BOX_SIZE)).astype('float32') + boxes[:, 0:2] = boxes[:, 0:2] * 0.5 + boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5 + + nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + nmsed_outs = [-1] if not nmsed_outs else nmsed_outs + nmsed_outs = np.array(nmsed_outs).astype('float32') + + self.op_type = 'multiclass_nms' + self.inputs = {'BBoxes': boxes, 'Scores': scores} + self.outputs = {'Out': (nmsed_outs, [lod])} + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + } + + def test_check_output(self): + self.check_output() + + +class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): + def 
class TestIOU(unittest.TestCase):
    """Checks the `iou` helper on one hand-computed pair of boxes."""

    def test_iou(self):
        first = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
        second = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')

        # Assuming (xmin, ymin, xmax, ymax) corners: the boxes share a 2 x 1
        # region and together cover 6 + 12 - 2 = 16, hence 2 / 16.
        expected = np.array([2.0 / 16.0]).astype('float32')
        actual = np.array([iou(first, second)]).astype('float32')

        matches = np.allclose(actual, expected)
        self.assertTrue(matches)
os.system("kill -9 %d" % p.pid) diff --git a/python/paddle/v2/fluid/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py index d5cc235f588ad37b0d1293dc9894952c97411757..0219bef42b3ba133dda7412c1036cf989a170a36 100644 --- a/python/paddle/v2/fluid/tests/test_tensor.py +++ b/python/paddle/v2/fluid/tests/test_tensor.py @@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase): scope = core.Scope() place = core.CPUPlace() lod_py = [[0, 2, 5], [0, 2, 4, 5]] - lod_tensor = core.LoDTensor(lod_py) + lod_tensor = core.LoDTensor() lod_tensor.set_dims([5, 2, 3, 4]) + lod_tensor.set_lod(lod_py) + lod_tensor.alloc_float(place) + tensor_array = numpy.array(lod_tensor) + tensor_array[0, 0, 0, 0] = 1.0 + tensor_array[0, 0, 0, 1] = 2.0 + lod_tensor.set(tensor_array, place) + + lod_v = numpy.array(lod_tensor) + self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) + self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) + self.assertListEqual(lod_py, lod_tensor.lod()) + + def test_lod_tensor_gpu_init(self): + if not core.is_compiled_with_cuda(): + return + scope = core.Scope() + place = core.CUDAPlace(0) + lod_py = [[0, 2, 5], [0, 2, 4, 5]] + lod_tensor = core.LoDTensor() + + lod_tensor.set_dims([5, 2, 3, 4]) + lod_tensor.set_lod(lod_py) lod_tensor.alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0