diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c7eb260aea8478f4833cb79253f4481e10b8685..49334279f6dc88c0d35fec43daf80e3cbe65760c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
-include(external/boost) # download, build, install boost
+include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..98356cd7613baff7f0cd66d1462068232b2b8500
--- /dev/null
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -0,0 +1,18 @@
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+RUN apt-get update && apt-get install -y python
+RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default, CI-built wheel packages are compiled with WITH_DISTRIBUTE=OFF,
+# so we must build a wheel with distributed support to install in this image.
+RUN pip install paddlepaddle
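+# Pre-download the CIFAR-10 dataset into the image so trainers do not fetch it at runtime;
+# the stock package is removed again right after.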
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+
+# The lines below may change frequently during debugging.
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..11d00b8f85382aa720c169338c51333b730d44d5
--- /dev/null
+++ b/benchmark/cluster/vgg16/README.md
@@ -0,0 +1,76 @@
+# Performance for Distributed vgg16
+
+## Test Result
+
+### Hardware Information
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- CPU MHz: 2101.000
+- Cache size: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Acceleration Rate
+
+- PServer Count: 20
+- Batch Size: 128
+- Metrics: samples / sec (the percentage in parentheses is the scaling efficiency relative to perfectly linear scaling of the single-trainer throughput)
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+
+### Different Pserver Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples / sec
+
+| PServer Count | 3 | 6 | 10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid (to be fixed in the next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fluid and v2 comes from network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. You must re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
+1. When the build finishes, copy the output `.whl` package located under `build/python/dist` to the current directory.
+1. Run `docker build -t [image:tag] .` to build the Docker image, then run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.
+
+Check the logs for the distributed training progress and analyze the performance.
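+
+A condensed run-through of the commands above (the image tag `myrepo/vgg16_dist:latest` and the pod ID are placeholders):
+
+```bash
+cp build/python/dist/*.whl .
+docker build -t myrepo/vgg16_dist:latest .
+docker push myrepo/vgg16_dist:latest
+kubectl create -f pserver.yaml && kubectl create -f trainer.yaml
+kubectl get po
+kubectl logs vgg16job-trainer-xxxxx
+```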
+
+## Enable Verbose Logs
+
+Edit `pserver.yaml` and `trainer.yaml`, adding the environment variables `GLOG_v=3` and `GLOG_logtostderr=1`, to see what happened in detail.
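+
+For example, a minimal sketch of adding these variables to a container's `env` list in the manifests above:
+
+```yaml
+        env:
+        - name: GLOG_v
+          value: "3"
+        - name: GLOG_logtostderr
+          value: "1"
+```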
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ee8b0763b62fc011f40f6197e929a68b48a93e47
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16job-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ paddle-job-pserver: vgg16job
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16job
+ - name: MKL_NUM_THREADS
+ value: "1"
+ - name: TRAINING_ROLE
+ value: "PSERVER"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ command: ["paddle_k8s", "start_fluid"]
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16job-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ paddle-job: vgg16job
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ command: ["paddle_k8s", "start_fluid"]
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16job
+ - name: TRAINING_ROLE
+ value: "TRAINER"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1271e0cf399184134c06b3200ee1202c65cef0
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16v2job-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ paddle-job-pserver: vgg16v2job
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16v2job
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "python train.py"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ command: ["paddle_k8s", "start_pserver"]
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..12c8964066cbcfe8d2a44de2f51a3d12ea422fe2
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16v2job-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ paddle-job: vgg16v2job
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ command: ["paddle_k8s", "start_trainer", "v2"]
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16v2job
+ - name: BATCH_SIZE
+ value: "256"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "2"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
new file mode 100644
index 0000000000000000000000000000000000000000..499e06ec42fc8f840137173628fa465e0541ba30
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
+import argparse
+import functools
+import os
+
+
+def str2bool(v):
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
+ return True
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+ return False
+ else:
+ raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+ help='The data order, now only support NCHW.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+parser.add_argument(
+ '--local',
+ type=str2bool,
+ default=True,
+ help='Whether to run in local mode.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ return fc2
+
+
+def main():
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ test_target = accuracy.metrics + accuracy.states
+ inference_program = fluid.io.get_inference_program(test_target)
+
+ # Optimization
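+ # minimize() returns the optimize ops and parameter gradients, which the distribute transpiler needs in non-local mode.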
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+ # Initialize executor
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+ args.device_id)
+ exe = fluid.Executor(place)
+
+ # test
+ def test(exe):
+ accuracy.reset(exe)
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data})
+
+ return accuracy.eval(exe)
+
+ def train_loop(exe, trainer_prog):
+ iters = 0
+ ts = time.time()
+ for pass_id in range(args.num_passes):
+ # train
+ start_time = time.time()
+ num_samples = 0
+ accuracy.reset(exe)
+ with profiler.profiler("CPU", 'total') as prof:
+ for batch_id, data in enumerate(train_reader()):
+ ts = time.time()
+ img_data = np.array(
+ map(lambda x: x[0].reshape(data_shape), data)).astype(
+ "float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ loss, acc = exe.run(
+ trainer_prog,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost] + accuracy.metrics)
+ iters += 1
+ num_samples += len(data)
+ print(
+ "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+ % (pass_id, iters, loss, acc, time.time() - ts)
+ ) # The accuracy is accumulated over batches, not just the current batch.
+
+ pass_elapsed = time.time() - start_time
+ pass_train_acc = accuracy.eval(exe)
+ pass_test_acc = test(exe)
+ print(
+ "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+ % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+ pass_test_acc))
+
+ if args.local:
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+ else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+ train_loop(exe, fluid.default_main_program())
+ else:
+ pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
+ eplist = []
+ for ip in pserver_ips.split(","):
+ eplist.append(':'.join([ip, "6174"]))
+ pserver_endpoints = ",".join(eplist)
+ print("pserver endpoints: ", pserver_endpoints)
+ trainers = int(os.getenv("TRAINERS")) # total trainer count
+ print("trainers total: ", trainers)
+ current_endpoint = os.getenv(
+ "POD_IP") + ":6174" # current pserver endpoint
+ training_role = os.getenv(
+ "TRAINING_ROLE",
+ "TRAINER") # get the training role: trainer/pserver
+ t = fluid.DistributeTranspiler()
+ t.transpile(
+ optimize_ops,
+ params_grads,
+ pservers=pserver_endpoints,
+ trainers=trainers)
+
+ if training_role == "PSERVER":
+ if not current_endpoint:
+ print("need env SERVER_ENDPOINT")
+ exit(1)
+ pserver_prog = t.get_pserver_program(current_endpoint)
+ pserver_startup = t.get_startup_program(current_endpoint,
+ pserver_prog)
+ print("starting server side startup")
+ exe.run(pserver_startup)
+ print("starting parameter server...")
+ exe.run(pserver_prog)
+ elif training_role == "TRAINER":
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+ else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+ paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ trainer_prog = t.get_trainer_program()
+ feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+ # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+ exe.run(fluid.default_startup_program())
+ train_loop(exe, trainer_prog)
+ else:
+ print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+ print_arguments()
+ main()
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ac6b3c33252e0a1f596f539fc090c5ada118e15
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+ BATCH_SIZE = int(BATCH_SIZE)
+else:
+ BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS"))
+ts = 0
+
+
+def vgg(input, nums, class_dim):
+ def conv_block(input, num_filter, groups, num_channels=None):
+ return paddle.networks.img_conv_group(
+ input=input,
+ num_channels=num_channels,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act=paddle.activation.Relu(),
+ pool_type=paddle.pooling.Max())
+
+ assert len(nums) == 5
+ # the channel of input feature is 3
+ conv1 = conv_block(input, 64, nums[0], 3)
+ conv2 = conv_block(conv1, 128, nums[1])
+ conv3 = conv_block(conv2, 256, nums[2])
+ conv4 = conv_block(conv3, 512, nums[3])
+ conv5 = conv_block(conv4, 512, nums[4])
+
+ fc_dim = 512
+ fc1 = paddle.layer.fc(input=conv5,
+ size=fc_dim,
+ act=paddle.activation.Relu(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5))
+ fc2 = paddle.layer.fc(input=fc1,
+ size=fc_dim,
+ act=paddle.activation.Relu(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5))
+ out = paddle.layer.fc(input=fc2,
+ size=class_dim,
+ act=paddle.activation.Softmax())
+ return out
+
+
+def vgg13(input, class_dim):
+ nums = [2, 2, 2, 2, 2]
+ return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+ nums = [2, 2, 3, 3, 3]
+ return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+ nums = [2, 2, 4, 4, 4]
+ return vgg(input, nums, class_dim)
+
+
+def main():
+ global ts
+ paddle.init(use_gpu=False)
+ image = paddle.layer.data(
+ name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+ lbl = paddle.layer.data(
+ name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+ extra_layers = None
+ # NOTE: v2 distributed training needs the updates averaged, so scale the learning rate down by the node count.
+ learning_rate = 1e-3 / NODE_COUNT
+ out = vgg16(image, class_dim=CLASS_DIM)
+ cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+ # Create parameters
+ parameters = paddle.parameters.create(cost)
+
+ # Create optimizer
+ optimizer = paddle.optimizer.Momentum(
+ momentum=0.9,
+ regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+ BATCH_SIZE),
+ learning_rate=learning_rate / BATCH_SIZE,
+ learning_rate_decay_a=0.1,
+ learning_rate_decay_b=128000 * 35,
+ learning_rate_schedule="discexp", )
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ cifar.train10(),
+ # To use other data, replace the above line with:
+ # reader.train_reader('train.list'),
+ buf_size=1000),
+ batch_size=BATCH_SIZE)
+ test_reader = paddle.batch(
+ cifar.test10(),
+ # To use other data, replace the above line with:
+ # reader.test_reader('val.list'),
+ batch_size=BATCH_SIZE)
+
+ # Create trainer
+ trainer = paddle.trainer.SGD(cost=cost,
+ parameters=parameters,
+ update_equation=optimizer,
+ extra_layers=extra_layers,
+ is_local=False)
+
+ # End batch and end pass event handler
+ def event_handler(event):
+ global ts, ts_pass
+ if isinstance(event, paddle.event.BeginPass):
+ ts_pass = time.time()
+ if isinstance(event, paddle.event.BeginIteration):
+ ts = time.time()
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 1 == 0: # log every batch; raise the modulus to log less often
+ print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ time.time() - ts)
+ if isinstance(event, paddle.event.EndPass):
+ print "Pass %d end, spent: %f" % (event.pass_id,
+ time.time() - ts_pass)
+ result = trainer.test(reader=test_reader)
+ print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+ trainer.train(
+ reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index c70d83b3f4bb24740ed67b4e2f98a3ced26d1648..dbc676bdac30e0d730206c17a1912d49d4f896eb 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 585db019d521b1699baadfae31ef95b5059c71b4..33ef6860e1d38f4e87c4431addf43f9f8a655fc2 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
+ # No need to link libwarpctc.so directly; it is loaded dynamically at runtime, so a build dependency suffices.
+ if ("${cc_library_DEPS};" MATCHES "warpctc;")
+ list(REMOVE_ITEM cc_library_DEPS warpctc)
+ add_dependencies(${TARGET_NAME} warpctc)
+ endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS DEPS)
+ set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+ target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+ list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+ endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
- add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ add_test(NAME ${TARGET_NAME}
+ COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
@@ -457,12 +468,12 @@ endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
- set(options STATIC static SHARED shared)
+ set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS DEPS ARGS)
+ set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+ COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index ddf0b055a92d80295b24255a5462d477e0d9c796..29388f5005bf779a1bfa63c0d46d35996c0c792d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool
:noindex:
+pad
+----
+.. autoclass:: paddle.v2.layer.pad
+ :noindex:
+
Norm Layer
==========
@@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory
:noindex:
+gated_unit
+-----------
+.. autoclass:: paddle.v2.layer.gated_unit
+ :noindex:
+
Recurrent Layer Group
=====================
@@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
+dropout
+--------
+.. autoclass:: paddle.v2.layer.dropout
+ :noindex:
+
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
+factorization_machine
+---------------------
+.. autoclass:: paddle.v2.layer.factorization_machine
+ :noindex:
+
Sampling Layers
===============
@@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
-Factorization Machine Layer
-============================
-
-factorization_machine
----------------------
-.. autoclass:: paddle.v2.layer.factorization_machine
- :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
-----
-.. autoclass:: paddle.v2.layer.pad
- :noindex:
-
.. _api_v2.layer_costs:
Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
+detection_output
+----------------
+.. autoclass:: paddle.v2.layer.detection_output
+ :noindex:
+
Check Layer
============
@@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos
:noindex:
-Miscs
-=====
-
-dropout
---------
-.. autoclass:: paddle.v2.layer.dropout
- :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========
prelu
--------
.. autoclass:: paddle.v2.layer.prelu
:noindex:
-
-gated_unit
------------
-.. autoclass:: paddle.v2.layer.gated_unit
- :noindex:
-
-Detection output Layer
-======================
-
-detection_output
-----------------
-.. autoclass:: paddle.v2.layer.detection_output
- :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
index 6a8ecc5bb1d855e0ded3719943ab3adb810de365..02e41564b1e48c07da6ac071fc4b60089169e05a 100644
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14
:members:
:noindex:
+
+wmt16
++++++
+
+.. automodule:: paddle.v2.dataset.wmt16
+ :members:
+ :noindex:
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
index 0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa..a591c7334fd31c98a94b50a4344f251560a0f2f9 100644
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -1,9 +1,14 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
===========
-DataFeeder
+data_feeder
===========
DataFeeder
------------
-.. automodule:: paddle.v2.fluid.data_feeder
- :members: DataFeeder
+----------
+
+.. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+ :members:
:noindex:
+
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
index a23f3301d0331e0ea3733f06444515eb4680cd31..00dcecfd628a35d83d1c596bf0aea819a1705862 100644
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -1,9 +1,21 @@
-===========
-Evaluator
-===========
-
-Evaluator
------------
-.. automodule:: paddle.v2.fluid.evaluator
- :members: Evaluator
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=========
+evaluator
+=========
+
+Accuracy
+--------
+
+.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
+ :members:
:noindex:
+
+ChunkEvaluator
+--------------
+
+.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+ :members:
+ :noindex:
+
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
index 3a283538c120cfa1ef646c390bb71c6251c23675..a028f6283f2ca333bdf6c9857a98661c0222b41e 100644
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
@@ -1,9 +1,32 @@
-===========
-Executor
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+executor
+========
Executor
+--------
+
+.. autoclass:: paddle.v2.fluid.executor.Executor
+ :members:
+ :noindex:
+
+global_scope
+------------
+
+.. autofunction:: paddle.v2.fluid.executor.global_scope
+ :noindex:
+
+scope_guard
-----------
-.. automodule:: paddle.v2.fluid.executor
- :members: Executor
+
+.. autofunction:: paddle.v2.fluid.executor.scope_guard
+ :noindex:
+
+switch_scope
+------------
+
+.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex:
+
diff --git a/doc/api/v2/fluid/gen_doc.py b/doc/api/v2/fluid/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2147fd3f7ea635d8f14210fbcd1a568ee2230ee
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.v2.fluid as fluid
+
+
+def parse_arg():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--submodules', nargs="*")
+ parser.add_argument(
+ 'module', type=str, help='Generate the documentation of which module')
+ return parser.parse_args()
+
+
+class DocGenerator(object):
+ def __init__(self, module_name, stream=sys.stdout):
+ self.stream = stream
+ self.module_name = module_name
+ if not hasattr(fluid, module_name):
+ raise ValueError("Cannot find fluid.{0}".format(module_name))
+ else:
+ self.module = getattr(fluid, module_name)
+ self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+ self._print_header_(module_name, dot='=', is_title=True)
+
+ def print_submodule(self, submodule_name):
+ submodule = getattr(self.module, submodule_name)
+ if submodule is None:
+ raise ValueError("Cannot find submodule {0}".format(submodule_name))
+ self.print_section(submodule_name)
+
+ for item in submodule.__all__:
+ self.print_item(item)
+
+ def print_current_module(self):
+ for item in self.module.__all__:
+ self.print_item(item)
+
+ def print_section(self, name):
+ self._print_header_(name, dot='=', is_title=False)
+
+ def print_item(self, name):
+ item = getattr(self.module, name)
+ if isinstance(item, types.TypeType):
+ self.print_class(name)
+ elif isinstance(item, types.FunctionType):
+ self.print_method(name)
+ else:
+ raise RuntimeError("Unsupported item {0}".format(name))
+
+ def print_class(self, name):
+ self._print_header_(name, dot='-', is_title=False)
+ self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
+ :members:
+ :noindex:
+
+'''.format(self.module_name, name))
+
+ def print_method(self, name):
+ self._print_header_(name, dot='-', is_title=False)
+ self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
+ :noindex:
+
+'''.format(self.module_name, name))
+
+ def _print_header_(self, name, dot, is_title):
+ dot_line = dot * len(name)
+ if is_title:
+ self.stream.write(dot_line)
+ self.stream.write('\n')
+ self.stream.write(name)
+ self.stream.write('\n')
+ self.stream.write(dot_line)
+ self.stream.write('\n')
+ self.stream.write('\n')
+
+
+def main():
+ args = parse_arg()
+ gen = DocGenerator(args.module)
+ if args.submodules is None:
+ gen.print_current_module()
+ else:
+ for submodule_name in args.submodules:
+ gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/doc/api/v2/fluid/gen_doc.sh b/doc/api/v2/fluid/gen_doc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ba7b7ba8e51399deb852b0a7c8ddd3128f521e85
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+
+for module in data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+do
+ python gen_doc.py ${module} > ${module}.rst
+done
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
index 8f587837e9873370722062404f511654a9460587..c38be033fff2997930525f51c93995db09daa2b6 100644
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
@@ -1,50 +1,35 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
===========
-Initializer
+initializer
===========
+Constant
+--------
-
-Initializer
------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: Initializer
- :noindex:
-
-
-
-ConstantInitializer
--------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: ConstantInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Constant
+ :members:
:noindex:
+Uniform
+-------
-
-UniformInitializer
-------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: UniformInitializer
- :noindex:
-
-
-
-NormalInitializer
------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: NormalInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Uniform
+ :members:
:noindex:
+Normal
+------
-XavierInitializer
------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: XavierInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Normal
+ :members:
:noindex:
+Xavier
+------
-MSRAInitializer
----------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: MSRAInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Xavier
+ :members:
:noindex:
diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst
index 67f68c4e9e16b379207b8de114cdf769e056f78e..37c9c273e369532e8ff596e9649cb695a98a2505 100644
--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
@@ -1,10 +1,61 @@
-===========
-IO
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+==
+io
+==
+save_vars
+---------
-is_parameter
+.. autofunction:: paddle.v2.fluid.io.save_vars
+ :noindex:
+
+save_params
-----------
-.. autofunction:: paddle.v2.fluid.io.is_parameter
+
+.. autofunction:: paddle.v2.fluid.io.save_params
+ :noindex:
+
+save_persistables
+-----------------
+
+.. autofunction:: paddle.v2.fluid.io.save_persistables
+ :noindex:
+
+load_vars
+---------
+
+.. autofunction:: paddle.v2.fluid.io.load_vars
+ :noindex:
+
+load_params
+-----------
+
+.. autofunction:: paddle.v2.fluid.io.load_params
:noindex:
+
+load_persistables
+-----------------
+
+.. autofunction:: paddle.v2.fluid.io.load_persistables
+ :noindex:
+
+save_inference_model
+--------------------
+
+.. autofunction:: paddle.v2.fluid.io.save_inference_model
+ :noindex:
+
+load_inference_model
+--------------------
+
+.. autofunction:: paddle.v2.fluid.io.load_inference_model
+ :noindex:
+
+get_inference_program
+---------------------
+
+.. autofunction:: paddle.v2.fluid.io.get_inference_program
+ :noindex:
+
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 231ec2d4ba102a5d31c47cbc7a5d484ef17a7f3a..e24613b94b422b7cdf9c6383c359fa92a4faf6ff 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -1,546 +1,799 @@
-==========
-Layers
-==========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+======
+layers
+======
-fc
----
-.. autofunction:: paddle.v2.fluid.layers.fc
+control_flow
+============
+
+split_lod_tensor
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex:
-embedding
----------
-.. autofunction:: paddle.v2.fluid.layers.embedding
+merge_lod_tensor
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex:
-dynamic_lstm
-------------
-.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+BlockGuard
+----------
+
+.. autoclass:: paddle.v2.fluid.layers.BlockGuard
+ :members:
:noindex:
-dynamic_lstmp
--------------
-.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+BlockGuardWithCompletion
+------------------------
+
+.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+ :members:
:noindex:
-dynamic_gru
------------
-.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
+StaticRNNMemoryLink
+-------------------
+
+.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+ :members:
:noindex:
-data
-----
-.. autofunction:: paddle.v2.fluid.layers.data
+WhileGuard
+----------
+
+.. autoclass:: paddle.v2.fluid.layers.WhileGuard
+ :members:
:noindex:
-mean
-----
-.. autofunction:: paddle.v2.fluid.layers.mean
+While
+-----
+
+.. autoclass:: paddle.v2.fluid.layers.While
+ :members:
:noindex:
-mul
----
-.. autofunction:: paddle.v2.fluid.layers.mul
+lod_rank_table
+--------------
+
+.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex:
-elementwise_add
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_add
+max_sequence_len
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex:
-elementwise_sub
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
+topk
+----
+
+.. autofunction:: paddle.v2.fluid.layers.topk
:noindex:
-elementwise_mul
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
+lod_tensor_to_array
+-------------------
+
+.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex:
-elementwise_div
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_div
+array_to_lod_tensor
+-------------------
+
+.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex:
+increment
+---------
-dropout
--------
-.. autofunction:: paddle.v2.fluid.layers.dropout
+.. autofunction:: paddle.v2.fluid.layers.increment
:noindex:
+array_write
+-----------
-reshape
---------
-.. autofunction:: paddle.v2.fluid.layers.reshape
+.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex:
+create_array
+------------
-sigmoid
+.. autofunction:: paddle.v2.fluid.layers.create_array
+ :noindex:
+
+less_than
---------
-.. autofunction:: paddle.v2.fluid.layers.sigmoid
+
+.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex:
+array_read
+----------
-scale
----------
-.. autofunction:: paddle.v2.fluid.layers.scale
+.. autofunction:: paddle.v2.fluid.layers.array_read
+ :noindex:
+
+shrink_memory
+-------------
+
+.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex:
+array_length
+------------
-transpose
+.. autofunction:: paddle.v2.fluid.layers.array_length
+ :noindex:
+
+IfElse
+------
+
+.. autoclass:: paddle.v2.fluid.layers.IfElse
+ :members:
+ :noindex:
+
+DynamicRNN
+----------
+
+.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
+ :members:
+ :noindex:
+
+ConditionalBlock
+----------------
+
+.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+ :members:
+ :noindex:
+
+StaticRNN
---------
-.. autofunction:: paddle.v2.fluid.layers.transpose
+
+.. autoclass:: paddle.v2.fluid.layers.StaticRNN
+ :members:
:noindex:
+reorder_lod_tensor_by_rank
+--------------------------
-sigmoid_cross_entropy_with_logits
----------------------------------
-.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+.. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
:noindex:
+ParallelDo
+----------
-cast
+.. autoclass:: paddle.v2.fluid.layers.ParallelDo
+ :members:
+ :noindex:
+
+Print
+-----
+
+.. autofunction:: paddle.v2.fluid.layers.Print
+ :noindex:
+
+device
+======
+
+get_places
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.get_places
+ :noindex:
+
+io
+==
+
+data
----
-.. autofunction:: paddle.v2.fluid.layers.cast
+
+.. autofunction:: paddle.v2.fluid.layers.data
:noindex:
+BlockGuardServ
+--------------
-concat
--------
-.. autofunction:: paddle.v2.fluid.layers.concat
+.. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
+ :members:
:noindex:
+ListenAndServ
+-------------
-sums
+.. autoclass:: paddle.v2.fluid.layers.ListenAndServ
+ :members:
+ :noindex:
+
+Send
----
-.. autofunction:: paddle.v2.fluid.layers.sums
+
+.. autofunction:: paddle.v2.fluid.layers.Send
:noindex:
+nn
+==
-linear_chain_crf
-----------------
-.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+fc
+--
+
+.. autofunction:: paddle.v2.fluid.layers.fc
:noindex:
+embedding
+---------
-assign
--------
.. autofunction:: paddle.v2.fluid.layers.embedding
:noindex:
+dynamic_lstm
+------------
-split_lod_tensor
-----------------
-.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
+dynamic_lstmp
+-------------
-merge_lod_tensor
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+ :noindex:
+
+dynamic_gru
+-----------
+
+.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
+ :noindex:
+
+gru_unit
+--------
+
+.. autofunction:: paddle.v2.fluid.layers.gru_unit
+ :noindex:
+
+linear_chain_crf
----------------
-.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+
+.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+ :noindex:
+
+crf_decoding
+------------
+
+.. autofunction:: paddle.v2.fluid.layers.crf_decoding
:noindex:
cos_sim
---------
+-------
+
.. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex:
-
cross_entropy
-------------
+
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex:
-
-
square_error_cost
-----------------
+
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex:
-
accuracy
----------
+--------
+
.. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex:
+chunk_eval
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.chunk_eval
+ :noindex:
sequence_conv
-------------
+
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex:
-
conv2d
------
+
.. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex:
-
sequence_pool
-------------
+
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex:
+pool2d
+------
-sequence_first_step
--------------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
+.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex:
+batch_norm
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.batch_norm
+ :noindex:
-sequence_last_step
+beam_search_decode
------------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
+
+.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
+conv2d_transpose
+----------------
-pool2d
-------
-.. autofunction:: paddle.v2.fluid.layers.pool2d
+.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex:
+sequence_expand
+---------------
-batch_norm
+.. autofunction:: paddle.v2.fluid.layers.sequence_expand
+ :noindex:
+
+lstm_unit
+---------
+
+.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+ :noindex:
+
+reduce_sum
----------
-.. autofunction:: paddle.v2.fluid.layers.batch_norm
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_sum
+ :noindex:
+
+reduce_mean
+-----------
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex:
+reduce_max
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_max
+ :noindex:
-beam_search_decode
+reduce_min
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_min
+ :noindex:
+
+sequence_first_step
+-------------------
+
+.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
+ :noindex:
+
+sequence_last_step
------------------
-.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
+
+.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
+ :noindex:
+
+dropout
+-------
+
+.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex:
+split
+-----
-lod_rank_table
---------------
-.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
+.. autofunction:: paddle.v2.fluid.layers.split
:noindex:
+ctc_greedy_decoder
+------------------
-max_sequence_len
-----------------
-.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
+.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
:noindex:
+edit_distance
+-------------
-topk
------
-.. autofunction:: paddle.v2.fluid.layers.topk
+.. autofunction:: paddle.v2.fluid.layers.edit_distance
:noindex:
+l2_normalize
+------------
-lod_tensor_to_array
--------------------
-.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+.. autofunction:: paddle.v2.fluid.layers.l2_normalize
:noindex:
+matmul
+------
-
-array_to_lod_tensor
--------------------
-.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex:
+warpctc
+-------
+.. autofunction:: paddle.v2.fluid.layers.warpctc
+ :noindex:
+sequence_reshape
+----------------
-fill_constant
--------------
-.. autofunction:: paddle.v2.fluid.layers.fill_constant
+.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
:noindex:
+transpose
+---------
+.. autofunction:: paddle.v2.fluid.layers.transpose
+ :noindex:
-fill_constant_batch_size_like
------------------------------
-.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+im2sequence
+-----------
+
+.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex:
+nce
+---
-ones
-----
-.. autofunction:: paddle.v2.fluid.layers.ones
+.. autofunction:: paddle.v2.fluid.layers.nce
:noindex:
+beam_search
+-----------
-zeros
------
-.. autofunction:: paddle.v2.fluid.layers.zeros
+.. autofunction:: paddle.v2.fluid.layers.beam_search
:noindex:
+row_conv
+--------
-increment
----------
-.. autofunction:: paddle.v2.fluid.layers.increment
+.. autofunction:: paddle.v2.fluid.layers.row_conv
:noindex:
+multiplex
+---------
-array_write
------------
-.. autofunction:: paddle.v2.fluid.layers.array_write
+.. autofunction:: paddle.v2.fluid.layers.multiplex
:noindex:
+ops
+===
+mean
+----
-create_array
-------------
-.. autofunction:: paddle.v2.fluid.layers.create_array
+.. autofunction:: paddle.v2.fluid.layers.mean
:noindex:
+mul
+---
-less_than
----------
-.. autofunction:: paddle.v2.fluid.layers.less_than
+.. autofunction:: paddle.v2.fluid.layers.mul
:noindex:
+reshape
+-------
-array_read
-----------
-.. autofunction:: paddle.v2.fluid.layers.array_read
+.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
+scale
+-----
-shrink_memory
---------------
-.. autofunction:: paddle.v2.fluid.layers.shrink_memory
+.. autofunction:: paddle.v2.fluid.layers.scale
:noindex:
+sigmoid_cross_entropy_with_logits
+---------------------------------
-array_length
--------------
-.. autofunction:: paddle.v2.fluid.layers.array_length
+.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex:
+elementwise_add
+---------------
-conv2d_transpose
-----------------
-.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+.. autofunction:: paddle.v2.fluid.layers.elementwise_add
:noindex:
-
-sequence_expand
+elementwise_div
---------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_expand
+
+.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex:
+elementwise_sub
+---------------
-gru_unit
---------
-.. autofunction:: paddle.v2.fluid.layers.gru_unit
+.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
:noindex:
+elementwise_mul
+---------------
-lstm_unit
----------
-.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
:noindex:
+elementwise_max
+---------------
-sequence_softmax
-----------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
+.. autofunction:: paddle.v2.fluid.layers.elementwise_max
:noindex:
+elementwise_min
+---------------
-reduce_sum
-----------
-.. autofunction:: paddle.v2.fluid.layers.reduce_sum
+.. autofunction:: paddle.v2.fluid.layers.elementwise_min
:noindex:
+elementwise_pow
+---------------
-reduce_mean
------------
-.. autofunction:: paddle.v2.fluid.layers.reduce_mean
+.. autofunction:: paddle.v2.fluid.layers.elementwise_pow
:noindex:
+clip
+----
-reduce_max
-----------
-.. autofunction:: paddle.v2.fluid.layers.reduce_max
+.. autofunction:: paddle.v2.fluid.layers.clip
:noindex:
+clip_by_norm
+------------
-reduce_min
-----------
-.. autofunction:: paddle.v2.fluid.layers.reduce_min
+.. autofunction:: paddle.v2.fluid.layers.clip_by_norm
:noindex:
+sequence_softmax
+----------------
-split
------
-.. autofunction:: paddle.v2.fluid.layers.split
+.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex:
+sigmoid
+-------
-matmul
-------
-.. autofunction:: paddle.v2.fluid.layers.matmul
+.. autofunction:: paddle.v2.fluid.layers.sigmoid
:noindex:
logsigmoid
----------
+
.. autofunction:: paddle.v2.fluid.layers.logsigmoid
:noindex:
exp
---
+
.. autofunction:: paddle.v2.fluid.layers.exp
:noindex:
relu
----
+
.. autofunction:: paddle.v2.fluid.layers.relu
:noindex:
tanh
----
+
.. autofunction:: paddle.v2.fluid.layers.tanh
:noindex:
tanh_shrink
-----------
+
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink
:noindex:
softshrink
----------
+
.. autofunction:: paddle.v2.fluid.layers.softshrink
:noindex:
sqrt
----
+
.. autofunction:: paddle.v2.fluid.layers.sqrt
:noindex:
abs
-----
+---
+
.. autofunction:: paddle.v2.fluid.layers.abs
:noindex:
ceil
----
+
.. autofunction:: paddle.v2.fluid.layers.ceil
:noindex:
floor
-----
+
.. autofunction:: paddle.v2.fluid.layers.floor
:noindex:
round
-----
+
.. autofunction:: paddle.v2.fluid.layers.round
:noindex:
reciprocal
----------
+
.. autofunction:: paddle.v2.fluid.layers.reciprocal
:noindex:
log
---
+
.. autofunction:: paddle.v2.fluid.layers.log
:noindex:
square
------
+
.. autofunction:: paddle.v2.fluid.layers.square
:noindex:
softplus
--------
+
.. autofunction:: paddle.v2.fluid.layers.softplus
:noindex:
softsign
----------
+--------
+
.. autofunction:: paddle.v2.fluid.layers.softsign
:noindex:
brelu
-----
+
.. autofunction:: paddle.v2.fluid.layers.brelu
:noindex:
leaky_relu
----------
+
.. autofunction:: paddle.v2.fluid.layers.leaky_relu
:noindex:
soft_relu
---------
+
.. autofunction:: paddle.v2.fluid.layers.soft_relu
:noindex:
elu
-----
+---
+
.. autofunction:: paddle.v2.fluid.layers.elu
:noindex:
relu6
-----
+
.. autofunction:: paddle.v2.fluid.layers.relu6
:noindex:
pow
-----
+---
+
.. autofunction:: paddle.v2.fluid.layers.pow
:noindex:
+stanh
+-----
+
+.. autofunction:: paddle.v2.fluid.layers.stanh
+ :noindex:
+
hard_shrink
-----------
+
.. autofunction:: paddle.v2.fluid.layers.hard_shrink
:noindex:
thresholded_relu
----------------
+
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu
:noindex:
hard_sigmoid
--------------
+------------
+
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
:noindex:
swish
-------
+-----
+
.. autofunction:: paddle.v2.fluid.layers.swish
:noindex:
-im2sequence
+tensor
+======
+
+create_tensor
+-------------
+
+.. autofunction:: paddle.v2.fluid.layers.create_tensor
+ :noindex:
+
+create_parameter
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.create_parameter
+ :noindex:
+
+create_global_var
+-----------------
+
+.. autofunction:: paddle.v2.fluid.layers.create_global_var
+ :noindex:
+
+cast
+----
+
+.. autofunction:: paddle.v2.fluid.layers.cast
+ :noindex:
+
+concat
------
-.. autofunction:: paddle.v2.fluid.layers.im2sequence
+
+.. autofunction:: paddle.v2.fluid.layers.concat
:noindex:
-edit_distance
----------------
-.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
+sums
+----
+
+.. autofunction:: paddle.v2.fluid.layers.sums
:noindex:
-ctc_greedy_decoder
----------------
-.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+assign
+------
+
+.. autofunction:: paddle.v2.fluid.layers.assign
:noindex:
-l2_normalize
-------------
-.. autofunction:: paddle.v2.fluid.layers.l2_normalize
+fill_constant_batch_size_like
+-----------------------------
+
+.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex:
-sequence_reshape
-----------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
+fill_constant
+-------------
+
+.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
-row_conv
---------
-.. autofunction:: paddle.v2.fluid.layers.row_conv
+ones
+----
+
+.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
-multiplex
----------
-.. autofunction:: paddle.v2.fluid.layers.multiplex
+zeros
+-----
+
+.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index 500019bc507f859c4c91de5d322a82eb1e78e2de..015581b7660848bdb0845fafe2d3fc05405e6ae6 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -1,33 +1,31 @@
-===========
-Nets
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+nets
+====
simple_img_conv_pool
--------------------
-.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
- :noindex:
-
-img_conv_group
----------------
-.. autofunction:: paddle.v2.fluid.nets.img_conv_group
+.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
-
sequence_conv_pool
------------------
+
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex:
-
glu
---
+
.. autofunction:: paddle.v2.fluid.nets.glu
:noindex:
-
scaled_dot_product_attention
----------------------------
+
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex:
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
index 19b4940f08de3e2f7dc177f2961e538946d10a78..1691ebb9a7cb16da96e04147d0adea322374f529 100644
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -1,54 +1,49 @@
-===========
-Optimizer
-===========
-
-Optimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: Optimizer
- :noindex:
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+=========
+optimizer
+=========
-SGDOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: SGDOptimizer
- :noindex:
+SGD
+---
+.. autoclass:: paddle.v2.fluid.optimizer.SGD
+ :members:
+ :noindex:
+Momentum
+--------
-MomentumOptimizer
------------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: MomentumOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Momentum
+ :members:
:noindex:
+Adagrad
+-------
-
-AdagradOptimizer
-----------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdagradOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
+ :members:
:noindex:
+Adam
+----
-AdamOptimizer
--------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdamOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Adam
+ :members:
:noindex:
+Adamax
+------
-AdamaxOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdamaxOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Adamax
+ :members:
:noindex:
+DecayedAdagrad
+--------------
-DecayedAdagradOptimizer
------------------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: DecayedAdagradOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
+ :members:
:noindex:
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
index ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50..8083d0d858dafcd275eaddb9b475875ee42ef724 100644
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -1,11 +1,21 @@
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+param_attr
+==========
+
ParamAttr
-===========
+---------
+.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+ :members:
+ :noindex:
+WeightNormParamAttr
+-------------------
-ParamAttr
------------
-.. automodule:: paddle.v2.fluid.param_attr
- :members: ParamAttr
+.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
+ :members:
:noindex:
+
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
index 7d4042d1f41c12c4a551ba6576559d612116872a..4a1ff7cb6976e0054f77428b699ea679aa91394f 100644
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
@@ -1,10 +1,25 @@
-===========
-Profiler
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+========
+profiler
+========
+cuda_profiler
+-------------
-Profiler
------------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex:
+
+reset_profiler
+--------------
+
+.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
+ :noindex:
+
+profiler
+--------
+
+.. autofunction:: paddle.v2.fluid.profiler.profiler
+ :noindex:
+
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
index 868e225ed3d59e79aeb217fb88081ea25f80fa2c..2c17d15599baa1d02eb87c7b6c40034769ebb3a4 100644
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -1,25 +1,27 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
===========
-Regularizer
+regularizer
===========
-WeightDecayRegularizer
-----------------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: WeightDecayRegularizer
- :noindex:
-
+append_regularization_ops
+-------------------------
-L2DecayRegularizer
-------------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: L2DecayRegularizer
+.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
:noindex:
+L1Decay
+-------
+.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
+ :members:
+ :noindex:
-L1DecayRegularizer
--------------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: L1DecayRegularizer
+L2Decay
+-------
+.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
+ :members:
+ :noindex:
diff --git a/doc/design/speech/README.MD b/doc/design/speech/deep_speech_2.md
similarity index 85%
rename from doc/design/speech/README.MD
rename to doc/design/speech/deep_speech_2.md
index 7304650e628dba210488cd2dc4836318b5383b2a..cfdc4d6df04344c70d3334626bd38eca997c31ff 100644
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/deep_speech_2.md
@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM
-TODO by Assignees
+
+
+Figure 2. Algorithm for CTC Beam Search Decoder.
+
+
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+ - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
+ - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
+- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
+- Such external scorer consists of language model, word count or any other custom scorers.
+- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+
## Future Work
@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/design/speech/image/beam_search.png b/doc/design/speech/image/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/design/speech/image/beam_search.png differ
diff --git a/doc/design/switch.md b/doc/design/switch.md
new file mode 100644
index 0000000000000000000000000000000000000000..9db1b2782a521c2ff4b28b8f9efcdf1492242ed4
--- /dev/null
+++ b/doc/design/switch.md
@@ -0,0 +1,32 @@
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+switch = fluid.switch()
+with switch.block():
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 71904dc41ed0d946867d890cc585e1b88450ca8c..ff904b1022a41612c9680dce92d3fc2c69ad7e93 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
- "WITH_TESTING", "是否开启单元测试", "ON"
+ "WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 27f73b2e2c029b41d514e1612912ed1c335605b6..718fb869c23a1f7be82c87c726282bded9dad516 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON"
- "WITH_TESTING", "Build unit tests", "ON"
+ "WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 98fada7bdb46f4dd2927d6f93bcbcebbe7d18604..79d214635a069a739060e0b79424729f6ff90387 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book
+国内用户可以使用下面的镜像源来加速访问:
+
+ .. code-block: bash
+
+ docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
然后在浏览器中输入以下网址:
.. code-block:: text
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index b1d0890b4cdddb77114a80276130afd07c22d270..e0e0559fb858a093db96a9b4ec1c5a45d6c71a38 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book
+For users in China, we provide a faster mirror:
+
+ .. code-block: bash
+
+ docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
Then, you would back and paste the address into the local browser:
.. code-block:: text
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index c2fc86687d7106aac7c74d6dd16bc229353cb7c1..0f3db59607fb6b43da01f5fdb46949087517ed6c 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -92,11 +92,11 @@ paddle.init(
参数说明
- use_gpu: **可选,默认False**,是否启用GPU训练
-- trainer_count:**必选,默认1**,当前训练任务trainer总个数
+- trainer_count:**必选,默认1**,当前trainer的线程数目
- port:**必选,默认7164**,连接到pserver的端口
- ports_num:**必选,默认1**,连接到pserver的端口个数
- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数
-- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
+- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97..f9424f8f1a29fcf001c4e7976086512b22f6e858 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -95,11 +95,11 @@ paddle.init(
Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training.
-- trainer_count: **required, default 1**, total count of trainers in the training job.
+- trainer_count: **required, default 1**, number of threads in current trainer.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
-- num_gradient_servers: **required, default 1**, total number of gradient server.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index ada51c2d73263898b2c748437f8eb0f30b537073..9279bac7f4b2898c18979630a8d6dfcb2dba70e0 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,4 +8,3 @@ PaddlePaddle 文档
howto/index_cn.rst
api/index_cn.rst
faq/index_cn.rst
- mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 23b64b6cadf776d44c4d0aa5a550ffe24be13b18..64684b8b9b27e245c6b32ea28809d3bbce22fab9 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst
howto/index_en.rst
api/index_en.rst
- mobile/index_en.rst
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 1d99666e58b7043b85b0203ee0dfcd1957710161..0000000000000000000000000000000000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-.. toctree::
- :maxdepth: 1
-
- cross_compiling_for_android_cn.md
- cross_compiling_for_ios_cn.md
- cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index ef421dacad458828cadf8cf505375d6c4bfd9dde..0000000000000000000000000000000000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-.. toctree::
- :maxdepth: 1
-
- cross_compiling_for_android_en.md
- cross_compiling_for_ios_en.md
- cross_compiling_for_raspberry_en.md
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index d394fa5d10d502d8fadbb48b6b85e4884f20b70d..a2a0be08d9425cdd8cce374aecd097085491d4c0 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc)
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
index 70ecccc1a1078374f3190b3956103ed8000c4fc5..b679387b1124e42499df158767b6c7afe1afd0c6 100644
--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@@ -23,12 +23,10 @@ namespace framework {
template
class Channel {
public:
- virtual void Send(T*) = 0;
- virtual void Receive(T*) = 0;
+ virtual bool Send(T*) = 0;
+ virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
-
- // Don't delete channels; instead, call Channel::Close.
- protected:
+ virtual void Close() = 0;
virtual ~Channel() {}
};
@@ -50,11 +48,7 @@ Channel* MakeChannel(size_t buffer_size) {
template
void CloseChannel(Channel* ch) {
- if (ch->Cap() > 0) {
- delete dynamic_cast*>(ch);
- } else {
- delete dynamic_cast*>(ch);
- }
+ ch->Close();
}
} // namespace framework
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 9efc0172658c800d14102531332dbef68fa392f4..444d68498c9676fe0e246167dfacbe999a41d1a7 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -14,13 +14,329 @@ limitations under the License. */
#include "paddle/framework/channel.h"
+#include
+#include
+
#include "gtest/gtest.h"
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
+
TEST(Channel, MakeAndClose) {
- using paddle::framework::Channel;
- using paddle::framework::MakeChannel;
- using paddle::framework::CloseChannel;
+ using paddle::framework::details::Buffered;
+ using paddle::framework::details::UnBuffered;
+ {
+ // MakeChannel should return a buffered channel is buffer_size > 0.
+ auto ch = MakeChannel(10);
+ EXPECT_NE(dynamic_cast *>(ch), nullptr);
+ EXPECT_EQ(dynamic_cast *>(ch), nullptr);
+ CloseChannel(ch);
+ delete ch;
+ }
+ {
+ // MakeChannel should return an un-buffered channel is buffer_size = 0.
+ auto ch = MakeChannel(0);
+ EXPECT_EQ(dynamic_cast *>(ch), nullptr);
+ EXPECT_NE(dynamic_cast *>(ch), nullptr);
+ CloseChannel(ch);
+ delete ch;
+ }
+}
+
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+ const size_t buffer_size = 10;
+ auto ch = MakeChannel(buffer_size);
+ for (size_t i = 0; i < buffer_size; ++i) {
+ EXPECT_EQ(ch->Send(&i), true); // should not block
+ }
+
+ size_t out;
+ for (size_t i = 0; i < buffer_size; ++i) {
+ EXPECT_EQ(ch->Receive(&out), true); // should not block
+ EXPECT_EQ(out, i);
+ }
+ CloseChannel(ch);
+ delete ch;
+}
+
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+ const size_t buffer_size = 10;
+ auto ch = MakeChannel(buffer_size);
+ size_t sum = 0;
+ std::thread t([&]() {
+ // Try to write more than buffer size.
+ for (size_t i = 0; i < 2 * buffer_size; ++i) {
+ if (i < buffer_size)
+ EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
+ else
+ EXPECT_EQ(ch->Send(&i), false);
+ sum += i;
+ }
+ });
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
+ EXPECT_EQ(sum, 45U);
+
+ CloseChannel(ch);
+ t.join();
+ delete ch;
+}
+
+TEST(Channel, SimpleUnbufferedChannelTest) {
+ auto ch = MakeChannel(0);
+ unsigned sum_send = 0;
+ std::thread t([&]() {
+ for (int i = 0; i < 5; i++) {
+ EXPECT_EQ(ch->Send(&i), true);
+ sum_send += i;
+ }
+ });
+ for (int i = 0; i < 5; i++) {
+ int recv;
+ EXPECT_EQ(ch->Receive(&recv), true);
+ EXPECT_EQ(recv, i);
+ }
+
+ CloseChannel(ch);
+ t.join();
+ EXPECT_EQ(sum_send, 10U);
+ delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+// any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+ auto ch = MakeChannel(1);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked because of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data;
+ // All reads should return false
+ EXPECT_EQ(ch->Receive(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ // Verify that all threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the channel
+ // This should unblock all receivers
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+// any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+ auto ch = MakeChannel(1);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+ bool send_success[num_threads];
+
+ // Launches threads that try to write and are blocked because of no readers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ send_success[i] = false;
+ t[i] = std::thread(
+ [&](bool *ended, bool *success) {
+ int data = 10;
+ *success = ch->Send(&data);
+ *ended = true;
+ },
+ &thread_ended[i], &send_success[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ // Verify that atleast 4 threads are blocked
+ int ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (thread_ended[i] == false) ct++;
+ }
+ // Atleast 4 threads must be blocked
+ EXPECT_GE(ct, 4);
+
+ // Explicitly close the thread
+ // This should unblock all senders
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ // Verify that only 1 send was successful
+ ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (send_success[i]) ct++;
+ }
+ // Only 1 send must be successful
+ EXPECT_EQ(ct, 1);
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+// unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+ auto ch = MakeChannel(0);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked becausew of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data;
+ EXPECT_EQ(ch->Receive(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all the threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the thread
+ // This should unblock all receivers
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+// unblocks any senders waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+ auto ch = MakeChannel(0);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked becausew of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data = 10;
+ EXPECT_EQ(ch->Send(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all the threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the thread
+ // This should unblock all receivers
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+ auto ch = MakeChannel(0);
+ unsigned sum_send = 0;
+ // Send should block after three iterations
+ // since we only have three receivers.
+ std::thread t([&]() {
+ // Try to send more number of times
+ // than receivers
+ for (int i = 0; i < 4; i++) {
+ ch->Send(&i);
+ sum_send += i;
+ }
+ });
+ for (int i = 0; i < 3; i++) {
+ int recv;
+ ch->Receive(&recv);
+ EXPECT_EQ(recv, i);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
+ EXPECT_EQ(sum_send, 3U);
+
+ CloseChannel(ch);
+ t.join();
+ delete ch;
+}
+
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+ auto ch = MakeChannel(0);
+ unsigned sum_send = 0;
+ unsigned sum_receive = 0;
+ // The receiver should block after 5
+ // iterations, since there are only 5 senders.
+ std::thread t([&]() {
+ for (int i = 0; i < 8; i++) {
+ int recv;
+ ch->Receive(&recv); // should block after the fifth iteration.
+ EXPECT_EQ(recv, i);
+ sum_receive += i;
+ }
+ });
+ for (int i = 0; i < 5; i++) {
+ ch->Send(&i);
+ sum_send += i;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+ EXPECT_EQ(sum_send, 10U);
+ EXPECT_EQ(sum_receive, 10U);
+ // send three more elements
+ for (int i = 5; i < 8; i++) {
+ ch->Send(&i);
+ sum_send += i;
+ }
- Channel* ch = MakeChannel(10);
CloseChannel(ch);
+ t.join();
+ EXPECT_EQ(sum_send, 28U);
+ EXPECT_EQ(sum_receive, 28U);
+ delete ch;
}
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index 572e29d44a3baec84a029d87f9b0874784aa761b..7ac234b8d42bae0661c3256c78311455c0fbc77c 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -18,6 +18,7 @@ limitations under the License. */
#include
#include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel {
friend void paddle::framework::CloseChannel(Channel*);
public:
- virtual void Send(T*);
- virtual void Receive(T*);
+ virtual bool Send(T*);
+ virtual bool Receive(T*);
virtual size_t Cap() { return cap_; }
+ virtual void Close();
+ virtual ~Buffered();
private:
size_t cap_;
@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel {
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::deque channel_;
+ bool closed_;
- Buffered(size_t cap) : cap_(cap) {}
- virtual ~Buffered();
+ Buffered(size_t cap) : cap_(cap), closed_(false) {
+ PADDLE_ENFORCE_GT(cap, 0);
+ }
- void NotifyAllSenders(std::unique_lock*);
+ void NotifyAllParticipants(std::unique_lock*);
};
template
-void Buffered::Send(T* item) {
+bool Buffered::Send(T* item) {
+ std::unique_lock lock(mu_);
+ full_cond_var_.wait(lock,
+ [this]() { return channel_.size() < cap_ || closed_; });
+ bool ret = false;
+ if (!closed_) {
+ channel_.push_back(std::move(*item));
+ lock.unlock();
+ empty_cond_var_.notify_one();
+ ret = true;
+ }
+ return ret;
+}
+
+template
+bool Buffered::Receive(T* item) {
std::unique_lock lock(mu_);
- full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; });
- channel_.push_back(std::move(*item));
- lock.unlock();
- empty_cond_var_.notify_one();
+ empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+ bool ret = false;
+ if (!closed_) {
+ *item = std::move(channel_.front());
+ channel_.pop_front();
+ full_cond_var_.notify_one();
+ ret = true;
+ }
+ return ret;
}
template
-void Buffered::Receive(T* item) {
+void Buffered::Close() {
std::unique_lock lock(mu_);
- empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
- *item = std::move(channel_.front());
- channel_.pop_front();
- NotifyAllSenders(&lock);
+ closed_ = true;
+ NotifyAllParticipants(&lock);
}
template
Buffered::~Buffered() {
std::unique_lock lock(mu_);
+ closed_ = true;
channel_.clear();
- NotifyAllSenders(&lock);
+ NotifyAllParticipants(&lock);
}
template
-void Buffered::NotifyAllSenders(std::unique_lock* lock) {
+void Buffered::NotifyAllParticipants(std::unique_lock* lock) {
lock->unlock();
- full_cond_var_.notify_one();
+ full_cond_var_.notify_all();
+ empty_cond_var_.notify_all();
}
} // namespace details
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index 7ecced1fba88fea781fc342091bc71e5aa496d3a..f86a894bb4a42e45edf6964e30620b68183faaa8 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+#include
#include
-#include
#include
#include "paddle/framework/channel.h"
@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel {
friend void paddle::framework::CloseChannel(Channel*);
public:
- virtual void Send(T*);
- virtual void Receive(T*);
+ virtual bool Send(T*);
+ virtual bool Receive(T*);
virtual size_t Cap() { return 0; }
+ virtual void Close();
+ virtual ~UnBuffered();
private:
- UnBuffered() {}
- virtual ~UnBuffered();
+ std::mutex mu_ch_;
+ // Mutex for readers and writers who are waiting for other reader
+ // and writer to complete execution
+ std::recursive_mutex mu_read_, mu_write_;
+ // reader_found_ is set true when a reader is ready to accept data
+ // writer_found_ is set true when a writer is ready to send data
+ // A transaction occurs only when both are true
+ std::atomic reader_found_{false}, writer_found_{false};
+ std::condition_variable cv_channel_;
+ std::condition_variable_any cv_reader_, cv_writer_;
+ T* item{nullptr};
+ std::atomic closed_{false};
+
+ UnBuffered() : closed_(false) {}
+
+ void NotifyAllParticipants(std::unique_lock*);
};
+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+template
+bool UnBuffered::Send(T* data) {
+ // Prevent other writers from entering
+ std::unique_lock writer_lock(mu_write_);
+ writer_found_ = true;
+ std::unique_lock cv_lock(mu_write_);
+ // If writer comes first, it should wait till a reader arrives
+ cv_writer_.wait(cv_lock,
+ [this]() { return reader_found_ == true || closed_; });
+ cv_reader_.notify_one();
+ bool ret = false;
+ if (!closed_) {
+ std::unique_lock channel_lock(mu_ch_);
+ item = data;
+ channel_lock.unlock();
+ cv_channel_.notify_one();
+ channel_lock.lock();
+ cv_channel_.wait(channel_lock,
+ [this]() { return item == nullptr || closed_; });
+ ret = true;
+ }
+ writer_found_ = false;
+ return ret;
+}
+
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
+template
+bool UnBuffered::Receive(T* data) {
+ // Prevent other readers from entering
+ std::unique_lock read_lock{mu_read_};
+ reader_found_ = true;
+ std::unique_lock cv_lock{mu_read_};
+ // If reader comes first, it should wait till a writer arrives
+ cv_reader_.wait(cv_lock,
+ [this]() { return writer_found_ == true || closed_; });
+ cv_writer_.notify_one();
+ bool ret = false;
+ if (!closed_) {
+ std::unique_lock lock_ch{mu_ch_};
+ // Reader should wait for the writer to first write its data
+ cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+ if (!closed_) {
+ *data = std::move(*item);
+ item = nullptr;
+ lock_ch.unlock();
+ ret = true;
+ }
+ cv_channel_.notify_one();
+ }
+ reader_found_ = false;
+ return ret;
+}
+
+// This function implements the sequence of events
+// that take place once the channel is closed.
template
-void UnBuffered::Send(T* channel_element) {}
+void UnBuffered::Close() {
+ std::unique_lock lock(mu_ch_);
+ item = nullptr;
+ closed_ = true;
+ NotifyAllParticipants(&lock);
+}
+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
template
-void UnBuffered::Receive(T*) {}
+UnBuffered::~UnBuffered() {
+ std::unique_lock lock(mu_ch_);
+ item = nullptr;
+ closed_ = true;
+ NotifyAllParticipants(&lock);
+}
+// This function notifies all the readers, writers and
+// the channel condition variables.
template
-UnBuffered::~UnBuffered() {}
+void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) {
+ lock->unlock();
+ cv_writer_.notify_all();
+ cv_channel_.notify_all();
+ cv_reader_.notify_all();
+}
} // namespace details
} // namespace framework
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index cbf3ec75265fa74aaacffee684b7b7d5f73b7c02..9a232b08434d299d10bb2acdb6e96295de875d56 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
-DECLARE_bool(do_memory_benchmark);
+DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
-
Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
- if (FLAGS_do_memory_benchmark) {
+ if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
- if (FLAGS_do_memory_benchmark) {
+ if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
index 9bc4a90c44828ecb7458d524f59609f01848cc5c..168f456675af508df86dd0520cdeb5d16d94ad31 100644
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+#include
#include
#include "paddle/framework/lod_tensor.h"
@@ -20,5 +21,8 @@ namespace paddle {
namespace framework {
using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector;
+
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index 4ef82a541efaa35bcf831d5122570154f2fa2423..3f6ea121b3994979d89a7d5a8c20c59240a0c111 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include // for strdup
#include
+#include
#include
#include "paddle/framework/init.h"
@@ -46,17 +47,23 @@ void InitDevices() {
std::vector places;
places.emplace_back(platform::CPUPlace());
+ int count = 0;
#ifdef PADDLE_WITH_CUDA
- int count = platform::GetCUDADeviceCount();
- for (int i = 0; i < count; ++i) {
- places.emplace_back(platform::CUDAPlace(i));
+ try {
+ count = platform::GetCUDADeviceCount();
+ } catch (const std::exception &exp) {
+ LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#else
LOG(WARNING)
- << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+ << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif
+ for (int i = 0; i < count; ++i) {
+ places.emplace_back(platform::CUDAPlace(i));
+ }
+
platform::DeviceContextPool::Init(places);
}
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
index f837a965d3be7d40c20803ae4462b3bfd91bffd0..01e076dd8ea24831e3ed7c8a7f8fae6818a89335 100644
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
+#ifndef PADDLE_WITH_CUDA
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
- ASSERT_GE(pool.size(), 1U);
+ ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+
+TEST(InitDevices, CUDA) {
+ using paddle::framework::InitDevices;
+ using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_CUDA
+ int count = paddle::platform::GetCUDADeviceCount();
+ InitDevices();
+ DeviceContextPool& pool = DeviceContextPool::Instance();
+ ASSERT_EQ(pool.size(), 1U + static_cast(count));
+#endif
}
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 53b0d0fe083579da4f0bb600f292765aa2aa0d8a..cb27de6991674247e6215ce64a2da5000fa78ed4 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
#include
#include
-#include
-
namespace paddle {
namespace framework {
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 9d1294fdeb9bd76bf944f7ec3687e3c5bb333241..d0ab640485baf6d76ee629ea420b603f42b031b4 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include
#include
-#include
#endif
#include
#include "paddle/framework/ddim.h"
+#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h"
@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
-#ifndef PADDLE_WITH_CUDA
-template
-using Vector = std::vector;
-#else
-template
-using Vector = thrust::host_vector<
- T, thrust::system::cuda::experimental::pinned_allocator>;
-#endif
-
/*
* LoD is short for Level of Details.
*
@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7
* 0 2 5 7 10 12 15 20
*/
-using LoD = std::vector>;
+struct LoD : public std::vector> {
+ using std::vector>::vector;
+
+ void CopyFromCUDA() {
+ for (auto it = this->begin(); it != this->end(); ++it) {
+ it->CopyFromCUDA();
+ }
+ }
+};
std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/
class LoDTensor : public Tensor {
public:
- LoDTensor() {}
+ LoDTensor() : Tensor() {}
+
+ /* Constructor with place should only be used in pybind */
+ explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {}
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 4d172c43c7cceacb7d0dfaf1c4d3028717350268..3b63020e685436396071fa05cd7697630ae56c95 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -23,6 +23,17 @@
namespace paddle {
namespace framework {
+TEST(LoD, data) {
+ LoD lod{{0, 1, 2}};
+ lod.push_back({0, 2, 4, 5});
+ lod.push_back(std::vector({0, 1, 6, 8, 10, 11}));
+
+ auto& v = lod[0];
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], i);
+ }
+}
+
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 1e253a2f6f35e827fb2e5db6270da03705b39514..d4c9f00bd9c00f3cae68858ca46c5320fc117405 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -14,6 +14,8 @@
#include
#include
+#include
+#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h"
@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
}
}
+TEST(Vector, Normal) {
+ using namespace paddle::framework;
+ using namespace paddle::platform;
+ using namespace paddle::memory;
+
+ paddle::framework::InitDevices();
+
+ paddle::framework::Vector vec({1, 2, 3});
+ size_t* ptr = vec.data();
+ for (size_t i = 0; i < vec.size(); ++i) {
+ EXPECT_EQ(vec[i], *(ptr + i));
+ }
+
+ vec.clear();
+ vec.CopyFromCUDA();
+
+ std::vector v = {1, 2, 3};
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], vec[i]);
+ }
+}
+
+TEST(LoD, data) {
+ paddle::framework::InitDevices();
+
+ paddle::framework::LoD lod{{0, 1, 2}};
+ lod.push_back({0, 2, 4, 5});
+ lod.push_back(std::vector({0, 1, 6, 8, 10, 11}));
+
+ auto& v = lod[0];
+ test<<<1, 1>>>(v.cuda_data(), v.size());
+ cudaDeviceSynchronize();
+
+ v.CopyFromCUDA();
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], i * 2);
+ }
+}
+
TEST(LoDTensor, LoDInGPU) {
+ paddle::framework::InitDevices();
+
paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0);
@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod();
- test<<<1, 8>>>(lod[0].data(), lod[0].size());
+ test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize();
+ lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..85caac8dcd9ede4fe997e2fd246d1421aa73c80a
--- /dev/null
+++ b/paddle/framework/mixed_vector.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include
+#include
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief Vector support both cpu and gpu.
+ * host vector lifetime is same with Vector
+ * device vector is lazily malloc and modified.
+ */
+
+template
+class Vector : public std::vector {
+ public:
+ using std::vector::vector;
+
+ Vector() {}
+ Vector(const std::vector &v) : std::vector(v) {} // NOLINT
+
+ virtual ~Vector() {
+#ifdef PADDLE_WITH_CUDA
+ if (cuda_ptr_ != nullptr) {
+ memory::Free(place_, cuda_ptr_);
+ }
+#endif
+ }
+
+ /* Get device vector */
+ T *cuda_data() {
+ CopyToCUDA();
+ PADDLE_ENFORCE_NOT_NULL(
+ cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
+ return static_cast(cuda_ptr_);
+ }
+
+ /* Get host vector */
+ T *data() { return std::vector::data(); }
+ const T *data() const { return std::vector::data(); }
+
+ /* Synchronize host vector to device vector */
+ void CopyToCUDA();
+ /* Synchronize device vector to host vector */
+ void CopyFromCUDA();
+ /* Switch device vector location */
+ void CopyToPeer(platform::Place);
+
+ private:
+ void *cuda_ptr_ = nullptr;
+ size_t cuda_size_ = 0; // device vector numel
+ platform::CUDAPlace place_;
+};
+
+template
+void Vector::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+ if (cuda_size_ < this->size()) {
+ if (cuda_ptr_ != nullptr) {
+ memory::Free(place_, cuda_ptr_);
+ }
+ cuda_ptr_ =
+ memory::Alloc(place_, this->size() * sizeof(T));
+ }
+ cuda_size_ = this->size();
+ platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+ auto *ctx = pool.GetByPlace(place_);
+ memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+ static_cast(this->data()),
+ this->size() * sizeof(T), ctx->stream());
+ ctx->Wait();
+#endif
+}
+
+template
+void Vector::CopyFromCUDA() {
+#ifdef PADDLE_WITH_CUDA
+ if (cuda_ptr_ == nullptr) {
+ LOG(WARNING) << "No uncommitted cuda data.";
+ return;
+ }
+ this->resize(cuda_size_);
+ platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+ auto *ctx = pool.GetByPlace(place_);
+ memory::Copy(platform::CPUPlace(), static_cast(this->data()), place_,
+ static_cast(cuda_ptr_), this->size() * sizeof(T),
+ ctx->stream());
+ ctx->Wait();
+#endif
+}
+
+template
+void Vector::CopyToPeer(platform::Place peer_place) {
+#ifdef PADDLE_WITH_CUDA
+ auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+ void *peer_cuda_ptr = memory::Alloc(
+ boost::get(peer_place), this->size() * sizeof(T));
+ memory::Copy(boost::get(peer_place), peer_cuda_ptr,
+ place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+ ctx->Wait();
+
+ memory::Free(place_, cuda_ptr_);
+ place_ = boost::get(peer_place);
+ cuda_ptr_ = peer_cuda_ptr;
+#endif
+}
+
+template class Vector;
+template class Vector;
+template class Vector;
+template class Vector;
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index f8df2cf97ad532f06cb1393b1a24cd789f8bde29..f554c77845087453f8c6e4d04522a8555e583ae6 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasOutputs(const std::string &name) const override;
- DDim GetInputDim(const std::string &name) const override;
-
- void SetOutputDim(const std::string &name, const DDim &dim) override;
-
AttrReader Attrs() const override;
const std::vector &Inputs(
@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
return true;
}
-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
- std::vector ddims = GetInputsDim(name);
- auto length = ddims.size();
- PADDLE_ENFORCE_EQ(length, 1UL,
- "Input(%s) should have 1 value, "
- "but it has %d now",
- name, length);
- return ddims[0];
-}
-
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
- const DDim &dim) {
- SetOutputsDim(name, {dim});
-}
-
AttrReader CompileTimeInferShapeContext::Attrs() const {
return AttrReader(op_.GetAttrMap());
}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 831b1e2a1e10777d9e89364adcd4b1f367e86080..81fa8cf477423fc2a54c719c9a743729215513c3 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
-DEFINE_bool(op_sync, false,
- "Default cuda is asynchronous device, set to True will"
- "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
namespace paddle {
namespace framework {
@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
return true;
}
- DDim GetInputDim(const std::string& name) const override {
- return GetDim(op_.Input(name));
- }
-
- void SetOutputDim(const std::string& name, const DDim& dim) override {
- SetDim(op_.Output(name), dim);
- }
-
AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector& Inputs(
@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/
- if (FLAGS_op_sync) {
+ if (FLAGS_benchmark) {
new_dev_ctx->Wait();
}
}
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index b2368e3a27abe6382b7460222e3fccce6f1beb08..15ea4035c6e6193105b621210a900e74d1466941 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -14,13 +14,11 @@ limitations under the License. */
#include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
-
BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
auto *b = desc_.add_blocks();
b->set_parent_idx(parent.ID());
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index b9741b31393a474e06fd156a2f3354844d53187c..8e958eab6ee08436ca73b13bac010e66c7df2b8b 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
#include
#include
+#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h"
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index a67ff910093d93060d07d849f6e968e5f4ce21cd..af08b2ab816f63c05d4c65df9601c787e57994f5 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h"
-DEFINE_bool(do_memory_benchmark, false,
+DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
- "and add some memory usage logs");
+ "and add some memory usage logs."
+ "Default cuda is asynchronous device, set to True will"
+ "force op run in synchronous mode.");
namespace paddle {
namespace framework {
@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync.
- if (FLAGS_do_memory_benchmark) {
+ if (FLAGS_benchmark) {
delete scope;
} else {
Async([scope] { delete scope; });
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index e53cc0cdabc623ae358f1a3e21823a2f38ec3c62..a0fa467291bb42c59b65f5efeabe9c2235e15b2a 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,10 +18,18 @@ limitations under the License. */
namespace paddle {
namespace framework {
-std::vector InferShapeContext::GetInputsDim(
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
+ const std::vector &arg_names = Inputs(name);
+ PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+ "Input(%s) should hold one element, but now it holds %d",
+ name, arg_names.size());
+ return this->GetDim(arg_names[0]);
+}
+
+std::vector InferShapeContext::GetInputsDim(
const std::string &name) const {
- const std::vector &names = Inputs(name);
- return GetDims(names);
+ const std::vector &arg_names = Inputs(name);
+ return GetDims(arg_names);
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
return this->GetDim(names[idx]);
}
-void InferShapeContext::SetOutputsDim(
- const std::string &name, const std::vector &dims) {
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
+ auto &arg_names = Outputs(name);
+ PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+ "Output(%s) should hold one element, but now it holds %d",
+ name, arg_names.size());
+ SetDim(arg_names[0], dim);
+}
+
+void InferShapeContext::SetOutputsDim(const std::string &name,
+ const std::vector &dims) {
auto &names = Outputs(name);
SetDims(names, dims);
}
-std::vector InferShapeContext::GetDims(
+std::vector InferShapeContext::GetDims(
const std::vector &names) const {
- std::vector ret;
+ std::vector ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
-
void InferShapeContext::SetDims(const std::vector &names,
- const std::vector &dims) {
+ const std::vector &dims) {
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index f93319d8f2fd4c5d388bd57fd595a6a5edd51775..830f199ed1451538f12fc8dd34fb7b2bfc356a71 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,14 +35,13 @@ class InferShapeContext {
virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0;
- virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+ DDim GetInputDim(const std::string &name) const;
- std::vector GetInputsDim(const std::string &name) const;
+ std::vector GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
- virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
- void SetOutputsDim(const std::string &name,
- const std::vector &dims);
+ void SetOutputDim(const std::string &name, const DDim &dim);
+ void SetOutputsDim(const std::string &name, const std::vector &dims);
virtual AttrReader Attrs() const = 0;
virtual const std::vector &Inputs(
@@ -57,15 +56,13 @@ class InferShapeContext {
// Note: In while op, we need this to be public
void SetDims(const std::vector &names,
- const std::vector &dims);
+ const std::vector &dims);
protected:
- virtual framework::DDim GetDim(const std::string &name) const = 0;
- virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
-
- std::vector GetDims(
- const std::vector &names) const;
+ virtual DDim GetDim(const std::string &name) const = 0;
+ virtual void SetDim(const std::string &name, const DDim &dim) = 0;
+ std::vector GetDims(const std::vector &names) const;
std::vector GetVarTypes(
const std::vector &names) const;
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4aaa29d794c95592832a1fe990e2dce274eba9d5..f0ea709a5c37e769e3ffa1b2e9d1e39721979251 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -47,6 +47,11 @@ class Tensor {
public:
Tensor() : offset_(0) {}
+ /*! Constructor with place should only be used in pybind. */
+ explicit Tensor(const platform::Place& place) : offset_(0) {
+ holder_->set_place(place);
+ }
+
/*! Return a pointer to mutable memory block. */
template
inline T* data();
@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
+ virtual void set_place(platform::Place place) = 0;
};
template
@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
+ virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */
std::unique_ptr> ptr_;
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index cbdbf5335d32d55a0221728758025c9d2cb3e7d1..a9876cec2aabf7d116443b685391ee9d20bc1370 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data();
real* filterData = inputs[1].data();
real* outputData = outputs[0].data();
+ real* colData = NULL;
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
TensorShape colShape;
- real* colData = NULL;
- size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
- size_t colWidth = outputHeight * outputWidth;
- // Max col matrix height 256, Max col matrix width 1024
- size_t stepColHeight = std::min(colHeight, static_cast(256));
- size_t stepColWidth = std::min(colWidth, static_cast(2048));
+ // Max col matrix width 4096, Max col matrix size 4M.
+ size_t outputHeightSteps =
+ std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
+ size_t maxColWidth = outputHeightSteps * outputWidth;
+ size_t channelSteps =
+ std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
+ (size_t)1),
+ inputChannels / groups_);
+ size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
@@ -199,7 +202,7 @@ public:
outputHeight,
outputWidth});
- resizeBuffer(stepColHeight * stepColWidth * sizeof(real));
+ resizeBuffer(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast(memory_->getBuf());
}
@@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
- int nStride = colWidth;
- int kStride = colHeight;
+ int nStride = outputHeight * outputWidth;
+ int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) {
+ filterData = inputs[1].data();
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
- for (size_t colHeightStart = 0; colHeightStart < colHeight;
- colHeightStart += stepColHeight) {
- for (size_t colWidthStart = 0; colWidthStart < colWidth;
- colWidthStart += stepColWidth) {
- int N = std::min(colWidth - colWidthStart, stepColWidth);
- int K = std::min(colHeight - colHeightStart, stepColHeight);
+ for (size_t ic = 0; ic < inputChannels / groups_;
+ ic += channelSteps) {
+ int channels = std::min(inputChannels / groups_ - ic, channelSteps);
+ for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
+ int height = std::min(outputHeight - oh, outputHeightSteps);
+
+ int M = outputChannels / groups_;
+ int N = height * outputWidth;
+ int K = channels * filterHeight * filterWidth;
// im2col
- im2col(inputData + g * inputOffset,
+ im2col(inputData,
imShape,
colData,
colShape,
@@ -232,13 +239,12 @@ public:
paddingW(),
dilationH(),
dilationW(),
- colHeightStart,
- K,
- colWidthStart,
+ channels,
+ oh,
+ height,
N);
// gemm
- int M = outputChannels / groups_;
BlasGemm::compute(
false,
false,
@@ -246,12 +252,12 @@ public:
N,
K,
1.0f,
- filterData + g * filterOffset + colHeightStart,
+ filterData + ic * filterHeight * filterWidth,
kStride,
colData,
N,
beta_,
- outputData + g * outputOffset + colWidthStart,
+ outputData + oh * outputWidth,
nStride);
}
beta_ = 1.0;
@@ -266,17 +272,18 @@ public:
N,
K,
1.0f,
- filterData + g * filterOffset,
+ filterData,
K,
- inputData + g * inputOffset,
+ inputData,
N,
beta,
- outputData + g * outputOffset,
+ outputData,
N);
}
+ inputData += inputOffset;
+ outputData += outputOffset;
+ filterData += filterOffset;
}
- inputData += inputChannels * inputHeight * inputWidth;
- outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 36a9bcf84e4b14965c83627821b71d1c7c0da1b2..915119e291caaa223249cf8e37078723621517b0 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -111,39 +111,42 @@ public:
int paddingWidth,
int dilationHeight,
int dilationWidth,
- int colHeightStart,
- int colHeightSize,
- int colWidthStart,
- int colWidthSize) {
+ int inputChannels,
+ int colOffset,
+ int colOutputHeight,
+ int colWidth) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
- for (int colh = 0; colh < colHeightSize; colh++) {
- int wOffset = (colHeightStart + colh) % filterWidth;
- int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
- int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
-
- for (int colw = 0; colw < colWidthSize; colw++) {
- int h = (colWidthStart + colw) / outputWidth;
- int w = (colWidthStart + colw) % outputWidth;
-
- int imRowIdx = h * strideHeight + hOffset * dilationHeight;
- int imColIdx = w * strideWidth + wOffset * dilationWidth;
- if ((imRowIdx - paddingHeight) < 0 ||
- (imRowIdx - paddingHeight) >= inputHeight ||
- (imColIdx - paddingWidth) < 0 ||
- (imColIdx - paddingWidth) >= inputWidth) {
- colData[colh * colWidthSize + colw] = static_cast(0);
- } else {
- imRowIdx += c_im * inputHeight - paddingHeight;
- imColIdx -= paddingWidth;
- colData[colh * colWidthSize + colw] =
- imData[imRowIdx * inputWidth + imColIdx];
+ for (int ic = 0; ic < inputChannels; ic++) {
+ for (int oh = 0; oh < colOutputHeight; oh++) {
+ T* dstData = colData + oh * outputWidth;
+ for (int fh = 0; fh < filterHeight; fh++) {
+ for (int fw = 0; fw < filterWidth; fw++) {
+ int imRowIdx = (oh + colOffset) * strideHeight +
+ fh * dilationHeight - paddingHeight;
+ if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+ memset(dstData, 0, outputWidth * sizeof(T));
+ } else {
+ for (int ow = 0; ow < outputWidth; ow++) {
+ int imColIdx =
+ ow * strideWidth + fw * dilationWidth - paddingWidth;
+ if (imColIdx < 0 || imColIdx >= inputWidth) {
+ dstData[ow] = T(0);
+ } else {
+ dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+ }
+ }
+ }
+ dstData += colWidth;
+ }
}
}
+ colData += filterHeight * filterWidth * colWidth;
+ imData += inputHeight * inputWidth;
}
}
};
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 3ba866dcdd845403d52f7a85adfef08cbb11c305..fe44a8bf79005efb87c56f6a79f46421129bab22 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding,
dilation,
dilation,
+ channels,
0,
- height,
- 0,
- width);
+ outputHeight,
+ outputHeight * outputWidth);
autotest::TensorCheckEqual(*output1, *output2);
}
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index 3f587fa790d1d980a35224b7a42dac1845fab99c..e8e0ee210718bb266383c967699b15418b18ea08 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api
SRCS io.cc
@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib)
-add_executable(example example.cc)
-if(APPLE)
- set(OPTIONAL_LINK_FLAGS)
- if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
- set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
- endif()
- target_link_libraries(example
- -Wl,-force_load paddle_fluid
- ${OPTIONAL_LINK_FLAGS}
- ${PTOOLS_LIB})
-else()
- target_link_libraries(example
- -Wl,--start-group -Wl,--whole-archive paddle_fluid
- -Wl,--no-whole-archive -Wl,--end-group
- ${PTOOLS_LIB})
+if(WITH_TESTING)
+ add_subdirectory(tests/book)
endif()
diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc
deleted file mode 100644
index ac2aedd88b61cde18e8fb9c05d34dd62daf62ab7..0000000000000000000000000000000000000000
--- a/paddle/inference/example.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-#include
-#include "gflags/gflags.h"
-#include "paddle/framework/init.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/inference/io.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-int main(int argc, char** argv) {
- google::ParseCommandLineFlags(&argc, &argv, true);
- if (FLAGS_dirname.empty()) {
- // Example:
- // ./example --dirname=recognize_digits_mlp.inference.model
- std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
- exit(1);
- }
-
- // 1. Define place, executor, scope
- auto place = paddle::platform::CPUPlace();
- paddle::framework::InitDevices();
- auto* executor = new paddle::framework::Executor(place);
- auto* scope = new paddle::framework::Scope();
-
- std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
- std::string dirname = FLAGS_dirname;
-
- // 2. Initialize the inference program
- auto inference_program = paddle::inference::Load(*executor, *scope, dirname);
-
- // 3. Optional: perform optimization on the inference_program
-
- // 4. Get the feed_target_names and fetch_target_names
- const std::vector& feed_target_names =
- inference_program->GetFeedTargetNames();
- const std::vector& fetch_target_names =
- inference_program->GetFetchTargetNames();
-
- // 5. Generate input
- paddle::framework::LoDTensor input;
- srand(time(0));
- float* input_ptr =
- input.mutable_data({1, 784}, paddle::platform::CPUPlace());
- for (int i = 0; i < 784; ++i) {
- input_ptr[i] = rand() / (static_cast(RAND_MAX));
- }
-
- std::vector feeds;
- feeds.push_back(input);
- std::vector fetchs;
-
- // Set up maps for feed and fetch targets
- std::map feed_targets;
- std::map fetch_targets;
-
- // set_feed_variable
- for (size_t i = 0; i < feed_target_names.size(); ++i) {
- feed_targets[feed_target_names[i]] = &feeds[i];
- }
-
- // get_fetch_variable
- fetchs.resize(fetch_target_names.size());
- for (size_t i = 0; i < fetch_target_names.size(); ++i) {
- fetch_targets[fetch_target_names[i]] = &fetchs[i];
- }
-
- // Run the inference program
- executor->Run(*inference_program, scope, feed_targets, fetch_targets);
-
- // Get outputs
- for (size_t i = 0; i < fetchs.size(); ++i) {
- auto dims_i = fetchs[i].dims();
- std::cout << "dims_i:";
- for (int j = 0; j < dims_i.size(); ++j) {
- std::cout << " " << dims_i[j];
- }
- std::cout << std::endl;
- std::cout << "result:";
- float* output_ptr = fetchs[i].data();
- for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
- std::cout << " " << output_ptr[j];
- }
- std::cout << std::endl;
- }
-
- delete scope;
- delete executor;
-
- return 0;
-}
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
index f6d901381e781f161689f05315d4e0fe63610f84..60ad7af1c0a469beb6a07bf057a8647fcb98cca8 100644
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/inference/io.h"
+
#include
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace inference {
-const std::string kFeedOpType = "feed";
-
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) {
if (var->Persistable()) {
@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var,
for (size_t i = 0; i < main_program.Size(); ++i) {
const framework::BlockDesc& block = main_program.Block(i);
for (auto* op : block.AllOps()) {
- if (op->Type() == kFeedOpType) {
+ if (op->Type() == framework::kFeedOpType) {
continue;
}
for (auto input_argument_name : op->InputArgumentNames()) {
@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor,
framework::BlockDesc* load_block = load_program->MutableBlock(0);
for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) {
- LOG(INFO) << "parameter's name: " << var->Name();
+ VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape());
diff --git a/paddle/inference/io.h b/paddle/inference/io.h
index dccb700e9565b3482152cfcf399b2369edf01c7b..962b6c4e20d30de3cc28eae1c8c5c33b3ab5f6ac 100644
--- a/paddle/inference/io.h
+++ b/paddle/inference/io.h
@@ -17,18 +17,13 @@ limitations under the License. */
#include
#include
#include
-#include "paddle/framework/block_desc.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
-#include "paddle/framework/var_desc.h"
namespace paddle {
namespace inference {
-bool IsParameter(const framework::VarDesc* var,
- const framework::ProgramDesc& main_program);
-
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0e987eb0240301c58cfb74c9e995d3b525130125
--- /dev/null
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+cc_test(test_inference_recognize_digits_mlp
+ SRCS test_inference_recognize_digits.cc
+ DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+set_tests_properties(test_inference_recognize_digits_mlp
+ PROPERTIES DEPENDS test_recognize_digits)
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26dc2aee04261d9a1fd29b4d75bfacc7870c09d8
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+#include "gflags/gflags.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+template
+void TestInference(const std::string& dirname,
+ const std::vector& cpu_feeds,
+ std::vector& cpu_fetchs) {
+ // 1. Define place, executor and scope
+ auto place = Place();
+ auto executor = paddle::framework::Executor(place);
+ auto* scope = new paddle::framework::Scope();
+
+ // 2. Initialize the inference_program and load all parameters from file
+ auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+
+ // 3. Get the feed_target_names and fetch_target_names
+ const std::vector& feed_target_names =
+ inference_program->GetFeedTargetNames();
+ const std::vector& fetch_target_names =
+ inference_program->GetFetchTargetNames();
+
+ // 4. Prepare inputs: set up maps for feed targets
+ std::map feed_targets;
+ for (size_t i = 0; i < feed_target_names.size(); ++i) {
+ // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+ feed_targets[feed_target_names[i]] = cpu_feeds[i];
+ }
+
+ // 5. Define Tensor to get the outputs: set up maps for fetch targets
+ std::map fetch_targets;
+ for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+ fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+ }
+
+ // 6. Run the inference program
+ executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+ delete scope;
+}
+
+TEST(inference, recognize_digits) {
+ if (FLAGS_dirname.empty()) {
+ LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+ }
+
+ LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+ std::string dirname = FLAGS_dirname;
+
+ // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+ // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+ paddle::framework::LoDTensor input;
+ srand(time(0));
+ float* input_ptr =
+ input.mutable_data({1, 28, 28}, paddle::platform::CPUPlace());
+ for (int i = 0; i < 784; ++i) {
+ input_ptr[i] = rand() / (static_cast(RAND_MAX));
+ }
+ std::vector cpu_feeds;
+ cpu_feeds.push_back(&input);
+
+ paddle::framework::LoDTensor output1;
+ std::vector cpu_fetchs1;
+ cpu_fetchs1.push_back(&output1);
+
+ // Run inference on CPU
+ TestInference(
+ dirname, cpu_feeds, cpu_fetchs1);
+ LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+ paddle::framework::LoDTensor output2;
+ std::vector cpu_fetchs2;
+ cpu_fetchs2.push_back(&output2);
+
+ // Run inference on CUDA GPU
+ TestInference(
+ dirname, cpu_feeds, cpu_fetchs2);
+ LOG(INFO) << output2.dims();
+
+ EXPECT_EQ(output1.dims(), output2.dims());
+ EXPECT_EQ(output1.numel(), output2.numel());
+
+ float err = 1E-3;
+ int count = 0;
+ for (int64_t i = 0; i < output1.numel(); ++i) {
+ if (fabs(output1.data()[i] - output2.data()[i]) > err) {
+ count++;
+ }
+ }
+ EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
+}
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 1ec4336cabbc7d3073b7638b7484bf61e83a2dc5..cc86b12be08ba987f9682ebf3fda56c2f07fb576 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, maskMatP->getWidth());
}
- /* initialize the data_ */
- for (size_t i = 0; i < height_; i++) {
- for (size_t j = 0; j < width_; j++) {
- outData[i * outStride + j] = -(real)FLT_MAX;
- }
- }
-
/* pool max one by one */
for (size_t n = 0; n < num; ++n) { // frame by frame
if (!isContiguous()) {
@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for (size_t c = 0; c < channels; ++c) { // channel by channel
for (size_t ph = 0; ph < outputH; ++ph) {
int hstart = ph * strideH - paddingH;
- int hend = std::min(hstart + sizeY, imgSizeH);
- hstart = std::max(hstart, 0);
+ int hend = hstart + sizeY;
+ hstart = hstart < 0 ? 0 : hstart;
+ hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
for (size_t pw = 0; pw < outputW; ++pw) {
int wstart = pw * strideW - paddingW;
- int wend = std::min(wstart + sizeX, imgSizeW);
- wstart = std::max(wstart, 0);
+ int wend = wstart + sizeX;
+ wstart = wstart < 0 ? 0 : wstart;
+ wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) {
+ real tmp = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
- outData[ph * outputW + pw] = std::max(
- outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+ tmp = tmp < inputData[h * imgSizeW + w]
+ ? inputData[h * imgSizeW + w]
+ : tmp;
}
}
+ outData[ph * outputW + pw] = tmp;
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 48cf5816cce4bb5ee8e66e72c5b1acea8535ab10..000c2089c176adf8d845a56a1f98528734f47ea1 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
- cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+ op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
+ set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+ cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else()
- set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
+ set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
endif()
op_library(cond_op DEPS framework_proto tensor net_op)
@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
# Regist multiple Kernel to pybind
if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+ vol2col depthwise_conv)
+
op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
@@ -173,6 +178,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
@@ -192,3 +199,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 4e579387924a5b0499f29609bc6b1322030a3c0d..00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -82,7 +82,7 @@ struct SparseAdagradFunctor {
math::scatter::MergeAdd merge_func;
auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data();
- auto& merge_rows = grad_merge.rows();
+ framework::Vector merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
math::scatter::Mul sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
@@ -101,8 +101,8 @@ struct SparseAdagradFunctor {
SparseAdagradFunctorKernel<
T, 256><<(context)
- .stream()>>>(grad_merge_data, grad_merge.rows().data(),
- lr, param_data, moment_data, grad_width,
+ .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
+ param_data, moment_data, grad_width,
epsilon);
}
};
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 9cc34bdded780e61e8700eb4fa4a295c84fb48bc..bf536687d398b8342e6ae76a07c11e5fe47483e0 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel {
merge_func(ctx.template device_context(), grad);
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data();
- auto* rows = grad_merge.rows().data();
+ int64_t* rows = nullptr;
+ if (platform::is_gpu_place(ctx.GetPlace())) {
+ rows = grad_merge.mutable_rows()->cuda_data();
+ } else {
+ rows = grad_merge.mutable_rows()->data();
+ }
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor functor(
diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc
index 83c8778fe4cec4d9d80de691e117a39fdd92f494..1e6fa2091de25218e2bdafeb740ce884234638a5 100644
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of BipartiteMatch should not be null.");
+ PADDLE_ENFORCE(
+ ctx->HasOutput("ColToRowMatchIndices"),
+ "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+ PADDLE_ENFORCE(
+ ctx->HasOutput("ColToRowMatchDist"),
+ "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
auto dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("ColToRowMatchIndices", dims);
- ctx->SetOutputDim("ColToRowMatchDis", dims);
+ ctx->SetOutputDim("ColToRowMatchDist", dims);
}
};
@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input("DistMat");
auto* match_indices = context.Output("ColToRowMatchIndices");
- auto* match_dist = context.Output("ColToRowMatchDis");
+ auto* match_dist = context.Output("ColToRowMatchDist");
auto& dev_ctx = context.device_context();
@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise, it means B[j] is matched to row "
"ColToRowMatchIndices[i][j] in i-th instance. The row number of "
"i-th instance is saved in ColToRowMatchIndices[i][j].");
- AddOutput("ColToRowMatchDis",
+ AddOutput("ColToRowMatchDist",
"(Tensor) A 2-D Tensor with shape [N, M] in float type. "
"N is batch size. If ColToRowMatchIndices[i][j] is -1, "
- "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+ "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchIndices[i][j] = d, and the row offsets of each "
"instance are called LoD. Then "
- "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+ "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
AddComment(R"DOC(
This operator is a greedy bipartite matching algorithm, which is used to
obtain the matching with the maximum distance based on the input
diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..539813d4858b8faef386047f9ef64aa232aefca1
--- /dev/null
+++ b/paddle/operators/box_coder_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+ "Input(PriorBox) of BoxCoderOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
+ "Input(PriorBoxVar) of BoxCoderOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+ "Input(TargetBox) of BoxCoderOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+ "Output(OutputBox) of BoxCoderOp should not be null.");
+
+ auto prior_box_dims = ctx->GetInputDim("PriorBox");
+ auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+ auto target_box_dims = ctx->GetInputDim("TargetBox");
+
+ PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+ "The rank of Input of PriorBoxVar must be 2");
+ PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
+ PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+ PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+ "The rank of Input of TargetBox must be 2");
+ PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+ "The shape of TargetBox is [M, 4]");
+
+ GetBoxCodeType(ctx->Attrs().Get("code_type"));
+
+ ctx->SetOutputDim(
+ "OutputBox",
+ framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+ ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+ }
+};
+
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput(
+ "PriorBox",
+ "(Tensor, default Tensor) "
+ "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+ "each box is represented as [xmin, ymin, xmax, ymax], "
+ "[xmin, ymin] is the left top coordinate of the anchor box, "
+ "if the input is image feature map, they are close to the origin "
+ "of the coordinate system. [xmax, ymax] is the right bottom "
+ "coordinate of the anchor box.");
+ AddInput("PriorBoxVar",
+ "(Tensor, default Tensor) "
+ "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
+ "of variance.");
+ AddInput(
+ "TargetBox",
+ "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+ "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
+ "[xmin, ymin] is the left top coordinate of the box if the input "
+ "is image feature map, they are close to the origin of the coordinate "
+ "system. [xmax, ymax] is the right bottom coordinate of the box. "
+ "This tensor can contain LoD information to represent a batch "
+ "of inputs. One instance of this batch can contain different "
+ "numbers of entities.");
+ AddAttr("code_type",
+ "(string, default encode_center_size) "
+ "the code type used with the target box")
+ .SetDefault("encode_center_size")
+ .InEnum({"encode_center_size", "decode_center_size"});
+ AddOutput(
+ "OutputBox",
+ "(LoDTensor or Tensor) "
+ "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
+ "representing the result of N target boxes encoded/decoded with "
+ "M Prior boxes and variances.");
+
+ AddComment(R"DOC(
+Bounding Box Coder Operator.
+Encode/Decode the target bounding box with the priorbox information.
+The Encoding schema described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv
+oh = log(abs(th / ph)) / phv
+The Decoding schema described below:
+ox = (pw * pxv * tx * + px) - tw / 2
+oy = (ph * pyv * ty * + py) - th / 2
+ow = exp(pwv * tw) * pw + tw / 2
+oh = exp(phv * th) * ph + th / 2
+where tx, ty, tw, th denote the target box's center coordinates, width and
+height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
+center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
+of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
+width and height.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel,
+ ops::BoxCoderKernel);
diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..98bd93457fafb49f2af5e1ff258fbfa9f9985600
--- /dev/null
+++ b/paddle/operators/box_coder_op.cu
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+template
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+ const T* prior_box_var_data,
+ const T* target_box_data, const int row,
+ const int col, const int len,
+ T* output) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < row * col) {
+ const int row_idx = idx / col;
+ const int col_idx = idx % col;
+ T prior_box_width =
+ prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
+ T prior_box_height =
+ prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+ T prior_box_center_x =
+ (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+ T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+ prior_box_data[col_idx * len + 1]) /
+ 2;
+
+ T target_box_center_x =
+ (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
+ 2;
+ T target_box_center_y = (target_box_data[row_idx * len + 3] +
+ target_box_data[row_idx * len + 1]) /
+ 2;
+ T target_box_width =
+ target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
+ T target_box_height =
+ target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
+
+ output[idx * len] = (target_box_center_x - prior_box_center_x) /
+ prior_box_width / prior_box_var_data[col_idx * len];
+ output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
+ prior_box_height /
+ prior_box_var_data[col_idx * len + 1];
+ output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
+ prior_box_var_data[col_idx * len + 2];
+ output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
+ prior_box_var_data[col_idx * len + 3];
+ }
+}
+
+template
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+ const T* prior_box_var_data,
+ const T* target_box_data, const int row,
+ const int col, const int len,
+ T* output) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < row * col) {
+ const int row_idx = idx / col;
+ const int col_idx = idx % col;
+ T prior_box_width =
+ prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
+ T prior_box_height =
+ prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+ T prior_box_center_x =
+ (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+ T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+ prior_box_data[col_idx * len + 1]) /
+ 2;
+
+ T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
+ target_box_data[row_idx * len + 2]) *
+ prior_box_width;
+ T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
+ target_box_data[row_idx * len + 3]) *
+ prior_box_height;
+ T target_box_center_x = prior_box_var_data[col_idx * len] *
+ target_box_data[row_idx * len] *
+ prior_box_width +
+ prior_box_center_x;
+ T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
+ target_box_data[row_idx * len + 1] *
+ prior_box_height +
+ prior_box_center_y;
+
+ output[idx * len] = target_box_center_x - target_box_width / 2;
+ output[idx * len + 1] = target_box_center_y - target_box_height / 2;
+ output[idx * len + 2] = target_box_center_x + target_box_width / 2;
+ output[idx * len + 3] = target_box_center_y + target_box_height / 2;
+ }
+}
+
+template