提交 446198da 编写于 作者: L Luo Tao

Merge branch 'develop' into inference_lib_dist

...@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F ...@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
...@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas ...@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost include(external/boost) # download boost
include(external/any) # download libn::any include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
......
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Performance for Distributed vgg16
## Test Result
### Hardware Infomation
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Single Node Single Thread
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples/ sec
| PServer Count | 3 | 6 |10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
*The performance gap between Fuild and v2 comes from the network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbos Logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.profiler as profiler
import argparse
import functools
import os
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument('--device_id', type=int, default=0, help="The device id.")
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data order, now only support NCHW.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--local',
type=str2bool,
default=True,
help='Whether to run as local mode.')
args = parser.parse_args()
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def main():
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = accuracy.metrics + accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
# Initialize executor
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
args.device_id)
exe = fluid.Executor(place)
# test
def test(exe):
accuracy.reset(exe)
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data})
return accuracy.eval(exe)
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
accuracy.reset(exe)
with profiler.profiler("CPU", 'total') as prof:
for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics)
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
% (pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = accuracy.eval(exe)
pass_test_acc = test(exe)
print(
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
% (pass_id, num_samples / pass_elapsed, pass_train_acc,
pass_test_acc))
if args.local:
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, "6174"]))
pserver_endpoints = ",".join(eplist)
print("pserver endpoints: ", pserver_endpoints)
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
current_endpoint = os.getenv(
"POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print("starting server side startup")
exe.run(pserver_startup)
print("starting parameter server...")
exe.run(pserver_prog)
elif training_role == "TRAINER":
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
# TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
exe.run(fluid.default_startup_program())
train_loop(exe, trainer_prog)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
print_arguments()
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the channel of input feature is 3
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
...@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO ...@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR}) include_directories(${BOOST_INCLUDE_DIR})
......
...@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME) ...@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif() endif()
if (cc_library_DEPS) if (cc_library_DEPS)
# Don't need link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif() endif()
...@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME) ...@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -457,12 +468,12 @@ endfunction() ...@@ -457,12 +468,12 @@ endfunction()
function(py_test TARGET_NAME) function(py_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options STATIC static SHARED shared) set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS) set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
......
...@@ -87,6 +87,11 @@ roi_pool ...@@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool .. autoclass:: paddle.v2.layer.roi_pool
:noindex: :noindex:
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
Norm Layer Norm Layer
========== ==========
...@@ -133,6 +138,11 @@ grumemory ...@@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory .. autoclass:: paddle.v2.layer.grumemory
:noindex: :noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Recurrent Layer Group Recurrent Layer Group
===================== =====================
...@@ -340,6 +350,11 @@ bilinear_interp ...@@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp
:noindex: :noindex:
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
dot_prod dot_prod
--------- ---------
.. autoclass:: paddle.v2.layer.dot_prod .. autoclass:: paddle.v2.layer.dot_prod
...@@ -402,6 +417,11 @@ scale_shift ...@@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift .. autoclass:: paddle.v2.layer.scale_shift
:noindex: :noindex:
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Sampling Layers Sampling Layers
=============== ===============
...@@ -420,22 +440,6 @@ multiplex ...@@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex .. autoclass:: paddle.v2.layer.multiplex
:noindex: :noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
.. _api_v2.layer_costs: .. _api_v2.layer_costs:
Cost Layers Cost Layers
...@@ -526,6 +530,11 @@ multibox_loss ...@@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss .. autoclass:: paddle.v2.layer.multibox_loss
:noindex: :noindex:
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
Check Layer Check Layer
============ ============
...@@ -534,31 +543,10 @@ eos ...@@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos .. autoclass:: paddle.v2.layer.eos
:noindex: :noindex:
Miscs Activation
===== ==========
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
Activation with learnable parameter
===================================
prelu prelu
-------- --------
.. autoclass:: paddle.v2.layer.prelu .. autoclass:: paddle.v2.layer.prelu
:noindex: :noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
...@@ -73,3 +73,10 @@ wmt14 ...@@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14 .. automodule:: paddle.v2.dataset.wmt14
:members: :members:
:noindex: :noindex:
wmt16
+++++
.. automodule:: paddle.v2.dataset.wmt16
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
DataFeeder data_feeder
=========== ===========
DataFeeder DataFeeder
----------- ----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder .. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
:members:
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Evaluator !DO NOT EDIT THIS FILE MANUALLY!
===========
=========
Evaluator evaluator
----------- =========
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
:members:
:noindex: :noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
:members:
:noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Executor !DO NOT EDIT THIS FILE MANUALLY!
===========
========
executor
========
Executor Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
:members:
:noindex:
global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
:noindex:
scope_guard
----------- -----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor .. autofunction:: paddle.v2.fluid.executor.scope_guard
:noindex:
switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex: :noindex:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types
import paddle.v2.fluid as fluid
def parse_arg():
parser = argparse.ArgumentParser()
parser.add_argument('--submodules', nargs="*")
parser.add_argument(
'module', type=str, help='Generate the documentation of which module')
return parser.parse_args()
class DocGenerator(object):
def __init__(self, module_name, stream=sys.stdout):
self.stream = stream
self.module_name = module_name
if not hasattr(fluid, module_name):
raise ValueError("Cannot find fluid.{0}".format(module_name))
else:
self.module = getattr(fluid, module_name)
self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
''')
self._print_header_(module_name, dot='=', is_title=True)
def print_submodule(self, submodule_name):
submodule = getattr(self.module, submodule_name)
if submodule is None:
raise ValueError("Cannot find submodule {0}".format(submodule_name))
self.print_section(submodule_name)
for item in submodule.__all__:
self.print_item(item)
def print_current_module(self):
for item in self.module.__all__:
self.print_item(item)
def print_section(self, name):
self._print_header_(name, dot='=', is_title=False)
def print_item(self, name):
item = getattr(self.module, name)
if isinstance(item, types.TypeType):
self.print_class(name)
elif isinstance(item, types.FunctionType):
self.print_method(name)
else:
raise RuntimeError("Unsupported item {0}".format(name))
def print_class(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
:members:
:noindex:
'''.format(self.module_name, name))
def print_method(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
:noindex:
'''.format(self.module_name, name))
def _print_header_(self, name, dot, is_title):
dot_line = dot * len(name)
if is_title:
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write(name)
self.stream.write('\n')
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write('\n')
def main():
args = parse_arg()
gen = DocGenerator(args.module)
if args.submodules is None:
gen.print_current_module()
else:
for submodule_name in args.submodules:
gen.print_submodule(submodule_name)
if __name__ == '__main__':
main()
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
do
python gen_doc.py ${module} > ${module}.rst
done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Initializer initializer
=========== ===========
Constant
--------
.. autoclass:: paddle.v2.fluid.initializer.Constant
Initializer :members:
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
:noindex: :noindex:
Uniform
-------
.. autoclass:: paddle.v2.fluid.initializer.Uniform
UniformInitializer :members:
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
:noindex: :noindex:
Normal
------
XavierInitializer .. autoclass:: paddle.v2.fluid.initializer.Normal
----------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
:noindex: :noindex:
Xavier
------
MSRAInitializer .. autoclass:: paddle.v2.fluid.initializer.Xavier
--------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
IO !DO NOT EDIT THIS FILE MANUALLY!
===========
==
io
==
save_vars
---------
is_parameter .. autofunction:: paddle.v2.fluid.io.save_vars
:noindex:
save_params
----------- -----------
.. autofunction:: paddle.v2.fluid.io.is_parameter
.. autofunction:: paddle.v2.fluid.io.save_params
:noindex:
save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
:noindex:
load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
:noindex:
load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
:noindex: :noindex:
load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
:noindex:
save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
:noindex:
load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
:noindex:
get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
:noindex:
========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Layers !DO NOT EDIT THIS FILE MANUALLY!
==========
======
layers
======
fc control_flow
--- ============
.. autofunction:: paddle.v2.fluid.layers.fc
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
embedding merge_lod_tensor
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.embedding
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex: :noindex:
dynamic_lstm BlockGuard
------------ ----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
.. autoclass:: paddle.v2.fluid.layers.BlockGuard
:members:
:noindex: :noindex:
dynamic_lstmp BlockGuardWithCompletion
------------- ------------------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
:members:
:noindex: :noindex:
dynamic_gru StaticRNNMemoryLink
----------- -------------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
:members:
:noindex: :noindex:
data WhileGuard
---- ----------
.. autofunction:: paddle.v2.fluid.layers.data
.. autoclass:: paddle.v2.fluid.layers.WhileGuard
:members:
:noindex: :noindex:
mean While
---- -----
.. autofunction:: paddle.v2.fluid.layers.mean
.. autoclass:: paddle.v2.fluid.layers.While
:members:
:noindex: :noindex:
mul lod_rank_table
--- --------------
.. autofunction:: paddle.v2.fluid.layers.mul
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
elementwise_add max_sequence_len
--------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
elementwise_sub topk
--------------- ----
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
elementwise_mul lod_tensor_to_array
--------------- -------------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
elementwise_div array_to_lod_tensor
--------------- -------------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
increment
---------
dropout .. autofunction:: paddle.v2.fluid.layers.increment
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
array_write
-----------
reshape .. autofunction:: paddle.v2.fluid.layers.array_write
--------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex: :noindex:
create_array
------------
sigmoid .. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
less_than
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex: :noindex:
array_read
----------
scale .. autofunction:: paddle.v2.fluid.layers.array_read
--------- :noindex:
.. autofunction:: paddle.v2.fluid.layers.scale
shrink_memory
-------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex: :noindex:
array_length
------------
transpose .. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
IfElse
------
.. autoclass:: paddle.v2.fluid.layers.IfElse
:members:
:noindex:
DynamicRNN
----------
.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
:members:
:noindex:
ConditionalBlock
----------------
.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
:members:
:noindex:
StaticRNN
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.transpose
.. autoclass:: paddle.v2.fluid.layers.StaticRNN
:members:
:noindex: :noindex:
reorder_lod_tensor_by_rank
--------------------------
sigmoid_cross_entropy_with_logits .. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
:noindex: :noindex:
ParallelDo
----------
cast .. autoclass:: paddle.v2.fluid.layers.ParallelDo
:members:
:noindex:
Print
-----
.. autofunction:: paddle.v2.fluid.layers.Print
:noindex:
device
======
get_places
----------
.. autofunction:: paddle.v2.fluid.layers.get_places
:noindex:
io
==
data
---- ----
.. autofunction:: paddle.v2.fluid.layers.cast
.. autofunction:: paddle.v2.fluid.layers.data
:noindex: :noindex:
BlockGuardServ
--------------
concat .. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
------- :members:
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
ListenAndServ
-------------
sums .. autoclass:: paddle.v2.fluid.layers.ListenAndServ
:members:
:noindex:
Send
---- ----
.. autofunction:: paddle.v2.fluid.layers.sums
.. autofunction:: paddle.v2.fluid.layers.Send
:noindex: :noindex:
nn
==
linear_chain_crf fc
---------------- --
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
.. autofunction:: paddle.v2.fluid.layers.fc
:noindex: :noindex:
embedding
---------
assign
-------
.. autofunction:: paddle.v2.fluid.layers.embedding .. autofunction:: paddle.v2.fluid.layers.embedding
:noindex: :noindex:
dynamic_lstm
------------
split_lod_tensor .. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
dynamic_lstmp
-------------
merge_lod_tensor .. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex:
linear_chain_crf
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
crf_decoding
------------
.. autofunction:: paddle.v2.fluid.layers.crf_decoding
:noindex: :noindex:
cos_sim cos_sim
-------- -------
.. autofunction:: paddle.v2.fluid.layers.cos_sim .. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex: :noindex:
cross_entropy cross_entropy
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy .. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex: :noindex:
square_error_cost square_error_cost
----------------- -----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost .. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex: :noindex:
accuracy accuracy
--------- --------
.. autofunction:: paddle.v2.fluid.layers.accuracy .. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex: :noindex:
chunk_eval
----------
.. autofunction:: paddle.v2.fluid.layers.chunk_eval
:noindex:
sequence_conv sequence_conv
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv .. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex: :noindex:
conv2d conv2d
------ ------
.. autofunction:: paddle.v2.fluid.layers.conv2d .. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex: :noindex:
sequence_pool sequence_pool
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool .. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex: :noindex:
pool2d
------
sequence_first_step .. autofunction:: paddle.v2.fluid.layers.pool2d
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex: :noindex:
batch_norm
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
sequence_last_step beam_search_decode
------------------ ------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex: :noindex:
conv2d_transpose
----------------
pool2d .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
------
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex: :noindex:
sequence_expand
---------------
batch_norm .. autofunction:: paddle.v2.fluid.layers.sequence_expand
:noindex:
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex:
reduce_sum
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex:
reduce_mean
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex: :noindex:
reduce_max
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex:
beam_search_decode reduce_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex:
sequence_first_step
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex:
sequence_last_step
------------------ ------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
:noindex:
dropout
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
split
-----
lod_rank_table .. autofunction:: paddle.v2.fluid.layers.split
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
ctc_greedy_decoder
------------------
max_sequence_len .. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
edit_distance
-------------
topk .. autofunction:: paddle.v2.fluid.layers.edit_distance
-----
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
l2_normalize
------------
lod_tensor_to_array .. autofunction:: paddle.v2.fluid.layers.l2_normalize
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
warpctc
-------
.. autofunction:: paddle.v2.fluid.layers.warpctc
:noindex:
sequence_reshape
----------------
fill_constant .. autofunction:: paddle.v2.fluid.layers.sequence_reshape
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex: :noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
:noindex:
fill_constant_batch_size_like im2sequence
----------------------------- -----------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex: :noindex:
nce
---
ones .. autofunction:: paddle.v2.fluid.layers.nce
----
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex: :noindex:
beam_search
-----------
zeros .. autofunction:: paddle.v2.fluid.layers.beam_search
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
row_conv
--------
increment .. autofunction:: paddle.v2.fluid.layers.row_conv
---------
.. autofunction:: paddle.v2.fluid.layers.increment
:noindex: :noindex:
multiplex
---------
array_write .. autofunction:: paddle.v2.fluid.layers.multiplex
-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex: :noindex:
ops
===
mean
----
create_array .. autofunction:: paddle.v2.fluid.layers.mean
------------
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex: :noindex:
mul
---
less_than .. autofunction:: paddle.v2.fluid.layers.mul
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex: :noindex:
reshape
-------
array_read .. autofunction:: paddle.v2.fluid.layers.reshape
----------
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex: :noindex:
scale
-----
shrink_memory .. autofunction:: paddle.v2.fluid.layers.scale
--------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex: :noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
array_length .. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
-------------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex: :noindex:
elementwise_add
---------------
conv2d_transpose .. autofunction:: paddle.v2.fluid.layers.elementwise_add
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex: :noindex:
elementwise_div
sequence_expand
--------------- ---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex: :noindex:
elementwise_sub
---------------
gru_unit .. autofunction:: paddle.v2.fluid.layers.elementwise_sub
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex: :noindex:
elementwise_mul
---------------
lstm_unit .. autofunction:: paddle.v2.fluid.layers.elementwise_mul
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex: :noindex:
elementwise_max
---------------
sequence_softmax .. autofunction:: paddle.v2.fluid.layers.elementwise_max
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex: :noindex:
elementwise_min
---------------
reduce_sum .. autofunction:: paddle.v2.fluid.layers.elementwise_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex: :noindex:
elementwise_pow
---------------
reduce_mean .. autofunction:: paddle.v2.fluid.layers.elementwise_pow
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex: :noindex:
clip
----
reduce_max .. autofunction:: paddle.v2.fluid.layers.clip
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex: :noindex:
clip_by_norm
------------
reduce_min .. autofunction:: paddle.v2.fluid.layers.clip_by_norm
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex: :noindex:
sequence_softmax
----------------
split .. autofunction:: paddle.v2.fluid.layers.sequence_softmax
-----
.. autofunction:: paddle.v2.fluid.layers.split
:noindex: :noindex:
sigmoid
-------
matmul .. autofunction:: paddle.v2.fluid.layers.sigmoid
------
.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex: :noindex:
logsigmoid logsigmoid
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid .. autofunction:: paddle.v2.fluid.layers.logsigmoid
:noindex: :noindex:
exp exp
--- ---
.. autofunction:: paddle.v2.fluid.layers.exp .. autofunction:: paddle.v2.fluid.layers.exp
:noindex: :noindex:
relu relu
---- ----
.. autofunction:: paddle.v2.fluid.layers.relu .. autofunction:: paddle.v2.fluid.layers.relu
:noindex: :noindex:
tanh tanh
---- ----
.. autofunction:: paddle.v2.fluid.layers.tanh .. autofunction:: paddle.v2.fluid.layers.tanh
:noindex: :noindex:
tanh_shrink tanh_shrink
----------- -----------
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink .. autofunction:: paddle.v2.fluid.layers.tanh_shrink
:noindex: :noindex:
softshrink softshrink
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.softshrink .. autofunction:: paddle.v2.fluid.layers.softshrink
:noindex: :noindex:
sqrt sqrt
---- ----
.. autofunction:: paddle.v2.fluid.layers.sqrt .. autofunction:: paddle.v2.fluid.layers.sqrt
:noindex: :noindex:
abs abs
---- ---
.. autofunction:: paddle.v2.fluid.layers.abs .. autofunction:: paddle.v2.fluid.layers.abs
:noindex: :noindex:
ceil ceil
---- ----
.. autofunction:: paddle.v2.fluid.layers.ceil .. autofunction:: paddle.v2.fluid.layers.ceil
:noindex: :noindex:
floor floor
----- -----
.. autofunction:: paddle.v2.fluid.layers.floor .. autofunction:: paddle.v2.fluid.layers.floor
:noindex: :noindex:
round round
----- -----
.. autofunction:: paddle.v2.fluid.layers.round .. autofunction:: paddle.v2.fluid.layers.round
:noindex: :noindex:
reciprocal reciprocal
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.reciprocal .. autofunction:: paddle.v2.fluid.layers.reciprocal
:noindex: :noindex:
log log
--- ---
.. autofunction:: paddle.v2.fluid.layers.log .. autofunction:: paddle.v2.fluid.layers.log
:noindex: :noindex:
square square
------ ------
.. autofunction:: paddle.v2.fluid.layers.square .. autofunction:: paddle.v2.fluid.layers.square
:noindex: :noindex:
softplus softplus
-------- --------
.. autofunction:: paddle.v2.fluid.layers.softplus .. autofunction:: paddle.v2.fluid.layers.softplus
:noindex: :noindex:
softsign softsign
--------- --------
.. autofunction:: paddle.v2.fluid.layers.softsign .. autofunction:: paddle.v2.fluid.layers.softsign
:noindex: :noindex:
brelu brelu
----- -----
.. autofunction:: paddle.v2.fluid.layers.brelu .. autofunction:: paddle.v2.fluid.layers.brelu
:noindex: :noindex:
leaky_relu leaky_relu
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.leaky_relu .. autofunction:: paddle.v2.fluid.layers.leaky_relu
:noindex: :noindex:
soft_relu soft_relu
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.soft_relu .. autofunction:: paddle.v2.fluid.layers.soft_relu
:noindex: :noindex:
elu elu
---- ---
.. autofunction:: paddle.v2.fluid.layers.elu .. autofunction:: paddle.v2.fluid.layers.elu
:noindex: :noindex:
relu6 relu6
----- -----
.. autofunction:: paddle.v2.fluid.layers.relu6 .. autofunction:: paddle.v2.fluid.layers.relu6
:noindex: :noindex:
pow pow
---- ---
.. autofunction:: paddle.v2.fluid.layers.pow .. autofunction:: paddle.v2.fluid.layers.pow
:noindex: :noindex:
stanh
-----
.. autofunction:: paddle.v2.fluid.layers.stanh
:noindex:
hard_shrink hard_shrink
----------- -----------
.. autofunction:: paddle.v2.fluid.layers.hard_shrink .. autofunction:: paddle.v2.fluid.layers.hard_shrink
:noindex: :noindex:
thresholded_relu thresholded_relu
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu .. autofunction:: paddle.v2.fluid.layers.thresholded_relu
:noindex: :noindex:
hard_sigmoid hard_sigmoid
------------- ------------
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid .. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
:noindex: :noindex:
swish swish
------ -----
.. autofunction:: paddle.v2.fluid.layers.swish .. autofunction:: paddle.v2.fluid.layers.swish
:noindex: :noindex:
im2sequence tensor
======
create_tensor
-------------
.. autofunction:: paddle.v2.fluid.layers.create_tensor
:noindex:
create_parameter
----------------
.. autofunction:: paddle.v2.fluid.layers.create_parameter
:noindex:
create_global_var
-----------------
.. autofunction:: paddle.v2.fluid.layers.create_global_var
:noindex:
cast
----
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
------ ------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
edit_distance sums
--------------- ----
.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex: :noindex:
ctc_greedy_decoder assign
--------------- ------
.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
.. autofunction:: paddle.v2.fluid.layers.assign
:noindex: :noindex:
l2_normalize fill_constant_batch_size_like
------------ -----------------------------
.. autofunction:: paddle.v2.fluid.layers.l2_normalize
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex: :noindex:
sequence_reshape fill_constant
---------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex: :noindex:
row_conv ones
-------- ----
.. autofunction:: paddle.v2.fluid.layers.row_conv
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex: :noindex:
multiplex zeros
--------- -----
.. autofunction:: paddle.v2.fluid.layers.multiplex
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Nets !DO NOT EDIT THIS FILE MANUALLY!
===========
====
nets
====
simple_img_conv_pool simple_img_conv_pool
-------------------- --------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
img_conv_group
---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex: :noindex:
sequence_conv_pool sequence_conv_pool
------------------ ------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex: :noindex:
glu glu
--- ---
.. autofunction:: paddle.v2.fluid.nets.glu .. autofunction:: paddle.v2.fluid.nets.glu
:noindex: :noindex:
scaled_dot_product_attention scaled_dot_product_attention
---------------------------- ----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention .. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Optimizer !DO NOT EDIT THIS FILE MANUALLY!
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
=========
optimizer
=========
SGDOptimizer SGD
----------- ---
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
.. autoclass:: paddle.v2.fluid.optimizer.SGD
:members:
:noindex:
Momentum
--------
MomentumOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Momentum
----------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex: :noindex:
Adagrad
-------
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
AdagradOptimizer :members:
----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex: :noindex:
Adam
----
AdamOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adam
------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex: :noindex:
Adamax
------
AdamaxOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adamax
----------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
:noindex: :noindex:
DecayedAdagrad
--------------
DecayedAdagradOptimizer .. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
----------------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
param_attr
==========
ParamAttr ParamAttr
=========== ---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
:members:
:noindex:
WeightNormParamAttr
-------------------
ParamAttr .. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
----------- :members:
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Profiler !DO NOT EDIT THIS FILE MANUALLY!
===========
========
profiler
========
cuda_profiler
-------------
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler .. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex: :noindex:
reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
:noindex:
profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Regularizer regularizer
=========== ===========
WeightDecayRegularizer append_regularization_ops
---------------------- -------------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer .. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex: :noindex:
L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
:members:
:noindex:
L1DecayRegularizer L2Decay
------------------- -------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
:members:
:noindex:
...@@ -140,7 +140,19 @@ TODO by Assignees ...@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM ### Beam Search with CTC and LM
TODO by Assignees <div align="center">
<img src="image/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
- Such external scorer consists of language model, word count or any other custom scorers.
- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
## Future Work ## Future Work
...@@ -153,3 +165,4 @@ TODO by Assignees ...@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
1. It runs the first matched case, or the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
The above program should print and print only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B ...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
"WITH_TESTING", "是否开启单元测试", "ON" "WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF" "WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
......
...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like: ...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON" "WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON" "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON" "WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "ON" "WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF" "WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
......
...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note ...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
国内用户可以使用下面的镜像源来加速访问:
.. code-block: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
然后在浏览器中输入以下网址: 然后在浏览器中输入以下网址:
.. code-block:: text .. code-block:: text
......
...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command: ...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
For users in China, we provide a faster mirror:
.. code-block: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
Then, you would back and paste the address into the local browser: Then, you would back and paste the address into the local browser:
.. code-block:: text .. code-block:: text
......
...@@ -92,11 +92,11 @@ paddle.init( ...@@ -92,11 +92,11 @@ paddle.init(
参数说明 参数说明
- use_gpu: **可选,默认False**,是否启用GPU训练 - use_gpu: **可选,默认False**,是否启用GPU训练
- trainer_count:**必选,默认1**,当前训练任务trainer总个数 - trainer_count:**必选,默认1**,当前trainer的线程数目
- port:**必选,默认7164**,连接到pserver的端口 - port:**必选,默认7164**,连接到pserver的端口
- ports_num:**必选,默认1**,连接到pserver的端口个数 - ports_num:**必选,默认1**,连接到pserver的端口个数
- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 - ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 - num_gradient_servers:**必选,默认1**,当前训练任务trainer总数
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 - trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 - pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
......
...@@ -95,11 +95,11 @@ paddle.init( ...@@ -95,11 +95,11 @@ paddle.init(
Parameter Description Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training. - use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job. - trainer_count: **required, default 1**, number of threads in current trainer.
- port: **required, default 7164**, port to connect to parameter server. - port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication. - ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
- num_gradient_servers: **required, default 1**, total number of gradient server. - num_gradient_servers: **required, default 1**, number of trainers in current job.
- trainer_id: **required, default 0**, ID for every trainer, start from 0. - trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
......
...@@ -8,4 +8,3 @@ PaddlePaddle 文档 ...@@ -8,4 +8,3 @@ PaddlePaddle 文档
howto/index_cn.rst howto/index_cn.rst
api/index_cn.rst api/index_cn.rst
faq/index_cn.rst faq/index_cn.rst
mobile/index_cn.rst
...@@ -7,4 +7,3 @@ PaddlePaddle Documentation ...@@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst getstarted/index_en.rst
howto/index_en.rst howto/index_en.rst
api/index_en.rst api/index_en.rst
mobile/index_en.rst
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_ios_en.md
cross_compiling_for_raspberry_en.md
...@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) ...@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc) cc_test(variable_test SRCS variable_test.cc)
......
...@@ -23,12 +23,10 @@ namespace framework { ...@@ -23,12 +23,10 @@ namespace framework {
template <typename T> template <typename T>
class Channel { class Channel {
public: public:
virtual void Send(T*) = 0; virtual bool Send(T*) = 0;
virtual void Receive(T*) = 0; virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0; virtual size_t Cap() = 0;
virtual void Close() = 0;
// Don't delete channels; instead, call Channel::Close.
protected:
virtual ~Channel() {} virtual ~Channel() {}
}; };
...@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) { ...@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) {
template <typename T> template <typename T>
void CloseChannel(Channel<T>* ch) { void CloseChannel(Channel<T>* ch) {
if (ch->Cap() > 0) { ch->Close();
delete dynamic_cast<details::Buffered<T>*>(ch);
} else {
delete dynamic_cast<details::UnBuffered<T>*>(ch);
}
} }
} // namespace framework } // namespace framework
......
...@@ -14,13 +14,329 @@ limitations under the License. */ ...@@ -14,13 +14,329 @@ limitations under the License. */
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
#include <chrono>
#include <thread>
#include "gtest/gtest.h" #include "gtest/gtest.h"
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
TEST(Channel, MakeAndClose) { TEST(Channel, MakeAndClose) {
using paddle::framework::Channel; using paddle::framework::details::Buffered;
using paddle::framework::MakeChannel; using paddle::framework::details::UnBuffered;
using paddle::framework::CloseChannel; {
// MakeChannel should return a buffered channel is buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel is buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // should not block
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out), true); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
delete ch;
}
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
size_t sum = 0;
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size)
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
else
EXPECT_EQ(ch->Send(&i), false);
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
EXPECT_EQ(sum, 45U);
CloseChannel(ch);
t.join();
delete ch;
}
TEST(Channel, SimpleUnbufferedChannelTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
// Explicitly close the thread
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Verify that only 1 send was successful
ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any senders waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data = 10;
EXPECT_EQ(ch->Send(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
// Send should block after three iterations
// since we only have three receivers.
std::thread t([&]() {
// Try to send more number of times
// than receivers
for (int i = 0; i < 4; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 3; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
t.join();
delete ch;
}
TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
unsigned sum_receive = 0;
// The receiver should block after 5
// iterations, since there are only 5 senders.
std::thread t([&]() {
for (int i = 0; i < 8; i++) {
int recv;
ch->Receive(&recv); // should block after the fifth iteration.
EXPECT_EQ(recv, i);
sum_receive += i;
}
});
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
for (int i = 5; i < 8; i++) {
ch->Send(&i);
sum_send += i;
}
Channel<int>* ch = MakeChannel<int>(10);
CloseChannel(ch); CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 28U);
EXPECT_EQ(sum_receive, 28U);
delete ch;
} }
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <mutex> #include <mutex>
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
#include "paddle/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel<T> { ...@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*); friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public: public:
virtual void Send(T*); virtual bool Send(T*);
virtual void Receive(T*); virtual bool Receive(T*);
virtual size_t Cap() { return cap_; } virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private: private:
size_t cap_; size_t cap_;
...@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel<T> { ...@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel<T> {
std::condition_variable empty_cond_var_; std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_; std::condition_variable full_cond_var_;
std::deque<T> channel_; std::deque<T> channel_;
bool closed_;
Buffered(size_t cap) : cap_(cap) {} Buffered(size_t cap) : cap_(cap), closed_(false) {
virtual ~Buffered(); PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllSenders(std::unique_lock<std::mutex>*); void NotifyAllParticipants(std::unique_lock<std::mutex>*);
}; };
template <typename T> template <typename T>
void Buffered<T>::Send(T* item) { bool Buffered<T>::Send(T* item) {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; }); full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
bool ret = false;
if (!closed_) {
channel_.push_back(std::move(*item)); channel_.push_back(std::move(*item));
lock.unlock(); lock.unlock();
empty_cond_var_.notify_one(); empty_cond_var_.notify_one();
ret = true;
}
return ret;
} }
template <typename T> template <typename T>
void Buffered<T>::Receive(T* item) { bool Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); }); empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
bool ret = false;
if (!closed_) {
*item = std::move(channel_.front()); *item = std::move(channel_.front());
channel_.pop_front(); channel_.pop_front();
NotifyAllSenders(&lock); full_cond_var_.notify_one();
ret = true;
}
return ret;
}
template <typename T>
void Buffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllParticipants(&lock);
} }
template <typename T> template <typename T>
Buffered<T>::~Buffered() { Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear(); channel_.clear();
NotifyAllSenders(&lock); NotifyAllParticipants(&lock);
} }
template <typename T> template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) { void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock(); lock->unlock();
full_cond_var_.notify_one(); full_cond_var_.notify_all();
empty_cond_var_.notify_all();
} }
} // namespace details } // namespace details
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <atomic>
#include <condition_variable> #include <condition_variable>
#include <deque>
#include <mutex> #include <mutex>
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
...@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel<T> { ...@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*); friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public: public:
virtual void Send(T*); virtual bool Send(T*);
virtual void Receive(T*); virtual bool Receive(T*);
virtual size_t Cap() { return 0; } virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private: private:
UnBuffered() {} std::mutex mu_ch_;
virtual ~UnBuffered(); // Mutex for readers and writers who are waiting for other reader
// and writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_;
T* item{nullptr};
std::atomic<bool> closed_{false};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
}; };
// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
ret = true;
}
writer_found_ = false;
return ret;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
ret = true;
}
cv_channel_.notify_one();
}
reader_found_ = false;
return ret;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T> template <typename T>
void UnBuffered<T>::Send(T* channel_element) {} void UnBuffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T> template <typename T>
void UnBuffered<T>::Receive(T*) {} UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T> template <typename T>
UnBuffered<T>::~UnBuffered() {} void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -25,7 +25,7 @@ limitations under the License. */ ...@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/platform/profiler.h" #include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark); DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be " "Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."); "extremely slow so please use this flag wisely.");
...@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false, ...@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
...@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope); VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: " VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
} }
...@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) { if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: " VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
...@@ -20,5 +21,8 @@ namespace paddle { ...@@ -20,5 +21,8 @@ namespace paddle {
namespace framework { namespace framework {
using FeedFetchType = LoDTensor; using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>; using FeedFetchList = std::vector<FeedFetchType>;
static const std::string kFeedOpType = "feed";
static const std::string kFetchOpType = "fetch";
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string.h> // for strdup #include <string.h> // for strdup
#include <algorithm> #include <algorithm>
#include <stdexcept>
#include <string> #include <string>
#include "paddle/framework/init.h" #include "paddle/framework/init.h"
...@@ -46,17 +47,23 @@ void InitDevices() { ...@@ -46,17 +47,23 @@ void InitDevices() {
std::vector<platform::Place> places; std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
int count = 0;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int count = platform::GetCUDADeviceCount(); try {
for (int i = 0; i < count; ++i) { count = platform::GetCUDADeviceCount();
places.emplace_back(platform::CUDAPlace(i)); } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
#else #else
LOG(WARNING) LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option"; << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif #endif
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
} }
......
...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) { ...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices; using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
InitDevices(); InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_GE(pool.size(), 1U); ASSERT_EQ(pool.size(), 1U);
#endif
}
TEST(InitDevices, CUDA) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
} }
...@@ -24,8 +24,6 @@ limitations under the License. */ ...@@ -24,8 +24,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <iterator> #include <iterator>
#include <glog/logging.h>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif #endif
#include <glog/logging.h> #include <glog/logging.h>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h" #include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
...@@ -31,15 +31,6 @@ limitations under the License. */ ...@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#ifndef PADDLE_WITH_CUDA
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
/* /*
* LoD is short for Level of Details. * LoD is short for Level of Details.
* *
...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector< ...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7 * 0 2 4 7
* 0 2 5 7 10 12 15 20 * 0 2 5 7 10 12 15 20
*/ */
using LoD = std::vector<Vector<size_t>>; struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector;
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t); std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); ...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/ */
class LoDTensor : public Tensor { class LoDTensor : public Tensor {
public: public:
LoDTensor() {} LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {}
......
...@@ -23,6 +23,17 @@ ...@@ -23,6 +23,17 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
TEST(LoD, data) {
LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i);
}
}
TEST(LodExpand, test) { TEST(LodExpand, test) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stdio.h>
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h" #include "paddle/platform/assert.h"
...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) { ...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
} }
} }
TEST(Vector, Normal) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
paddle::framework::InitDevices();
paddle::framework::Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(LoD, data) {
paddle::framework::InitDevices();
paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
test<<<1, 1>>>(v.cuda_data(), v.size());
cudaDeviceSynchronize();
v.CopyFromCUDA();
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2);
}
}
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices();
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0); paddle::platform::CUDAPlace place(0);
...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) { ...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod(); auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size()); test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) { for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <initializer_list>
#include <vector>
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
/**
* @brief Vector support both cpu and gpu.
* host vector lifetime is same with Vector
* device vector is lazily malloc and modified.
*/
template <typename T>
class Vector : public std::vector<T> {
public:
using std::vector<T>::vector;
Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
#endif
}
/* Get device vector */
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_);
}
/* Get host vector */
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
/* Synchronize host vector to device vector */
void CopyToCUDA();
/* Synchronize device vector to host vector */
void CopyFromCUDA();
/* Switch device vector location */
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0; // device vector numel
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size()) {
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommitted cuda data.";
return;
}
this->resize(cuda_size_);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
#ifdef PADDLE_WITH_CUDA
auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr;
#endif
}
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework
} // namespace paddle
...@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasOutputs(const std::string &name) const override; bool HasOutputs(const std::string &name) const override;
DDim GetInputDim(const std::string &name) const override;
void SetOutputDim(const std::string &name, const DDim &dim) override;
AttrReader Attrs() const override; AttrReader Attrs() const override;
const std::vector<std::string> &Inputs( const std::vector<std::string> &Inputs(
...@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { ...@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
return true; return true;
} }
DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
std::vector<DDim> ddims = GetInputsDim(name);
auto length = ddims.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have 1 value, "
"but it has %d now",
name, length);
return ddims[0];
}
void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
const DDim &dim) {
SetOutputsDim(name, {dim});
}
AttrReader CompileTimeInferShapeContext::Attrs() const { AttrReader CompileTimeInferShapeContext::Attrs() const {
return AttrReader(op_.GetAttrMap()); return AttrReader(op_.GetAttrMap());
} }
......
...@@ -22,9 +22,7 @@ limitations under the License. */ ...@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
DEFINE_bool(op_sync, false, DECLARE_bool(benchmark);
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
return true; return true;
} }
DDim GetInputDim(const std::string& name) const override {
return GetDim(op_.Input(name));
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
SetDim(op_.Output(name), dim);
}
AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector<std::string>& Inputs( const std::vector<std::string>& Inputs(
...@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx)); ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_op_sync) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); new_dev_ctx->Wait();
} }
} }
......
...@@ -14,13 +14,11 @@ limitations under the License. */ ...@@ -14,13 +14,11 @@ limitations under the License. */
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
auto *b = desc_.add_blocks(); auto *b = desc_.add_blocks();
b->set_parent_idx(parent.ID()); b->set_parent_idx(parent.ID());
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h" #include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h" #include "paddle/platform/macros.h"
......
...@@ -20,9 +20,11 @@ limitations under the License. */ ...@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
DEFINE_bool(do_memory_benchmark, false, DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, " "Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs"); "and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync. // When making memory benchmark on Fluid, we have to delete scope sync.
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
delete scope; delete scope;
} else { } else {
Async([scope] { delete scope; }); Async([scope] { delete scope; });
......
...@@ -18,10 +18,18 @@ limitations under the License. */ ...@@ -18,10 +18,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::vector<framework::DDim> InferShapeContext::GetInputsDim( DDim InferShapeContext::GetInputDim(const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, arg_names.size());
return this->GetDim(arg_names[0]);
}
std::vector<DDim> InferShapeContext::GetInputsDim(
const std::string &name) const { const std::string &name) const {
const std::vector<std::string> &names = Inputs(name); const std::vector<std::string> &arg_names = Inputs(name);
return GetDims(names); return GetDims(arg_names);
} }
DDim InferShapeContext::GetInputsElementDim(const std::string &name, DDim InferShapeContext::GetInputsElementDim(const std::string &name,
...@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name, ...@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
return this->GetDim(names[idx]); return this->GetDim(names[idx]);
} }
void InferShapeContext::SetOutputsDim( void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
const std::string &name, const std::vector<framework::DDim> &dims) { auto &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, arg_names.size());
SetDim(arg_names[0], dim);
}
void InferShapeContext::SetOutputsDim(const std::string &name,
const std::vector<DDim> &dims) {
auto &names = Outputs(name); auto &names = Outputs(name);
SetDims(names, dims); SetDims(names, dims);
} }
std::vector<framework::DDim> InferShapeContext::GetDims( std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const { const std::vector<std::string> &names) const {
std::vector<framework::DDim> ret; std::vector<DDim> ret;
ret.reserve(names.size()); ret.reserve(names.size());
std::transform( std::transform(
names.begin(), names.end(), std::back_inserter(ret), names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); }); [this](const std::string &name) { return this->GetDim(name); });
return ret; return ret;
} }
void InferShapeContext::SetDims(const std::vector<std::string> &names, void InferShapeContext::SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims) { const std::vector<DDim> &dims) {
size_t length = names.size(); size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size()); PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
......
...@@ -35,14 +35,13 @@ class InferShapeContext { ...@@ -35,14 +35,13 @@ class InferShapeContext {
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
virtual framework::DDim GetInputDim(const std::string &name) const = 0; DDim GetInputDim(const std::string &name) const;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const; std::vector<DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const; DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputDim(const std::string &name, const DDim &dim);
void SetOutputsDim(const std::string &name, void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
const std::vector<framework::DDim> &dims);
virtual AttrReader Attrs() const = 0; virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs( virtual const std::vector<std::string> &Inputs(
...@@ -57,15 +56,13 @@ class InferShapeContext { ...@@ -57,15 +56,13 @@ class InferShapeContext {
// Note: In while op, we need this to be public // Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names, void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims); const std::vector<DDim> &dims);
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const DDim &dim) = 0;
std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarDesc::VarType> GetVarTypes( std::vector<proto::VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
......
...@@ -47,6 +47,11 @@ class Tensor { ...@@ -47,6 +47,11 @@ class Tensor {
public: public:
Tensor() : offset_(0) {} Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
inline T* data(); inline T* data();
...@@ -137,6 +142,7 @@ class Tensor { ...@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0; virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0; virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0; virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
}; };
template <typename Place> template <typename Place>
...@@ -156,6 +162,7 @@ class Tensor { ...@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); } virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; } virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; } virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */ /*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_; std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
......
...@@ -178,19 +178,22 @@ public: ...@@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data<real>(); real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>(); real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>(); real* outputData = outputs[0].data<real>();
real* colData = NULL;
bool needIm2col = isNeedIm2col(filter); bool needIm2col = isNeedIm2col(filter);
TensorShape imShape = TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth}); TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape; TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; // Max col matrix width 4096, Max col matrix size 4M.
size_t colWidth = outputHeight * outputWidth; size_t outputHeightSteps =
// Max col matrix height 256, Max col matrix width 1024 std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256)); size_t maxColWidth = outputHeightSteps * outputWidth;
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048)); size_t channelSteps =
std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
(size_t)1),
inputChannels / groups_);
size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) { if (needIm2col) {
colShape = TensorShape({inputChannels / groups_, colShape = TensorShape({inputChannels / groups_,
...@@ -199,7 +202,7 @@ public: ...@@ -199,7 +202,7 @@ public:
outputHeight, outputHeight,
outputWidth}); outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real)); resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf()); colData = reinterpret_cast<real*>(memory_->getBuf());
} }
...@@ -209,20 +212,24 @@ public: ...@@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_; size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth; int nStride = outputHeight * outputWidth;
int kStride = colHeight; int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) { for (size_t i = 0; i < batchSize; i++) {
filterData = inputs[1].data<real>();
for (size_t g = 0; g < groups_; g++) { for (size_t g = 0; g < groups_; g++) {
if (needIm2col) { if (needIm2col) {
real beta_ = beta; real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight; for (size_t ic = 0; ic < inputChannels / groups_;
colHeightStart += stepColHeight) { ic += channelSteps) {
for (size_t colWidthStart = 0; colWidthStart < colWidth; int channels = std::min(inputChannels / groups_ - ic, channelSteps);
colWidthStart += stepColWidth) { for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
int N = std::min(colWidth - colWidthStart, stepColWidth); int height = std::min(outputHeight - oh, outputHeightSteps);
int K = std::min(colHeight - colHeightStart, stepColHeight);
int M = outputChannels / groups_;
int N = height * outputWidth;
int K = channels * filterHeight * filterWidth;
// im2col // im2col
im2col(inputData + g * inputOffset, im2col(inputData,
imShape, imShape,
colData, colData,
colShape, colShape,
...@@ -232,13 +239,12 @@ public: ...@@ -232,13 +239,12 @@ public:
paddingW(), paddingW(),
dilationH(), dilationH(),
dilationW(), dilationW(),
colHeightStart, channels,
K, oh,
colWidthStart, height,
N); N);
// gemm // gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute( BlasGemm<Device, real>::compute(
false, false,
false, false,
...@@ -246,12 +252,12 @@ public: ...@@ -246,12 +252,12 @@ public:
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset + colHeightStart, filterData + ic * filterHeight * filterWidth,
kStride, kStride,
colData, colData,
N, N,
beta_, beta_,
outputData + g * outputOffset + colWidthStart, outputData + oh * outputWidth,
nStride); nStride);
} }
beta_ = 1.0; beta_ = 1.0;
...@@ -266,17 +272,18 @@ public: ...@@ -266,17 +272,18 @@ public:
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset, filterData,
K, K,
inputData + g * inputOffset, inputData,
N, N,
beta, beta,
outputData + g * outputOffset, outputData,
N); N);
} }
inputData += inputOffset;
outputData += outputOffset;
filterData += filterOffset;
} }
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
} }
memory_.reset(); memory_.reset();
......
...@@ -111,39 +111,42 @@ public: ...@@ -111,39 +111,42 @@ public:
int paddingWidth, int paddingWidth,
int dilationHeight, int dilationHeight,
int dilationWidth, int dilationWidth,
int colHeightStart, int inputChannels,
int colHeightSize, int colOffset,
int colWidthStart, int colOutputHeight,
int colWidthSize) { int colWidth) {
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
int filterHeight = colShape[1]; int filterHeight = colShape[1];
int filterWidth = colShape[2]; int filterWidth = colShape[2];
int outputWidth = colShape[4]; int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) { for (int ic = 0; ic < inputChannels; ic++) {
int wOffset = (colHeightStart + colh) % filterWidth; for (int oh = 0; oh < colOutputHeight; oh++) {
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; T* dstData = colData + oh * outputWidth;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight; for (int fh = 0; fh < filterHeight; fh++) {
for (int fw = 0; fw < filterWidth; fw++) {
for (int colw = 0; colw < colWidthSize; colw++) { int imRowIdx = (oh + colOffset) * strideHeight +
int h = (colWidthStart + colw) / outputWidth; fh * dilationHeight - paddingHeight;
int w = (colWidthStart + colw) % outputWidth; if (imRowIdx < 0 || imRowIdx >= inputHeight) {
memset(dstData, 0, outputWidth * sizeof(T));
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[colh * colWidthSize + colw] = static_cast<T>(0);
} else { } else {
imRowIdx += c_im * inputHeight - paddingHeight; for (int ow = 0; ow < outputWidth; ow++) {
imColIdx -= paddingWidth; int imColIdx =
colData[colh * colWidthSize + colw] = ow * strideWidth + fw * dilationWidth - paddingWidth;
imData[imRowIdx * inputWidth + imColIdx]; if (imColIdx < 0 || imColIdx >= inputWidth) {
dstData[ow] = T(0);
} else {
dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
}
}
}
dstData += colWidth;
}
} }
} }
colData += filterHeight * filterWidth * colWidth;
imData += inputHeight * inputWidth;
} }
} }
}; };
......
...@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() { ...@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding, padding,
dilation, dilation,
dilation, dilation,
channels,
0, 0,
height, outputHeight,
0, outputHeight * outputWidth);
width);
autotest::TensorCheckEqual(*output1, *output2); autotest::TensorCheckEqual(*output1, *output2);
} }
......
set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init) set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
...@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS ...@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib) gflags_lib glog_lib protobuf_lib eigen3_lib)
add_executable(example example.cc) if(WITH_TESTING)
if(APPLE) add_subdirectory(tests/book)
set(OPTIONAL_LINK_FLAGS)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(example
-Wl,-force_load paddle_fluid
${OPTIONAL_LINK_FLAGS}
${PTOOLS_LIB})
else()
target_link_libraries(example
-Wl,--start-group -Wl,--whole-archive paddle_fluid
-Wl,--no-whole-archive -Wl,--end-group
${PTOOLS_LIB})
endif() endif()
...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/inference/io.h" #include "paddle/inference/io.h"
#include <fstream> #include <fstream>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
const std::string kFeedOpType = "feed";
bool IsParameter(const framework::VarDesc* var, bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) { const framework::ProgramDesc& main_program) {
if (var->Persistable()) { if (var->Persistable()) {
...@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var, ...@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var,
for (size_t i = 0; i < main_program.Size(); ++i) { for (size_t i = 0; i < main_program.Size(); ++i) {
const framework::BlockDesc& block = main_program.Block(i); const framework::BlockDesc& block = main_program.Block(i);
for (auto* op : block.AllOps()) { for (auto* op : block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == framework::kFeedOpType) {
continue; continue;
} }
for (auto input_argument_name : op->InputArgumentNames()) { for (auto input_argument_name : op->InputArgumentNames()) {
...@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor, ...@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor,
framework::BlockDesc* load_block = load_program->MutableBlock(0); framework::BlockDesc* load_block = load_program->MutableBlock(0);
for (auto* var : global_block.AllVars()) { for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) { if (IsParameter(var, main_program)) {
LOG(INFO) << "parameter's name: " << var->Name(); VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name()); framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape()); new_var->SetShape(var->Shape());
......
...@@ -17,18 +17,13 @@ limitations under the License. */ ...@@ -17,18 +17,13 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
#include "paddle/framework/var_desc.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program);
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor,
framework::Scope& scope, framework::Scope& scope,
const std::string& dirname, const std::string& dirname,
......
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits)
...@@ -12,93 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,93 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include <time.h> #include <time.h>
#include <iostream> #include <sstream>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h" #include "paddle/inference/io.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
int main(int argc, char** argv) { template <typename Place, typename T>
google::ParseCommandLineFlags(&argc, &argv, true); void TestInference(const std::string& dirname,
if (FLAGS_dirname.empty()) { const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
// Example: std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// ./example --dirname=recognize_digits_mlp.inference.model // 1. Define place, executor and scope
std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl; auto place = Place();
exit(1); auto executor = paddle::framework::Executor(place);
}
// 1. Define place, executor, scope
auto place = paddle::platform::CPUPlace();
paddle::framework::InitDevices();
auto* executor = new paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope(); auto* scope = new paddle::framework::Scope();
std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl; // 2. Initialize the inference_program and load all parameters from file
std::string dirname = FLAGS_dirname; auto inference_program = paddle::inference::Load(executor, *scope, dirname);
// 2. Initialize the inference program
auto inference_program = paddle::inference::Load(*executor, *scope, dirname);
// 3. Optional: perform optimization on the inference_program // 3. Get the feed_target_names and fetch_target_names
// 4. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names = const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames(); inference_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names = const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames(); inference_program->GetFetchTargetNames();
// 5. Generate input // 4. Prepare inputs: set up maps for feed targets
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor> feeds;
feeds.push_back(input);
std::vector<paddle::framework::LoDTensor> fetchs;
// Set up maps for feed and fetch targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets; std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
// set_feed_variable
for (size_t i = 0; i < feed_target_names.size(); ++i) { for (size_t i = 0; i < feed_target_names.size(); ++i) {
feed_targets[feed_target_names[i]] = &feeds[i]; // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
} }
// get_fetch_variable // 5. Define Tensor to get the outputs: set up maps for fetch targets
fetchs.resize(fetch_target_names.size()); std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) { for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = &fetchs[i]; fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
} }
// Run the inference program // 6. Run the inference program
executor->Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets);
// Get outputs delete scope;
for (size_t i = 0; i < fetchs.size(); ++i) { }
auto dims_i = fetchs[i].dims();
std::cout << "dims_i:"; TEST(inference, recognize_digits) {
for (int j = 0; j < dims_i.size(); ++j) { if (FLAGS_dirname.empty()) {
std::cout << " " << dims_i[j]; LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
std::cout << std::endl;
std::cout << "result:";
float* output_ptr = fetchs[i].data<float>();
for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
std::cout << " " << output_ptr[j];
}
std::cout << std::endl;
} }
delete scope; LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
delete executor; std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
return 0; paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
float err = 1E-3;
int count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
#endif
} }
...@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, maskMatP->getWidth()); CHECK_EQ(channels * outLength, maskMatP->getWidth());
} }
/* initialize the data_ */
for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) {
outData[i * outStride + j] = -(real)FLT_MAX;
}
}
/* pool max one by one */ /* pool max one by one */
for (size_t n = 0; n < num; ++n) { // frame by frame for (size_t n = 0; n < num; ++n) { // frame by frame
if (!isContiguous()) { if (!isContiguous()) {
...@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t c = 0; c < channels; ++c) { // channel by channel
for (size_t ph = 0; ph < outputH; ++ph) { for (size_t ph = 0; ph < outputH; ++ph) {
int hstart = ph * strideH - paddingH; int hstart = ph * strideH - paddingH;
int hend = std::min(hstart + sizeY, imgSizeH); int hend = hstart + sizeY;
hstart = std::max(hstart, 0); hstart = hstart < 0 ? 0 : hstart;
hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
for (size_t pw = 0; pw < outputW; ++pw) { for (size_t pw = 0; pw < outputW; ++pw) {
int wstart = pw * strideW - paddingW; int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW); int wend = wstart + sizeX;
wstart = std::max(wstart, 0); wstart = wstart < 0 ? 0 : wstart;
wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) { if (maskData == NULL) {
real tmp = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max( tmp = tmp < inputData[h * imgSizeW + w]
outData[ph * outputW + pw], inputData[h * imgSizeW + w]); ? inputData[h * imgSizeW + w]
: tmp;
} }
} }
outData[ph * outputW + pw] = tmp;
} else { } else {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
......
...@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE) ...@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op) set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
endif() endif()
op_library(cond_op DEPS framework_proto tensor net_op) op_library(cond_op DEPS framework_proto tensor net_op)
...@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor) ...@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
# Regist multiple Kernel to pybind # Regist multiple Kernel to pybind
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
vol2col depthwise_conv)
op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
...@@ -173,6 +178,8 @@ endif() ...@@ -173,6 +178,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
...@@ -192,3 +199,4 @@ if(WITH_GPU) ...@@ -192,3 +199,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func; math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad); auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>(); auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
auto& merge_rows = grad_merge.rows(); framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m // 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func; math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge); auto grad_square = sqare_func(context, grad_merge, grad_merge);
...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
SparseAdagradFunctorKernel< SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0, T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge.rows().data(), .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
lr, param_data, moment_data, grad_width, param_data, moment_data, grad_width,
epsilon); epsilon);
} }
}; };
......
...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), grad); merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
auto* rows = grad_merge.rows().data(); int64_t* rows = nullptr;
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->cuda_data();
} else {
rows = grad_merge.mutable_rows()->data();
}
auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor( SparseAdamFunctor<T> functor(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { ...@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"), PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of BipartiteMatch should not be null."); "Input(DistMat) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchIndices"),
"Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchDist"),
"Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
auto dims = ctx->GetInputDim("DistMat"); auto dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("ColToRowMatchIndices", dims); ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDis", dims); ctx->SetOutputDim("ColToRowMatchDist", dims);
} }
}; };
...@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input<LoDTensor>("DistMat"); auto* dist_mat = context.Input<LoDTensor>("DistMat");
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices"); auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDis"); auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>(); auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
...@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise, it means B[j] is matched to row " "Otherwise, it means B[j] is matched to row "
"ColToRowMatchIndices[i][j] in i-th instance. The row number of " "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
"i-th instance is saved in ColToRowMatchIndices[i][j]."); "i-th instance is saved in ColToRowMatchIndices[i][j].");
AddOutput("ColToRowMatchDis", AddOutput("ColToRowMatchDist",
"(Tensor) A 2-D Tensor with shape [N, M] in float type. " "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
"N is batch size. If ColToRowMatchIndices[i][j] is -1, " "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
"ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchIndices[i][j] = d, and the row offsets of each " "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
"instance are called LoD. Then " "instance are called LoD. Then "
"ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
AddComment(R"DOC( AddComment(R"DOC(
This operator is a greedy bipartite matching algorithm, which is used to This operator is a greedy bipartite matching algorithm, which is used to
obtain the matching with the maximum distance based on the input obtain the matching with the maximum distance based on the input
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
namespace paddle {
namespace operators {
class BoxCoderOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
"Input(PriorBox) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
"Input(PriorBoxVar) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
"Input(TargetBox) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
"Output(OutputBox) of BoxCoderOp should not be null.");
auto prior_box_dims = ctx->GetInputDim("PriorBox");
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
auto target_box_dims = ctx->GetInputDim("TargetBox");
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
"The rank of Input of PriorBoxVar must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
ctx->SetOutputDim(
"OutputBox",
framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
}
};
class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"PriorBox",
"(Tensor, default Tensor<float>) "
"Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
"each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the anchor box, "
"if the input is image feature map, they are close to the origin "
"of the coordinate system. [xmax, ymax] is the right bottom "
"coordinate of the anchor box.");
AddInput("PriorBoxVar",
"(Tensor, default Tensor<float>) "
"PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
"of variance.");
AddInput(
"TargetBox",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the box if the input "
"is image feature map, they are close to the origin of the coordinate "
"system. [xmax, ymax] is the right bottom coordinate of the box. "
"This tensor can contain LoD information to represent a batch "
"of inputs. One instance of this batch can contain different "
"numbers of entities.");
AddAttr<std::string>("code_type",
"(string, default encode_center_size) "
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
AddOutput(
"OutputBox",
"(LoDTensor or Tensor) "
"(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
"representing the result of N target boxes encoded/decoded with "
"M Prior boxes and variances.");
AddComment(R"DOC(
Bounding Box Coder Operator.
Encode/Decode the target bounding box with the priorbox information.
The Encoding schema described below:
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = log(abs(tw / pw)) / pwv
oh = log(abs(th / ph)) / phv
The Decoding schema described below:
ox = (pw * pxv * tx * + px) - tw / 2
oy = (ph * pyv * ty * + py) - th / 2
ow = exp(pwv * tw) * pw + tw / 2
oh = exp(phv * th) * ph + th / 2
where tx, ty, tw, th denote the target box's center coordinates, width and
height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
width and height.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
ops::BoxCoderKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
prior_box_data[col_idx * len + 1]) /
2;
T target_box_center_x =
(target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
2;
T target_box_center_y = (target_box_data[row_idx * len + 3] +
target_box_data[row_idx * len + 1]) /
2;
T target_box_width =
target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
T target_box_height =
target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
output[idx * len] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[col_idx * len];
output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
prior_box_height /
prior_box_var_data[col_idx * len + 1];
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
prior_box_var_data[col_idx * len + 2];
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
prior_box_var_data[col_idx * len + 3];
}
}
template <typename T>
__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
prior_box_data[col_idx * len + 1]) /
2;
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
target_box_data[row_idx * len + 2]) *
prior_box_width;
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
target_box_data[row_idx * len + 3]) *
prior_box_height;
T target_box_center_x = prior_box_var_data[col_idx * len] *
target_box_data[row_idx * len] *
prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
target_box_data[row_idx * len + 1] *
prior_box_height +
prior_box_center_y;
output[idx * len] = target_box_center_x - target_box_width / 2;
output[idx * len + 1] = target_box_center_y - target_box_height / 2;
output[idx * len + 2] = target_box_center_x + target_box_width / 2;
output[idx * len + 3] = target_box_center_y + target_box_height / 2;
}
}
template <typename T>
class BoxCoderCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
"Only support 1 level of LoD.");
}
auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
auto len = prior_box->dims()[1];
int block = 512;
int grid = (row * col + block - 1) / block;
auto& device_ctx = context.cuda_device_context();
const T* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = prior_box_var->data<T>();
const T* target_box_data = target_box->data<T>();
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>();
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
ops::BoxCoderCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) {
if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") {
return BoxCodeType::kDecodeCenterSize;
}
PADDLE_THROW("Not support type %s.", type);
}
template <typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
public:
void EncodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
auto* target_box_data = target_box.data<T>();
auto* prior_box_data = prior_box.data<T>();
auto* prior_box_var_data = prior_box_var.data<T>();
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x =
(target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
T target_box_center_y =
(target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
T target_box_width =
target_box_data[i * len + 2] - target_box_data[i * len];
T target_box_height =
target_box_data[i * len + 3] - target_box_data[i * len + 1];
size_t offset = i * col * len + j * len;
output[offset] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[j * len];
output[offset + 1] = (target_box_center_y - prior_box_center_y) /
prior_box_height / prior_box_var_data[j * len + 1];
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width)) /
prior_box_var_data[j * len + 2];
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)) /
prior_box_var_data[j * len + 3];
}
}
}
void DecodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
auto* target_box_data = target_box.data<T>();
auto* prior_box_data = prior_box.data<T>();
auto* prior_box_var_data = prior_box_var.data<T>();
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x = prior_box_var_data[j * len] *
target_box_data[i * len] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[j * len + 1] *
target_box_data[i * len + 1] *
prior_box_height +
prior_box_center_y;
T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
target_box_data[i * len + 2]) *
prior_box_width;
T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
target_box_data[i * len + 3]) *
prior_box_height;
size_t offset = i * col * len + j * len;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
output[offset + 3] = target_box_center_y + target_box_height / 2;
}
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
"Only support 1 level of LoD.");
}
auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
auto len = prior_box->dims()[1];
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
T* output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -54,7 +54,15 @@ class CompareOpKernel ...@@ -54,7 +54,15 @@ class CompareOpKernel
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEM_TYPE; using T = typename Functor::ELEM_TYPE;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context); using Tensor = framework::Tensor;
auto* x = context.Input<Tensor>("X");
auto* y = context.Input<Tensor>("Y");
auto* z = context.Output<Tensor>("Out");
z->mutable_data<T>(context.GetPlace());
int axis = context.Attr<int>("axis");
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
z);
} }
}; };
......
...@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise convolution op
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>, conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>); ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -16,6 +16,16 @@ limitations under the License. */ ...@@ -16,6 +16,16 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>, conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>); ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册