提交 446198da 编写于 作者: L Luo Tao

Merge branch 'develop' into inference_lib_dist

...@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F ...@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
...@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas ...@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost include(external/boost) # download boost
include(external/any) # download libn::any include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
......
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD vgg16_fluid.py vgg16_v2.py /workspace/
# Performance for Distributed vgg16
## Test Result
### Hardware Infomation
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Single Node Single Thread
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples/ sec
| PServer Count | 3 | 6 |10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
*The performance gap between Fuild and v2 comes from the network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbos Logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.profiler as profiler
import argparse
import functools
import os
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument('--device_id', type=int, default=0, help="The device id.")
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data order, now only support NCHW.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--local',
type=str2bool,
default=True,
help='Whether to run as local mode.')
args = parser.parse_args()
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def main():
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
# Input data
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = accuracy.metrics + accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
# Initialize executor
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
args.device_id)
exe = fluid.Executor(place)
# test
def test(exe):
accuracy.reset(exe)
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data})
return accuracy.eval(exe)
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
accuracy.reset(exe)
with profiler.profiler("CPU", 'total') as prof:
for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics)
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
% (pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = accuracy.eval(exe)
pass_test_acc = test(exe)
print(
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
% (pass_id, num_samples / pass_elapsed, pass_train_acc,
pass_test_acc))
if args.local:
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, "6174"]))
pserver_endpoints = ",".join(eplist)
print("pserver endpoints: ", pserver_endpoints)
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
current_endpoint = os.getenv(
"POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print("starting server side startup")
exe.run(pserver_startup)
print("starting parameter server...")
exe.run(pserver_prog)
elif training_role == "TRAINER":
# Parameter initialization
exe.run(fluid.default_startup_program())
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
# TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
exe.run(fluid.default_startup_program())
train_loop(exe, trainer_prog)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
print_arguments()
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the channel of input feature is 3
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
...@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO ...@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR}) include_directories(${BOOST_INCLUDE_DIR})
......
...@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME) ...@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif() endif()
if (cc_library_DEPS) if (cc_library_DEPS)
# Don't need link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif() endif()
...@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME) ...@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -457,12 +468,12 @@ endfunction() ...@@ -457,12 +468,12 @@ endfunction()
function(py_test TARGET_NAME) function(py_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options STATIC static SHARED shared) set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS) set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
......
...@@ -87,6 +87,11 @@ roi_pool ...@@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool .. autoclass:: paddle.v2.layer.roi_pool
:noindex: :noindex:
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
Norm Layer Norm Layer
========== ==========
...@@ -133,6 +138,11 @@ grumemory ...@@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory .. autoclass:: paddle.v2.layer.grumemory
:noindex: :noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Recurrent Layer Group Recurrent Layer Group
===================== =====================
...@@ -340,6 +350,11 @@ bilinear_interp ...@@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp
:noindex: :noindex:
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
dot_prod dot_prod
--------- ---------
.. autoclass:: paddle.v2.layer.dot_prod .. autoclass:: paddle.v2.layer.dot_prod
...@@ -402,6 +417,11 @@ scale_shift ...@@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift .. autoclass:: paddle.v2.layer.scale_shift
:noindex: :noindex:
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Sampling Layers Sampling Layers
=============== ===============
...@@ -420,22 +440,6 @@ multiplex ...@@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex .. autoclass:: paddle.v2.layer.multiplex
:noindex: :noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
.. _api_v2.layer_costs: .. _api_v2.layer_costs:
Cost Layers Cost Layers
...@@ -526,6 +530,11 @@ multibox_loss ...@@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss .. autoclass:: paddle.v2.layer.multibox_loss
:noindex: :noindex:
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
Check Layer Check Layer
============ ============
...@@ -534,31 +543,10 @@ eos ...@@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos .. autoclass:: paddle.v2.layer.eos
:noindex: :noindex:
Miscs Activation
===== ==========
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
Activation with learnable parameter
===================================
prelu prelu
-------- --------
.. autoclass:: paddle.v2.layer.prelu .. autoclass:: paddle.v2.layer.prelu
:noindex: :noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
...@@ -73,3 +73,10 @@ wmt14 ...@@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14 .. automodule:: paddle.v2.dataset.wmt14
:members: :members:
:noindex: :noindex:
wmt16
+++++
.. automodule:: paddle.v2.dataset.wmt16
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
DataFeeder data_feeder
=========== ===========
DataFeeder DataFeeder
----------- ----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder .. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
:members:
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Evaluator !DO NOT EDIT THIS FILE MANUALLY!
===========
=========
Evaluator evaluator
----------- =========
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
:members:
:noindex: :noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
:members:
:noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Executor !DO NOT EDIT THIS FILE MANUALLY!
===========
========
executor
========
Executor Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
:members:
:noindex:
global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
:noindex:
scope_guard
----------- -----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor .. autofunction:: paddle.v2.fluid.executor.scope_guard
:noindex:
switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex: :noindex:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types
import paddle.v2.fluid as fluid
def parse_arg():
parser = argparse.ArgumentParser()
parser.add_argument('--submodules', nargs="*")
parser.add_argument(
'module', type=str, help='Generate the documentation of which module')
return parser.parse_args()
class DocGenerator(object):
def __init__(self, module_name, stream=sys.stdout):
self.stream = stream
self.module_name = module_name
if not hasattr(fluid, module_name):
raise ValueError("Cannot find fluid.{0}".format(module_name))
else:
self.module = getattr(fluid, module_name)
self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
''')
self._print_header_(module_name, dot='=', is_title=True)
def print_submodule(self, submodule_name):
submodule = getattr(self.module, submodule_name)
if submodule is None:
raise ValueError("Cannot find submodule {0}".format(submodule_name))
self.print_section(submodule_name)
for item in submodule.__all__:
self.print_item(item)
def print_current_module(self):
for item in self.module.__all__:
self.print_item(item)
def print_section(self, name):
self._print_header_(name, dot='=', is_title=False)
def print_item(self, name):
item = getattr(self.module, name)
if isinstance(item, types.TypeType):
self.print_class(name)
elif isinstance(item, types.FunctionType):
self.print_method(name)
else:
raise RuntimeError("Unsupported item {0}".format(name))
def print_class(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
:members:
:noindex:
'''.format(self.module_name, name))
def print_method(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
:noindex:
'''.format(self.module_name, name))
def _print_header_(self, name, dot, is_title):
dot_line = dot * len(name)
if is_title:
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write(name)
self.stream.write('\n')
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write('\n')
def main():
args = parse_arg()
gen = DocGenerator(args.module)
if args.submodules is None:
gen.print_current_module()
else:
for submodule_name in args.submodules:
gen.print_submodule(submodule_name)
if __name__ == '__main__':
main()
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
do
python gen_doc.py ${module} > ${module}.rst
done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Initializer initializer
=========== ===========
Constant
--------
.. autoclass:: paddle.v2.fluid.initializer.Constant
Initializer :members:
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
:noindex: :noindex:
Uniform
-------
.. autoclass:: paddle.v2.fluid.initializer.Uniform
UniformInitializer :members:
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
:noindex: :noindex:
Normal
------
XavierInitializer .. autoclass:: paddle.v2.fluid.initializer.Normal
----------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
:noindex: :noindex:
Xavier
------
MSRAInitializer .. autoclass:: paddle.v2.fluid.initializer.Xavier
--------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
IO !DO NOT EDIT THIS FILE MANUALLY!
===========
==
io
==
save_vars
---------
is_parameter .. autofunction:: paddle.v2.fluid.io.save_vars
:noindex:
save_params
----------- -----------
.. autofunction:: paddle.v2.fluid.io.is_parameter
.. autofunction:: paddle.v2.fluid.io.save_params
:noindex:
save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
:noindex:
load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
:noindex:
load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
:noindex: :noindex:
load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
:noindex:
save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
:noindex:
load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
:noindex:
get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
:noindex:
========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Layers !DO NOT EDIT THIS FILE MANUALLY!
==========
======
layers
======
fc control_flow
--- ============
.. autofunction:: paddle.v2.fluid.layers.fc
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
embedding merge_lod_tensor
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.embedding
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex: :noindex:
dynamic_lstm BlockGuard
------------ ----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
.. autoclass:: paddle.v2.fluid.layers.BlockGuard
:members:
:noindex: :noindex:
dynamic_lstmp BlockGuardWithCompletion
------------- ------------------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
:members:
:noindex: :noindex:
dynamic_gru StaticRNNMemoryLink
----------- -------------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
:members:
:noindex: :noindex:
data WhileGuard
---- ----------
.. autofunction:: paddle.v2.fluid.layers.data
.. autoclass:: paddle.v2.fluid.layers.WhileGuard
:members:
:noindex: :noindex:
mean While
---- -----
.. autofunction:: paddle.v2.fluid.layers.mean
.. autoclass:: paddle.v2.fluid.layers.While
:members:
:noindex: :noindex:
mul lod_rank_table
--- --------------
.. autofunction:: paddle.v2.fluid.layers.mul
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
elementwise_add max_sequence_len
--------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
elementwise_sub topk
--------------- ----
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
elementwise_mul lod_tensor_to_array
--------------- -------------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
elementwise_div array_to_lod_tensor
--------------- -------------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
increment
---------
dropout .. autofunction:: paddle.v2.fluid.layers.increment
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
array_write
-----------
reshape .. autofunction:: paddle.v2.fluid.layers.array_write
--------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex: :noindex:
create_array
------------
sigmoid .. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
less_than
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex: :noindex:
array_read
----------
scale .. autofunction:: paddle.v2.fluid.layers.array_read
--------- :noindex:
.. autofunction:: paddle.v2.fluid.layers.scale
shrink_memory
-------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex: :noindex:
array_length
------------
transpose .. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
IfElse
------
.. autoclass:: paddle.v2.fluid.layers.IfElse
:members:
:noindex:
DynamicRNN
----------
.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
:members:
:noindex:
ConditionalBlock
----------------
.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
:members:
:noindex:
StaticRNN
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.transpose
.. autoclass:: paddle.v2.fluid.layers.StaticRNN
:members:
:noindex: :noindex:
reorder_lod_tensor_by_rank
--------------------------
sigmoid_cross_entropy_with_logits .. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
:noindex: :noindex:
ParallelDo
----------
cast .. autoclass:: paddle.v2.fluid.layers.ParallelDo
:members:
:noindex:
Print
-----
.. autofunction:: paddle.v2.fluid.layers.Print
:noindex:
device
======
get_places
----------
.. autofunction:: paddle.v2.fluid.layers.get_places
:noindex:
io
==
data
---- ----
.. autofunction:: paddle.v2.fluid.layers.cast
.. autofunction:: paddle.v2.fluid.layers.data
:noindex: :noindex:
BlockGuardServ
--------------
concat .. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
------- :members:
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
ListenAndServ
-------------
sums .. autoclass:: paddle.v2.fluid.layers.ListenAndServ
:members:
:noindex:
Send
---- ----
.. autofunction:: paddle.v2.fluid.layers.sums
.. autofunction:: paddle.v2.fluid.layers.Send
:noindex: :noindex:
nn
==
linear_chain_crf fc
---------------- --
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
.. autofunction:: paddle.v2.fluid.layers.fc
:noindex: :noindex:
embedding
---------
assign
-------
.. autofunction:: paddle.v2.fluid.layers.embedding .. autofunction:: paddle.v2.fluid.layers.embedding
:noindex: :noindex:
dynamic_lstm
------------
split_lod_tensor .. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
dynamic_lstmp
-------------
merge_lod_tensor .. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex:
linear_chain_crf
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
crf_decoding
------------
.. autofunction:: paddle.v2.fluid.layers.crf_decoding
:noindex: :noindex:
cos_sim cos_sim
-------- -------
.. autofunction:: paddle.v2.fluid.layers.cos_sim .. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex: :noindex:
cross_entropy cross_entropy
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy .. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex: :noindex:
square_error_cost square_error_cost
----------------- -----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost .. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex: :noindex:
accuracy accuracy
--------- --------
.. autofunction:: paddle.v2.fluid.layers.accuracy .. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex: :noindex:
chunk_eval
----------
.. autofunction:: paddle.v2.fluid.layers.chunk_eval
:noindex:
sequence_conv sequence_conv
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv .. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex: :noindex:
conv2d conv2d
------ ------
.. autofunction:: paddle.v2.fluid.layers.conv2d .. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex: :noindex:
sequence_pool sequence_pool
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool .. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex: :noindex:
pool2d
------
sequence_first_step .. autofunction:: paddle.v2.fluid.layers.pool2d
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex: :noindex:
batch_norm
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
sequence_last_step beam_search_decode
------------------ ------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex: :noindex:
conv2d_transpose
----------------
pool2d .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
------
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex: :noindex:
sequence_expand
---------------
batch_norm .. autofunction:: paddle.v2.fluid.layers.sequence_expand
:noindex:
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex:
reduce_sum
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex:
reduce_mean
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex: :noindex:
reduce_max
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex:
beam_search_decode reduce_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex:
sequence_first_step
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex:
sequence_last_step
------------------ ------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
:noindex:
dropout
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
split
-----
lod_rank_table .. autofunction:: paddle.v2.fluid.layers.split
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
ctc_greedy_decoder
------------------
max_sequence_len .. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
edit_distance
-------------
topk .. autofunction:: paddle.v2.fluid.layers.edit_distance
-----
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
l2_normalize
------------
lod_tensor_to_array .. autofunction:: paddle.v2.fluid.layers.l2_normalize
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
warpctc
-------
.. autofunction:: paddle.v2.fluid.layers.warpctc
:noindex:
sequence_reshape
----------------
fill_constant .. autofunction:: paddle.v2.fluid.layers.sequence_reshape
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex: :noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
:noindex:
fill_constant_batch_size_like im2sequence
----------------------------- -----------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex: :noindex:
nce
---
ones .. autofunction:: paddle.v2.fluid.layers.nce
----
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex: :noindex:
beam_search
-----------
zeros .. autofunction:: paddle.v2.fluid.layers.beam_search
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
row_conv
--------
increment .. autofunction:: paddle.v2.fluid.layers.row_conv
---------
.. autofunction:: paddle.v2.fluid.layers.increment
:noindex: :noindex:
multiplex
---------
array_write .. autofunction:: paddle.v2.fluid.layers.multiplex
-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex: :noindex:
ops
===
mean
----
create_array .. autofunction:: paddle.v2.fluid.layers.mean
------------
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex: :noindex:
mul
---
less_than .. autofunction:: paddle.v2.fluid.layers.mul
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex: :noindex:
reshape
-------
array_read .. autofunction:: paddle.v2.fluid.layers.reshape
----------
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex: :noindex:
scale
-----
shrink_memory .. autofunction:: paddle.v2.fluid.layers.scale
--------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex: :noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
array_length .. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
-------------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex: :noindex:
elementwise_add
---------------
conv2d_transpose .. autofunction:: paddle.v2.fluid.layers.elementwise_add
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex: :noindex:
elementwise_div
sequence_expand
--------------- ---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex: :noindex:
elementwise_sub
---------------
gru_unit .. autofunction:: paddle.v2.fluid.layers.elementwise_sub
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex: :noindex:
elementwise_mul
---------------
lstm_unit .. autofunction:: paddle.v2.fluid.layers.elementwise_mul
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex: :noindex:
elementwise_max
---------------
sequence_softmax .. autofunction:: paddle.v2.fluid.layers.elementwise_max
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex: :noindex:
elementwise_min
---------------
reduce_sum .. autofunction:: paddle.v2.fluid.layers.elementwise_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex: :noindex:
elementwise_pow
---------------
reduce_mean .. autofunction:: paddle.v2.fluid.layers.elementwise_pow
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex: :noindex:
clip
----
reduce_max .. autofunction:: paddle.v2.fluid.layers.clip
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex: :noindex:
clip_by_norm
------------
reduce_min .. autofunction:: paddle.v2.fluid.layers.clip_by_norm
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex: :noindex:
sequence_softmax
----------------
split .. autofunction:: paddle.v2.fluid.layers.sequence_softmax
-----
.. autofunction:: paddle.v2.fluid.layers.split
:noindex: :noindex:
sigmoid
-------
matmul .. autofunction:: paddle.v2.fluid.layers.sigmoid
------
.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex: :noindex:
logsigmoid logsigmoid
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid .. autofunction:: paddle.v2.fluid.layers.logsigmoid
:noindex: :noindex:
exp exp
--- ---
.. autofunction:: paddle.v2.fluid.layers.exp .. autofunction:: paddle.v2.fluid.layers.exp
:noindex: :noindex:
relu relu
---- ----
.. autofunction:: paddle.v2.fluid.layers.relu .. autofunction:: paddle.v2.fluid.layers.relu
:noindex: :noindex:
tanh tanh
---- ----
.. autofunction:: paddle.v2.fluid.layers.tanh .. autofunction:: paddle.v2.fluid.layers.tanh
:noindex: :noindex:
tanh_shrink tanh_shrink
----------- -----------
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink .. autofunction:: paddle.v2.fluid.layers.tanh_shrink
:noindex: :noindex:
softshrink softshrink
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.softshrink .. autofunction:: paddle.v2.fluid.layers.softshrink
:noindex: :noindex:
sqrt sqrt
---- ----
.. autofunction:: paddle.v2.fluid.layers.sqrt .. autofunction:: paddle.v2.fluid.layers.sqrt
:noindex: :noindex:
abs abs
---- ---
.. autofunction:: paddle.v2.fluid.layers.abs .. autofunction:: paddle.v2.fluid.layers.abs
:noindex: :noindex:
ceil ceil
---- ----
.. autofunction:: paddle.v2.fluid.layers.ceil .. autofunction:: paddle.v2.fluid.layers.ceil
:noindex: :noindex:
floor floor
----- -----
.. autofunction:: paddle.v2.fluid.layers.floor .. autofunction:: paddle.v2.fluid.layers.floor
:noindex: :noindex:
round round
----- -----
.. autofunction:: paddle.v2.fluid.layers.round .. autofunction:: paddle.v2.fluid.layers.round
:noindex: :noindex:
reciprocal reciprocal
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.reciprocal .. autofunction:: paddle.v2.fluid.layers.reciprocal
:noindex: :noindex:
log log
--- ---
.. autofunction:: paddle.v2.fluid.layers.log .. autofunction:: paddle.v2.fluid.layers.log
:noindex: :noindex:
square square
------ ------
.. autofunction:: paddle.v2.fluid.layers.square .. autofunction:: paddle.v2.fluid.layers.square
:noindex: :noindex:
softplus softplus
-------- --------
.. autofunction:: paddle.v2.fluid.layers.softplus .. autofunction:: paddle.v2.fluid.layers.softplus
:noindex: :noindex:
softsign softsign
--------- --------
.. autofunction:: paddle.v2.fluid.layers.softsign .. autofunction:: paddle.v2.fluid.layers.softsign
:noindex: :noindex:
brelu brelu
----- -----
.. autofunction:: paddle.v2.fluid.layers.brelu .. autofunction:: paddle.v2.fluid.layers.brelu
:noindex: :noindex:
leaky_relu leaky_relu
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.leaky_relu .. autofunction:: paddle.v2.fluid.layers.leaky_relu
:noindex: :noindex:
soft_relu soft_relu
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.soft_relu .. autofunction:: paddle.v2.fluid.layers.soft_relu
:noindex: :noindex:
elu elu
---- ---
.. autofunction:: paddle.v2.fluid.layers.elu .. autofunction:: paddle.v2.fluid.layers.elu
:noindex: :noindex:
relu6 relu6
----- -----
.. autofunction:: paddle.v2.fluid.layers.relu6 .. autofunction:: paddle.v2.fluid.layers.relu6
:noindex: :noindex:
pow pow
---- ---
.. autofunction:: paddle.v2.fluid.layers.pow .. autofunction:: paddle.v2.fluid.layers.pow
:noindex: :noindex:
stanh
-----
.. autofunction:: paddle.v2.fluid.layers.stanh
:noindex:
hard_shrink hard_shrink
----------- -----------
.. autofunction:: paddle.v2.fluid.layers.hard_shrink .. autofunction:: paddle.v2.fluid.layers.hard_shrink
:noindex: :noindex:
thresholded_relu thresholded_relu
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu .. autofunction:: paddle.v2.fluid.layers.thresholded_relu
:noindex: :noindex:
hard_sigmoid hard_sigmoid
------------- ------------
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid .. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
:noindex: :noindex:
swish swish
------ -----
.. autofunction:: paddle.v2.fluid.layers.swish .. autofunction:: paddle.v2.fluid.layers.swish
:noindex: :noindex:
im2sequence tensor
======
create_tensor
-------------
.. autofunction:: paddle.v2.fluid.layers.create_tensor
:noindex:
create_parameter
----------------
.. autofunction:: paddle.v2.fluid.layers.create_parameter
:noindex:
create_global_var
-----------------
.. autofunction:: paddle.v2.fluid.layers.create_global_var
:noindex:
cast
----
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
------ ------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
edit_distance sums
--------------- ----
.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex: :noindex:
ctc_greedy_decoder assign
--------------- ------
.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
.. autofunction:: paddle.v2.fluid.layers.assign
:noindex: :noindex:
l2_normalize fill_constant_batch_size_like
------------ -----------------------------
.. autofunction:: paddle.v2.fluid.layers.l2_normalize
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex: :noindex:
sequence_reshape fill_constant
---------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex: :noindex:
row_conv ones
-------- ----
.. autofunction:: paddle.v2.fluid.layers.row_conv
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex: :noindex:
multiplex zeros
--------- -----
.. autofunction:: paddle.v2.fluid.layers.multiplex
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Nets !DO NOT EDIT THIS FILE MANUALLY!
===========
====
nets
====
simple_img_conv_pool simple_img_conv_pool
-------------------- --------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
img_conv_group
---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex: :noindex:
sequence_conv_pool sequence_conv_pool
------------------ ------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex: :noindex:
glu glu
--- ---
.. autofunction:: paddle.v2.fluid.nets.glu .. autofunction:: paddle.v2.fluid.nets.glu
:noindex: :noindex:
scaled_dot_product_attention scaled_dot_product_attention
---------------------------- ----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention .. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Optimizer !DO NOT EDIT THIS FILE MANUALLY!
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
=========
optimizer
=========
SGDOptimizer SGD
----------- ---
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
.. autoclass:: paddle.v2.fluid.optimizer.SGD
:members:
:noindex:
Momentum
--------
MomentumOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Momentum
----------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex: :noindex:
Adagrad
-------
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
AdagradOptimizer :members:
----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex: :noindex:
Adam
----
AdamOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adam
------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex: :noindex:
Adamax
------
AdamaxOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adamax
----------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
:noindex: :noindex:
DecayedAdagrad
--------------
DecayedAdagradOptimizer .. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
----------------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
param_attr
==========
ParamAttr ParamAttr
=========== ---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
:members:
:noindex:
WeightNormParamAttr
-------------------
ParamAttr .. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
----------- :members:
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Profiler !DO NOT EDIT THIS FILE MANUALLY!
===========
========
profiler
========
cuda_profiler
-------------
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler .. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex: :noindex:
reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
:noindex:
profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Regularizer regularizer
=========== ===========
WeightDecayRegularizer append_regularization_ops
---------------------- -------------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer .. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex: :noindex:
L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
:members:
:noindex:
L1DecayRegularizer L2Decay
------------------- -------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
:members:
:noindex:
...@@ -140,7 +140,19 @@ TODO by Assignees ...@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM ### Beam Search with CTC and LM
TODO by Assignees <div align="center">
<img src="image/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
- Such external scorer consists of language model, word count or any other custom scorers.
- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
## Future Work ## Future Work
...@@ -153,3 +165,4 @@ TODO by Assignees ...@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
1. It runs the first matched case, or the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
The above program should print and print only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B ...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
"WITH_TESTING", "是否开启单元测试", "ON" "WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF" "WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
......
...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like: ...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON" "WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON" "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON" "WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "ON" "WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF" "WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
......
...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note ...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
国内用户可以使用下面的镜像源来加速访问:
.. code-block: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
然后在浏览器中输入以下网址: 然后在浏览器中输入以下网址:
.. code-block:: text .. code-block:: text
......
...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command: ...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
For users in China, we provide a faster mirror:
.. code-block: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
Then, you would back and paste the address into the local browser: Then, you would back and paste the address into the local browser:
.. code-block:: text .. code-block:: text
......
...@@ -92,11 +92,11 @@ paddle.init( ...@@ -92,11 +92,11 @@ paddle.init(
参数说明 参数说明
- use_gpu: **可选,默认False**,是否启用GPU训练 - use_gpu: **可选,默认False**,是否启用GPU训练
- trainer_count:**必选,默认1**,当前训练任务trainer总个数 - trainer_count:**必选,默认1**,当前trainer的线程数目
- port:**必选,默认7164**,连接到pserver的端口 - port:**必选,默认7164**,连接到pserver的端口
- ports_num:**必选,默认1**,连接到pserver的端口个数 - ports_num:**必选,默认1**,连接到pserver的端口个数
- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 - ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 - num_gradient_servers:**必选,默认1**,当前训练任务trainer总数
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 - trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 - pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
......
...@@ -95,11 +95,11 @@ paddle.init( ...@@ -95,11 +95,11 @@ paddle.init(
Parameter Description Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training. - use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job. - trainer_count: **required, default 1**, number of threads in current trainer.
- port: **required, default 7164**, port to connect to parameter server. - port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication. - ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
- num_gradient_servers: **required, default 1**, total number of gradient server. - num_gradient_servers: **required, default 1**, number of trainers in current job.
- trainer_id: **required, default 0**, ID for every trainer, start from 0. - trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
......
...@@ -8,4 +8,3 @@ PaddlePaddle 文档 ...@@ -8,4 +8,3 @@ PaddlePaddle 文档
howto/index_cn.rst howto/index_cn.rst
api/index_cn.rst api/index_cn.rst
faq/index_cn.rst faq/index_cn.rst
mobile/index_cn.rst
...@@ -7,4 +7,3 @@ PaddlePaddle Documentation ...@@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst getstarted/index_en.rst
howto/index_en.rst howto/index_en.rst
api/index_en.rst api/index_en.rst
mobile/index_en.rst
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_ios_en.md
cross_compiling_for_raspberry_en.md
...@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) ...@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc) cc_test(variable_test SRCS variable_test.cc)
......
...@@ -23,12 +23,10 @@ namespace framework { ...@@ -23,12 +23,10 @@ namespace framework {
template <typename T> template <typename T>
class Channel { class Channel {
public: public:
virtual void Send(T*) = 0; virtual bool Send(T*) = 0;
virtual void Receive(T*) = 0; virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0; virtual size_t Cap() = 0;
virtual void Close() = 0;
// Don't delete channels; instead, call Channel::Close.
protected:
virtual ~Channel() {} virtual ~Channel() {}
}; };
...@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) { ...@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) {
template <typename T> template <typename T>
void CloseChannel(Channel<T>* ch) { void CloseChannel(Channel<T>* ch) {
if (ch->Cap() > 0) { ch->Close();
delete dynamic_cast<details::Buffered<T>*>(ch);
} else {
delete dynamic_cast<details::UnBuffered<T>*>(ch);
}
} }
} // namespace framework } // namespace framework
......
...@@ -14,13 +14,329 @@ limitations under the License. */ ...@@ -14,13 +14,329 @@ limitations under the License. */
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
#include <chrono>
#include <thread>
#include "gtest/gtest.h" #include "gtest/gtest.h"
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
TEST(Channel, MakeAndClose) { TEST(Channel, MakeAndClose) {
using paddle::framework::Channel; using paddle::framework::details::Buffered;
using paddle::framework::MakeChannel; using paddle::framework::details::UnBuffered;
using paddle::framework::CloseChannel; {
// MakeChannel should return a buffered channel is buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel is buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // should not block
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out), true); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
delete ch;
}
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
size_t sum = 0;
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size)
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
else
EXPECT_EQ(ch->Send(&i), false);
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
EXPECT_EQ(sum, 45U);
CloseChannel(ch);
t.join();
delete ch;
}
TEST(Channel, SimpleUnbufferedChannelTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
// Explicitly close the thread
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Verify that only 1 send was successful
ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any senders waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data = 10;
EXPECT_EQ(ch->Send(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
// Send should block after three iterations
// since we only have three receivers.
std::thread t([&]() {
// Try to send more number of times
// than receivers
for (int i = 0; i < 4; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 3; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
t.join();
delete ch;
}
TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
unsigned sum_receive = 0;
// The receiver should block after 5
// iterations, since there are only 5 senders.
std::thread t([&]() {
for (int i = 0; i < 8; i++) {
int recv;
ch->Receive(&recv); // should block after the fifth iteration.
EXPECT_EQ(recv, i);
sum_receive += i;
}
});
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
for (int i = 5; i < 8; i++) {
ch->Send(&i);
sum_send += i;
}
Channel<int>* ch = MakeChannel<int>(10);
CloseChannel(ch); CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 28U);
EXPECT_EQ(sum_receive, 28U);
delete ch;
} }
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <mutex> #include <mutex>
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
#include "paddle/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel<T> { ...@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*); friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public: public:
virtual void Send(T*); virtual bool Send(T*);
virtual void Receive(T*); virtual bool Receive(T*);
virtual size_t Cap() { return cap_; } virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private: private:
size_t cap_; size_t cap_;
...@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel<T> { ...@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel<T> {
std::condition_variable empty_cond_var_; std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_; std::condition_variable full_cond_var_;
std::deque<T> channel_; std::deque<T> channel_;
bool closed_;
Buffered(size_t cap) : cap_(cap) {} Buffered(size_t cap) : cap_(cap), closed_(false) {
virtual ~Buffered(); PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllSenders(std::unique_lock<std::mutex>*); void NotifyAllParticipants(std::unique_lock<std::mutex>*);
}; };
template <typename T> template <typename T>
void Buffered<T>::Send(T* item) { bool Buffered<T>::Send(T* item) {
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
bool ret = false;
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
ret = true;
}
return ret;
}
template <typename T>
bool Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; }); empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
channel_.push_back(std::move(*item)); bool ret = false;
lock.unlock(); if (!closed_) {
empty_cond_var_.notify_one(); *item = std::move(channel_.front());
channel_.pop_front();
full_cond_var_.notify_one();
ret = true;
}
return ret;
} }
template <typename T> template <typename T>
void Buffered<T>::Receive(T* item) { void Buffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); }); closed_ = true;
*item = std::move(channel_.front()); NotifyAllParticipants(&lock);
channel_.pop_front();
NotifyAllSenders(&lock);
} }
template <typename T> template <typename T>
Buffered<T>::~Buffered() { Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear(); channel_.clear();
NotifyAllSenders(&lock); NotifyAllParticipants(&lock);
} }
template <typename T> template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) { void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock(); lock->unlock();
full_cond_var_.notify_one(); full_cond_var_.notify_all();
empty_cond_var_.notify_all();
} }
} // namespace details } // namespace details
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <atomic>
#include <condition_variable> #include <condition_variable>
#include <deque>
#include <mutex> #include <mutex>
#include "paddle/framework/channel.h" #include "paddle/framework/channel.h"
...@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel<T> { ...@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*); friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public: public:
virtual void Send(T*); virtual bool Send(T*);
virtual void Receive(T*); virtual bool Receive(T*);
virtual size_t Cap() { return 0; } virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private: private:
UnBuffered() {} std::mutex mu_ch_;
virtual ~UnBuffered(); // Mutex for readers and writers who are waiting for other reader
// and writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_;
T* item{nullptr};
std::atomic<bool> closed_{false};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
}; };
// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
ret = true;
}
writer_found_ = false;
return ret;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
ret = true;
}
cv_channel_.notify_one();
}
reader_found_ = false;
return ret;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T> template <typename T>
void UnBuffered<T>::Send(T* channel_element) {} void UnBuffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T> template <typename T>
void UnBuffered<T>::Receive(T*) {} UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T> template <typename T>
UnBuffered<T>::~UnBuffered() {} void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -25,7 +25,7 @@ limitations under the License. */ ...@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/platform/profiler.h" #include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark); DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be " "Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."); "extremely slow so please use this flag wisely.");
...@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false, ...@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
...@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope); VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: " VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
} }
...@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) { if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: " VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
...@@ -20,5 +21,8 @@ namespace paddle { ...@@ -20,5 +21,8 @@ namespace paddle {
namespace framework { namespace framework {
using FeedFetchType = LoDTensor; using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>; using FeedFetchList = std::vector<FeedFetchType>;
static const std::string kFeedOpType = "feed";
static const std::string kFetchOpType = "fetch";
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string.h> // for strdup #include <string.h> // for strdup
#include <algorithm> #include <algorithm>
#include <stdexcept>
#include <string> #include <string>
#include "paddle/framework/init.h" #include "paddle/framework/init.h"
...@@ -46,17 +47,23 @@ void InitDevices() { ...@@ -46,17 +47,23 @@ void InitDevices() {
std::vector<platform::Place> places; std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
int count = 0;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int count = platform::GetCUDADeviceCount(); try {
for (int i = 0; i < count; ++i) { count = platform::GetCUDADeviceCount();
places.emplace_back(platform::CUDAPlace(i)); } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
#else #else
LOG(WARNING) LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option"; << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif #endif
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
} }
......
...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) { ...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices; using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
InitDevices(); InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_GE(pool.size(), 1U); ASSERT_EQ(pool.size(), 1U);
#endif
}
TEST(InitDevices, CUDA) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
} }
...@@ -24,8 +24,6 @@ limitations under the License. */ ...@@ -24,8 +24,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <iterator> #include <iterator>
#include <glog/logging.h>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif #endif
#include <glog/logging.h> #include <glog/logging.h>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h" #include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
...@@ -31,15 +31,6 @@ limitations under the License. */ ...@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#ifndef PADDLE_WITH_CUDA
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
/* /*
* LoD is short for Level of Details. * LoD is short for Level of Details.
* *
...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector< ...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7 * 0 2 4 7
* 0 2 5 7 10 12 15 20 * 0 2 5 7 10 12 15 20
*/ */
using LoD = std::vector<Vector<size_t>>; struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector;
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t); std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); ...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/ */
class LoDTensor : public Tensor { class LoDTensor : public Tensor {
public: public:
LoDTensor() {} LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {}
......
...@@ -23,6 +23,17 @@ ...@@ -23,6 +23,17 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
TEST(LoD, data) {
LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i);
}
}
TEST(LodExpand, test) { TEST(LodExpand, test) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stdio.h>
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h" #include "paddle/platform/assert.h"
...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) { ...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
} }
} }
TEST(Vector, Normal) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
paddle::framework::InitDevices();
paddle::framework::Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(LoD, data) {
paddle::framework::InitDevices();
paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
test<<<1, 1>>>(v.cuda_data(), v.size());
cudaDeviceSynchronize();
v.CopyFromCUDA();
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2);
}
}
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices();
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0); paddle::platform::CUDAPlace place(0);
...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) { ...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod(); auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size()); test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) { for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <initializer_list>
#include <vector>
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
/**
* @brief Vector support both cpu and gpu.
* host vector lifetime is same with Vector
* device vector is lazily malloc and modified.
*/
template <typename T>
class Vector : public std::vector<T> {
public:
using std::vector<T>::vector;
Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
#endif
}
/* Get device vector */
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_);
}
/* Get host vector */
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
/* Synchronize host vector to device vector */
void CopyToCUDA();
/* Synchronize device vector to host vector */
void CopyFromCUDA();
/* Switch device vector location */
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0; // device vector numel
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size()) {
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommitted cuda data.";
return;
}
this->resize(cuda_size_);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
#ifdef PADDLE_WITH_CUDA
auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr;
#endif
}
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework
} // namespace paddle
...@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasOutputs(const std::string &name) const override; bool HasOutputs(const std::string &name) const override;
DDim GetInputDim(const std::string &name) const override;
void SetOutputDim(const std::string &name, const DDim &dim) override;
AttrReader Attrs() const override; AttrReader Attrs() const override;
const std::vector<std::string> &Inputs( const std::vector<std::string> &Inputs(
...@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { ...@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
return true; return true;
} }
DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
std::vector<DDim> ddims = GetInputsDim(name);
auto length = ddims.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have 1 value, "
"but it has %d now",
name, length);
return ddims[0];
}
void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
const DDim &dim) {
SetOutputsDim(name, {dim});
}
AttrReader CompileTimeInferShapeContext::Attrs() const { AttrReader CompileTimeInferShapeContext::Attrs() const {
return AttrReader(op_.GetAttrMap()); return AttrReader(op_.GetAttrMap());
} }
......
...@@ -22,9 +22,7 @@ limitations under the License. */ ...@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
DEFINE_bool(op_sync, false, DECLARE_bool(benchmark);
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
return true; return true;
} }
DDim GetInputDim(const std::string& name) const override {
return GetDim(op_.Input(name));
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
SetDim(op_.Output(name), dim);
}
AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector<std::string>& Inputs( const std::vector<std::string>& Inputs(
...@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx)); ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_op_sync) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); new_dev_ctx->Wait();
} }
} }
......
...@@ -14,13 +14,11 @@ limitations under the License. */ ...@@ -14,13 +14,11 @@ limitations under the License. */
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
auto *b = desc_.add_blocks(); auto *b = desc_.add_blocks();
b->set_parent_idx(parent.ID()); b->set_parent_idx(parent.ID());
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h" #include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h" #include "paddle/platform/macros.h"
......
...@@ -20,9 +20,11 @@ limitations under the License. */ ...@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
DEFINE_bool(do_memory_benchmark, false, DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, " "Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs"); "and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync. // When making memory benchmark on Fluid, we have to delete scope sync.
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
delete scope; delete scope;
} else { } else {
Async([scope] { delete scope; }); Async([scope] { delete scope; });
......
...@@ -18,10 +18,18 @@ limitations under the License. */ ...@@ -18,10 +18,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::vector<framework::DDim> InferShapeContext::GetInputsDim( DDim InferShapeContext::GetInputDim(const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, arg_names.size());
return this->GetDim(arg_names[0]);
}
std::vector<DDim> InferShapeContext::GetInputsDim(
const std::string &name) const { const std::string &name) const {
const std::vector<std::string> &names = Inputs(name); const std::vector<std::string> &arg_names = Inputs(name);
return GetDims(names); return GetDims(arg_names);
} }
DDim InferShapeContext::GetInputsElementDim(const std::string &name, DDim InferShapeContext::GetInputsElementDim(const std::string &name,
...@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name, ...@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
return this->GetDim(names[idx]); return this->GetDim(names[idx]);
} }
void InferShapeContext::SetOutputsDim( void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
const std::string &name, const std::vector<framework::DDim> &dims) { auto &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, arg_names.size());
SetDim(arg_names[0], dim);
}
void InferShapeContext::SetOutputsDim(const std::string &name,
const std::vector<DDim> &dims) {
auto &names = Outputs(name); auto &names = Outputs(name);
SetDims(names, dims); SetDims(names, dims);
} }
std::vector<framework::DDim> InferShapeContext::GetDims( std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const { const std::vector<std::string> &names) const {
std::vector<framework::DDim> ret; std::vector<DDim> ret;
ret.reserve(names.size()); ret.reserve(names.size());
std::transform( std::transform(
names.begin(), names.end(), std::back_inserter(ret), names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); }); [this](const std::string &name) { return this->GetDim(name); });
return ret; return ret;
} }
void InferShapeContext::SetDims(const std::vector<std::string> &names, void InferShapeContext::SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims) { const std::vector<DDim> &dims) {
size_t length = names.size(); size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size()); PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
......
...@@ -35,14 +35,13 @@ class InferShapeContext { ...@@ -35,14 +35,13 @@ class InferShapeContext {
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
virtual framework::DDim GetInputDim(const std::string &name) const = 0; DDim GetInputDim(const std::string &name) const;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const; std::vector<DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const; DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputDim(const std::string &name, const DDim &dim);
void SetOutputsDim(const std::string &name, void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
const std::vector<framework::DDim> &dims);
virtual AttrReader Attrs() const = 0; virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs( virtual const std::vector<std::string> &Inputs(
...@@ -57,15 +56,13 @@ class InferShapeContext { ...@@ -57,15 +56,13 @@ class InferShapeContext {
// Note: In while op, we need this to be public // Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names, void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims); const std::vector<DDim> &dims);
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const DDim &dim) = 0;
std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarDesc::VarType> GetVarTypes( std::vector<proto::VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
......
...@@ -47,6 +47,11 @@ class Tensor { ...@@ -47,6 +47,11 @@ class Tensor {
public: public:
Tensor() : offset_(0) {} Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
inline T* data(); inline T* data();
...@@ -137,6 +142,7 @@ class Tensor { ...@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0; virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0; virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0; virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
}; };
template <typename Place> template <typename Place>
...@@ -156,6 +162,7 @@ class Tensor { ...@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); } virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; } virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; } virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */ /*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_; std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
......
...@@ -178,19 +178,22 @@ public: ...@@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data<real>(); real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>(); real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>(); real* outputData = outputs[0].data<real>();
real* colData = NULL;
bool needIm2col = isNeedIm2col(filter); bool needIm2col = isNeedIm2col(filter);
TensorShape imShape = TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth}); TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape; TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; // Max col matrix width 4096, Max col matrix size 4M.
size_t colWidth = outputHeight * outputWidth; size_t outputHeightSteps =
// Max col matrix height 256, Max col matrix width 1024 std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256)); size_t maxColWidth = outputHeightSteps * outputWidth;
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048)); size_t channelSteps =
std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
(size_t)1),
inputChannels / groups_);
size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) { if (needIm2col) {
colShape = TensorShape({inputChannels / groups_, colShape = TensorShape({inputChannels / groups_,
...@@ -199,7 +202,7 @@ public: ...@@ -199,7 +202,7 @@ public:
outputHeight, outputHeight,
outputWidth}); outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real)); resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf()); colData = reinterpret_cast<real*>(memory_->getBuf());
} }
...@@ -209,20 +212,24 @@ public: ...@@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_; size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth; int nStride = outputHeight * outputWidth;
int kStride = colHeight; int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) { for (size_t i = 0; i < batchSize; i++) {
filterData = inputs[1].data<real>();
for (size_t g = 0; g < groups_; g++) { for (size_t g = 0; g < groups_; g++) {
if (needIm2col) { if (needIm2col) {
real beta_ = beta; real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight; for (size_t ic = 0; ic < inputChannels / groups_;
colHeightStart += stepColHeight) { ic += channelSteps) {
for (size_t colWidthStart = 0; colWidthStart < colWidth; int channels = std::min(inputChannels / groups_ - ic, channelSteps);
colWidthStart += stepColWidth) { for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
int N = std::min(colWidth - colWidthStart, stepColWidth); int height = std::min(outputHeight - oh, outputHeightSteps);
int K = std::min(colHeight - colHeightStart, stepColHeight);
int M = outputChannels / groups_;
int N = height * outputWidth;
int K = channels * filterHeight * filterWidth;
// im2col // im2col
im2col(inputData + g * inputOffset, im2col(inputData,
imShape, imShape,
colData, colData,
colShape, colShape,
...@@ -232,13 +239,12 @@ public: ...@@ -232,13 +239,12 @@ public:
paddingW(), paddingW(),
dilationH(), dilationH(),
dilationW(), dilationW(),
colHeightStart, channels,
K, oh,
colWidthStart, height,
N); N);
// gemm // gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute( BlasGemm<Device, real>::compute(
false, false,
false, false,
...@@ -246,12 +252,12 @@ public: ...@@ -246,12 +252,12 @@ public:
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset + colHeightStart, filterData + ic * filterHeight * filterWidth,
kStride, kStride,
colData, colData,
N, N,
beta_, beta_,
outputData + g * outputOffset + colWidthStart, outputData + oh * outputWidth,
nStride); nStride);
} }
beta_ = 1.0; beta_ = 1.0;
...@@ -266,17 +272,18 @@ public: ...@@ -266,17 +272,18 @@ public:
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset, filterData,
K, K,
inputData + g * inputOffset, inputData,
N, N,
beta, beta,
outputData + g * outputOffset, outputData,
N); N);
} }
inputData += inputOffset;
outputData += outputOffset;
filterData += filterOffset;
} }
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
} }
memory_.reset(); memory_.reset();
......
...@@ -111,39 +111,42 @@ public: ...@@ -111,39 +111,42 @@ public:
int paddingWidth, int paddingWidth,
int dilationHeight, int dilationHeight,
int dilationWidth, int dilationWidth,
int colHeightStart, int inputChannels,
int colHeightSize, int colOffset,
int colWidthStart, int colOutputHeight,
int colWidthSize) { int colWidth) {
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
int filterHeight = colShape[1]; int filterHeight = colShape[1];
int filterWidth = colShape[2]; int filterWidth = colShape[2];
int outputWidth = colShape[4]; int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) { for (int ic = 0; ic < inputChannels; ic++) {
int wOffset = (colHeightStart + colh) % filterWidth; for (int oh = 0; oh < colOutputHeight; oh++) {
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; T* dstData = colData + oh * outputWidth;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight; for (int fh = 0; fh < filterHeight; fh++) {
for (int fw = 0; fw < filterWidth; fw++) {
for (int colw = 0; colw < colWidthSize; colw++) { int imRowIdx = (oh + colOffset) * strideHeight +
int h = (colWidthStart + colw) / outputWidth; fh * dilationHeight - paddingHeight;
int w = (colWidthStart + colw) % outputWidth; if (imRowIdx < 0 || imRowIdx >= inputHeight) {
memset(dstData, 0, outputWidth * sizeof(T));
int imRowIdx = h * strideHeight + hOffset * dilationHeight; } else {
int imColIdx = w * strideWidth + wOffset * dilationWidth; for (int ow = 0; ow < outputWidth; ow++) {
if ((imRowIdx - paddingHeight) < 0 || int imColIdx =
(imRowIdx - paddingHeight) >= inputHeight || ow * strideWidth + fw * dilationWidth - paddingWidth;
(imColIdx - paddingWidth) < 0 || if (imColIdx < 0 || imColIdx >= inputWidth) {
(imColIdx - paddingWidth) >= inputWidth) { dstData[ow] = T(0);
colData[colh * colWidthSize + colw] = static_cast<T>(0); } else {
} else { dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
imRowIdx += c_im * inputHeight - paddingHeight; }
imColIdx -= paddingWidth; }
colData[colh * colWidthSize + colw] = }
imData[imRowIdx * inputWidth + imColIdx]; dstData += colWidth;
}
} }
} }
colData += filterHeight * filterWidth * colWidth;
imData += inputHeight * inputWidth;
} }
} }
}; };
......
...@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() { ...@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding, padding,
dilation, dilation,
dilation, dilation,
channels,
0, 0,
height, outputHeight,
0, outputHeight * outputWidth);
width);
autotest::TensorCheckEqual(*output1, *output2); autotest::TensorCheckEqual(*output1, *output2);
} }
......
set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init) set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
...@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS ...@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib) gflags_lib glog_lib protobuf_lib eigen3_lib)
add_executable(example example.cc) if(WITH_TESTING)
if(APPLE) add_subdirectory(tests/book)
set(OPTIONAL_LINK_FLAGS)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(example
-Wl,-force_load paddle_fluid
${OPTIONAL_LINK_FLAGS}
${PTOOLS_LIB})
else()
target_link_libraries(example
-Wl,--start-group -Wl,--whole-archive paddle_fluid
-Wl,--no-whole-archive -Wl,--end-group
${PTOOLS_LIB})
endif() endif()
...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/inference/io.h" #include "paddle/inference/io.h"
#include <fstream> #include <fstream>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
const std::string kFeedOpType = "feed";
bool IsParameter(const framework::VarDesc* var, bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) { const framework::ProgramDesc& main_program) {
if (var->Persistable()) { if (var->Persistable()) {
...@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var, ...@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var,
for (size_t i = 0; i < main_program.Size(); ++i) { for (size_t i = 0; i < main_program.Size(); ++i) {
const framework::BlockDesc& block = main_program.Block(i); const framework::BlockDesc& block = main_program.Block(i);
for (auto* op : block.AllOps()) { for (auto* op : block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == framework::kFeedOpType) {
continue; continue;
} }
for (auto input_argument_name : op->InputArgumentNames()) { for (auto input_argument_name : op->InputArgumentNames()) {
...@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor, ...@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor,
framework::BlockDesc* load_block = load_program->MutableBlock(0); framework::BlockDesc* load_block = load_program->MutableBlock(0);
for (auto* var : global_block.AllVars()) { for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) { if (IsParameter(var, main_program)) {
LOG(INFO) << "parameter's name: " << var->Name(); VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name()); framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape()); new_var->SetShape(var->Shape());
......
...@@ -17,18 +17,13 @@ limitations under the License. */ ...@@ -17,18 +17,13 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
#include "paddle/framework/var_desc.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program);
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor,
framework::Scope& scope, framework::Scope& scope,
const std::string& dirname, const std::string& dirname,
......
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits)
...@@ -12,93 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,93 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include <time.h> #include <time.h>
#include <iostream> #include <sstream>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h" #include "paddle/inference/io.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
int main(int argc, char** argv) { template <typename Place, typename T>
google::ParseCommandLineFlags(&argc, &argv, true); void TestInference(const std::string& dirname,
if (FLAGS_dirname.empty()) { const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
// Example: std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// ./example --dirname=recognize_digits_mlp.inference.model // 1. Define place, executor and scope
std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl; auto place = Place();
exit(1); auto executor = paddle::framework::Executor(place);
}
// 1. Define place, executor, scope
auto place = paddle::platform::CPUPlace();
paddle::framework::InitDevices();
auto* executor = new paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope(); auto* scope = new paddle::framework::Scope();
std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl; // 2. Initialize the inference_program and load all parameters from file
std::string dirname = FLAGS_dirname; auto inference_program = paddle::inference::Load(executor, *scope, dirname);
// 2. Initialize the inference program
auto inference_program = paddle::inference::Load(*executor, *scope, dirname);
// 3. Optional: perform optimization on the inference_program // 3. Get the feed_target_names and fetch_target_names
// 4. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names = const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames(); inference_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names = const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames(); inference_program->GetFetchTargetNames();
// 5. Generate input // 4. Prepare inputs: set up maps for feed targets
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor> feeds;
feeds.push_back(input);
std::vector<paddle::framework::LoDTensor> fetchs;
// Set up maps for feed and fetch targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets; std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
// set_feed_variable
for (size_t i = 0; i < feed_target_names.size(); ++i) { for (size_t i = 0; i < feed_target_names.size(); ++i) {
feed_targets[feed_target_names[i]] = &feeds[i]; // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
} }
// get_fetch_variable // 5. Define Tensor to get the outputs: set up maps for fetch targets
fetchs.resize(fetch_target_names.size()); std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) { for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = &fetchs[i]; fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
} }
// Run the inference program // 6. Run the inference program
executor->Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets);
// Get outputs delete scope;
for (size_t i = 0; i < fetchs.size(); ++i) { }
auto dims_i = fetchs[i].dims();
std::cout << "dims_i:"; TEST(inference, recognize_digits) {
for (int j = 0; j < dims_i.size(); ++j) { if (FLAGS_dirname.empty()) {
std::cout << " " << dims_i[j]; LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
std::cout << std::endl;
std::cout << "result:";
float* output_ptr = fetchs[i].data<float>();
for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
std::cout << " " << output_ptr[j];
}
std::cout << std::endl;
} }
delete scope; LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
delete executor; std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
return 0; paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
float err = 1E-3;
int count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
#endif
} }
...@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, maskMatP->getWidth()); CHECK_EQ(channels * outLength, maskMatP->getWidth());
} }
/* initialize the data_ */
for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) {
outData[i * outStride + j] = -(real)FLT_MAX;
}
}
/* pool max one by one */ /* pool max one by one */
for (size_t n = 0; n < num; ++n) { // frame by frame for (size_t n = 0; n < num; ++n) { // frame by frame
if (!isContiguous()) { if (!isContiguous()) {
...@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t c = 0; c < channels; ++c) { // channel by channel
for (size_t ph = 0; ph < outputH; ++ph) { for (size_t ph = 0; ph < outputH; ++ph) {
int hstart = ph * strideH - paddingH; int hstart = ph * strideH - paddingH;
int hend = std::min(hstart + sizeY, imgSizeH); int hend = hstart + sizeY;
hstart = std::max(hstart, 0); hstart = hstart < 0 ? 0 : hstart;
hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
for (size_t pw = 0; pw < outputW; ++pw) { for (size_t pw = 0; pw < outputW; ++pw) {
int wstart = pw * strideW - paddingW; int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW); int wend = wstart + sizeX;
wstart = std::max(wstart, 0); wstart = wstart < 0 ? 0 : wstart;
wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) { if (maskData == NULL) {
real tmp = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max( tmp = tmp < inputData[h * imgSizeW + w]
outData[ph * outputW + pw], inputData[h * imgSizeW + w]); ? inputData[h * imgSizeW + w]
: tmp;
} }
} }
outData[ph * outputW + pw] = tmp;
} else { } else {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
......
...@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE) ...@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op) set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
endif() endif()
op_library(cond_op DEPS framework_proto tensor net_op) op_library(cond_op DEPS framework_proto tensor net_op)
...@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor) ...@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
# Regist multiple Kernel to pybind # Regist multiple Kernel to pybind
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
vol2col depthwise_conv)
op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
...@@ -173,6 +178,8 @@ endif() ...@@ -173,6 +178,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
...@@ -192,3 +199,4 @@ if(WITH_GPU) ...@@ -192,3 +199,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func; math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad); auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>(); auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
auto& merge_rows = grad_merge.rows(); framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m // 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func; math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge); auto grad_square = sqare_func(context, grad_merge, grad_merge);
...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
SparseAdagradFunctorKernel< SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0, T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge.rows().data(), .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
lr, param_data, moment_data, grad_width, param_data, moment_data, grad_width,
epsilon); epsilon);
} }
}; };
......
...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), grad); merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
auto* rows = grad_merge.rows().data(); int64_t* rows = nullptr;
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->cuda_data();
} else {
rows = grad_merge.mutable_rows()->data();
}
auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor( SparseAdamFunctor<T> functor(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { ...@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"), PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of BipartiteMatch should not be null."); "Input(DistMat) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchIndices"),
"Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColToRowMatchDist"),
"Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
auto dims = ctx->GetInputDim("DistMat"); auto dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("ColToRowMatchIndices", dims); ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDis", dims); ctx->SetOutputDim("ColToRowMatchDist", dims);
} }
}; };
...@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input<LoDTensor>("DistMat"); auto* dist_mat = context.Input<LoDTensor>("DistMat");
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices"); auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDis"); auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>(); auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
...@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise, it means B[j] is matched to row " "Otherwise, it means B[j] is matched to row "
"ColToRowMatchIndices[i][j] in i-th instance. The row number of " "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
"i-th instance is saved in ColToRowMatchIndices[i][j]."); "i-th instance is saved in ColToRowMatchIndices[i][j].");
AddOutput("ColToRowMatchDis", AddOutput("ColToRowMatchDist",
"(Tensor) A 2-D Tensor with shape [N, M] in float type. " "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
"N is batch size. If ColToRowMatchIndices[i][j] is -1, " "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
"ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed " "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchIndices[i][j] = d, and the row offsets of each " "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
"instance are called LoD. Then " "instance are called LoD. Then "
"ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]"); "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
AddComment(R"DOC( AddComment(R"DOC(
This operator is a greedy bipartite matching algorithm, which is used to This operator is a greedy bipartite matching algorithm, which is used to
obtain the matching with the maximum distance based on the input obtain the matching with the maximum distance based on the input
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
namespace paddle {
namespace operators {
class BoxCoderOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
"Input(PriorBox) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
"Input(PriorBoxVar) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
"Input(TargetBox) of BoxCoderOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
"Output(OutputBox) of BoxCoderOp should not be null.");
auto prior_box_dims = ctx->GetInputDim("PriorBox");
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
auto target_box_dims = ctx->GetInputDim("TargetBox");
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
"The rank of Input of PriorBoxVar must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
ctx->SetOutputDim(
"OutputBox",
framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
}
};
class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"PriorBox",
"(Tensor, default Tensor<float>) "
"Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
"each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the anchor box, "
"if the input is image feature map, they are close to the origin "
"of the coordinate system. [xmax, ymax] is the right bottom "
"coordinate of the anchor box.");
AddInput("PriorBoxVar",
"(Tensor, default Tensor<float>) "
"PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
"of variance.");
AddInput(
"TargetBox",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the box if the input "
"is image feature map, they are close to the origin of the coordinate "
"system. [xmax, ymax] is the right bottom coordinate of the box. "
"This tensor can contain LoD information to represent a batch "
"of inputs. One instance of this batch can contain different "
"numbers of entities.");
AddAttr<std::string>("code_type",
"(string, default encode_center_size) "
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
AddOutput(
"OutputBox",
"(LoDTensor or Tensor) "
"(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
"representing the result of N target boxes encoded/decoded with "
"M Prior boxes and variances.");
AddComment(R"DOC(
Bounding Box Coder Operator.
Encode/Decode the target bounding box with the priorbox information.
The Encoding schema described below:
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = log(abs(tw / pw)) / pwv
oh = log(abs(th / ph)) / phv
The Decoding schema described below:
ox = (pw * pxv * tx * + px) - tw / 2
oy = (ph * pyv * ty * + py) - th / 2
ow = exp(pwv * tw) * pw + tw / 2
oh = exp(phv * th) * ph + th / 2
where tx, ty, tw, th denote the target box's center coordinates, width and
height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
width and height.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
ops::BoxCoderKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/box_coder_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
prior_box_data[col_idx * len + 1]) /
2;
T target_box_center_x =
(target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
2;
T target_box_center_y = (target_box_data[row_idx * len + 3] +
target_box_data[row_idx * len + 1]) /
2;
T target_box_width =
target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
T target_box_height =
target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
output[idx * len] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[col_idx * len];
output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
prior_box_height /
prior_box_var_data[col_idx * len + 1];
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
prior_box_var_data[col_idx * len + 2];
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
prior_box_var_data[col_idx * len + 3];
}
}
template <typename T>
__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
prior_box_data[col_idx * len + 1]) /
2;
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
target_box_data[row_idx * len + 2]) *
prior_box_width;
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
target_box_data[row_idx * len + 3]) *
prior_box_height;
T target_box_center_x = prior_box_var_data[col_idx * len] *
target_box_data[row_idx * len] *
prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
target_box_data[row_idx * len + 1] *
prior_box_height +
prior_box_center_y;
output[idx * len] = target_box_center_x - target_box_width / 2;
output[idx * len + 1] = target_box_center_y - target_box_height / 2;
output[idx * len + 2] = target_box_center_x + target_box_width / 2;
output[idx * len + 3] = target_box_center_y + target_box_height / 2;
}
}
template <typename T>
class BoxCoderCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
"Only support 1 level of LoD.");
}
auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
auto len = prior_box->dims()[1];
int block = 512;
int grid = (row * col + block - 1) / block;
auto& device_ctx = context.cuda_device_context();
const T* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = prior_box_var->data<T>();
const T* target_box_data = target_box->data<T>();
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>();
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
ops::BoxCoderCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) {
if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") {
return BoxCodeType::kDecodeCenterSize;
}
PADDLE_THROW("Not support type %s.", type);
}
template <typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
public:
void EncodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
auto* target_box_data = target_box.data<T>();
auto* prior_box_data = prior_box.data<T>();
auto* prior_box_var_data = prior_box_var.data<T>();
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x =
(target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
T target_box_center_y =
(target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
T target_box_width =
target_box_data[i * len + 2] - target_box_data[i * len];
T target_box_height =
target_box_data[i * len + 3] - target_box_data[i * len + 1];
size_t offset = i * col * len + j * len;
output[offset] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[j * len];
output[offset + 1] = (target_box_center_y - prior_box_center_y) /
prior_box_height / prior_box_var_data[j * len + 1];
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width)) /
prior_box_var_data[j * len + 2];
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)) /
prior_box_var_data[j * len + 3];
}
}
}
void DecodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
auto* target_box_data = target_box.data<T>();
auto* prior_box_data = prior_box.data<T>();
auto* prior_box_var_data = prior_box_var.data<T>();
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x = prior_box_var_data[j * len] *
target_box_data[i * len] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[j * len + 1] *
target_box_data[i * len + 1] *
prior_box_height +
prior_box_center_y;
T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
target_box_data[i * len + 2]) *
prior_box_width;
T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
target_box_data[i * len + 3]) *
prior_box_height;
size_t offset = i * col * len + j * len;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
output[offset + 3] = target_box_center_y + target_box_height / 2;
}
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
"Only support 1 level of LoD.");
}
auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
auto len = prior_box->dims()[1];
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
T* output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -54,7 +54,15 @@ class CompareOpKernel ...@@ -54,7 +54,15 @@ class CompareOpKernel
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEM_TYPE; using T = typename Functor::ELEM_TYPE;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context); using Tensor = framework::Tensor;
auto* x = context.Input<Tensor>("X");
auto* y = context.Input<Tensor>("Y");
auto* z = context.Output<Tensor>("Out");
z->mutable_data<T>(context.GetPlace());
int axis = context.Attr<int>("axis");
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
z);
} }
}; };
......
...@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise convolution op
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>, conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>); ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -16,6 +16,16 @@ limitations under the License. */ ...@@ -16,6 +16,16 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>, conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>); ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/depthwise_conv.h"
#include "paddle/operators/math/im2col.h" #include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h" #include "paddle/operators/math/vol2col.h"
...@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
} }
} }
}; };
template <typename DeviceContext, typename T>
class DepthwiseConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_EQ(
output->dims()[1] % input->dims()[1], 0,
"The output channels must be a multiple of the input channels");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, input_grad);
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
filter_grad);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> { ...@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>( MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq, input_lod[level].data(), blank, num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
merge_repeated, dev_out_lod0_ptr, output_data); merge_repeated, dev_out_lod0_ptr, output_data);
// set output lod // set output lod
thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(), std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
dev_out_lod0.end());
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(host_out_lod0); out_lod.push_back(host_out_lod0);
output->set_lod(out_lod); output->set_lod(out_lod);
......
...@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
"'dropout_prob' must be between 0.0 and 1.0."); "'dropout_prob' must be between 0.0 and 1.0.");
}); });
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in "
"training. Setting this flag to true is only useful in "
"unittest or for debug that always the same output units "
"will be dropped.")
.SetDefault(false);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0); AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace()); auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int size = framework::product(mask->dims()); int size = framework::product(mask->dims());
int seed = context.Attr<int>("seed");
std::random_device rnd;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin, index_sequence_begin + size, thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(mask_data), thrust::device_ptr<T>(mask_data),
......
...@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> { ...@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace()); auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int seed = context.Attr<int>("seed");
// NOTE: fixed seed should only be used in unittest or for debug.
// Guarantee to use random seed in training.
std::random_device rnd;
std::minstd_rand engine; std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed); engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1); std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(mask->dims()); size_t size = framework::product(mask->dims());
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> { class ElementwiseAddKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T> ...@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T>
class ElementwiseAddGradKernel : public framework::OpKernel<T> { class ElementwiseAddGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
ElementwiseAddBroadCastGradFunctor<T>, ElementwiseAddBroadCastGradFunctor<T>,
ElementwiseAddBroadCast2GradFunctor<T>>(ctx); ElementwiseAddBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseDivKernel : public framework::OpKernel<T> { class ElementwiseDivKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T> ...@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T>
class ElementwiseDivGradKernel : public framework::OpKernel<T> { class ElementwiseDivGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
ElementwiseDivBroadCastGradFunctor<T>, ElementwiseDivBroadCastGradFunctor<T>,
ElementwiseDivBroadCast2GradFunctor<T>>(ctx); ElementwiseDivBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMaxKernel : public framework::OpKernel<T> { class ElementwiseMaxKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T> ...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMaxGradKernel : public framework::OpKernel<T> { class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
ElementwiseMaxBroadCastGradFunctor<T>, ElementwiseMaxBroadCastGradFunctor<T>,
ElementwiseMaxBroadCast2GradFunctor<T>>(ctx); ElementwiseMaxBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T> ...@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMinKernel : public framework::OpKernel<T> { class ElementwiseMinKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T> ...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMinGradKernel : public framework::OpKernel<T> { class ElementwiseMinGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
ElementwiseMinBroadCastGradFunctor<T>, ElementwiseMinBroadCastGradFunctor<T>,
ElementwiseMinBroadCast2GradFunctor<T>>(ctx); ElementwiseMinBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T> ...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
class ElementwiseMulKernel : public framework::OpKernel<T> { class ElementwiseMulKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T> ...@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel : public framework::OpKernel<T> { class ElementwiseMulGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
ElementwiseMulBroadCastGradFunctor<T>, ElementwiseMulBroadCastGradFunctor<T>,
ElementwiseMulBroadCast2GradFunctor<T>>(ctx); ElementwiseMulBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV); ...@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
template <typename DeviceContext, typename T, typename functor, template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor> typename broadcastfunctor, typename broadcast2functor>
void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
const framework::Tensor* x,
const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, int axis,
framework::Tensor* dx, framework::Tensor* dy) {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims = y->dims();
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
} }
...@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { ...@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
x_dims = framework::make_ddim(extended_dims); x_dims = framework::make_ddim(extended_dims);
} }
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
int pre, n, post; int pre, n, post;
...@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { ...@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
template <typename Functor, typename DeviceContext, typename T, template <typename Functor, typename DeviceContext, typename T,
typename OutType = T> typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
using Tensor = framework::Tensor; const framework::Tensor* x,
const framework::Tensor* y, int axis,
auto* x = ctx.Input<Tensor>("X"); framework::Tensor* z) {
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<OutType>(ctx.GetPlace());
TransformFunctor<Functor, T, DeviceContext, OutType> functor( TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), Functor()); x, y, z, ctx.template device_context<DeviceContext>(), Functor());
...@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) { ...@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
x_dims = framework::make_ddim(extended_dims); x_dims = framework::make_ddim(extended_dims);
} }
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)"); "Axis should be in range [0, x_dims)");
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/elementwise_pow_op.h"
#include "paddle/operators/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwisePowOpMaker : public ElementwiseOpMaker {
public:
ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
SetComment("Pow", "Out = X ^ Y");
AddComment(comment_);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
ops::ElementwisePowOpMaker);
REGISTER_OP_CPU_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/elementwise_pow_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include "paddle/operators/elementwise_op_function.h"
namespace paddle {
namespace operators {
template <typename T>
struct PowFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
};
template <typename DeviceContext, typename T>
class ElementwisePowKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
}
};
} // namespace operators
} // namespace paddle
...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T> ...@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
class ElementwiseSubKernel : public framework::OpKernel<T> { class ElementwiseSubKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
} }
}; };
...@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T> ...@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T>
class ElementwiseSubGradKernel : public framework::OpKernel<T> { class ElementwiseSubGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>, ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
ElementwiseSubBroadCastGradFunctor<T>, ElementwiseSubBroadCastGradFunctor<T>,
ElementwiseSubBroadCast2GradFunctor<T>>(ctx); ElementwiseSubBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -52,7 +52,11 @@ class FeedOp : public framework::OperatorBase { ...@@ -52,7 +52,11 @@ class FeedOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::Copy(feed_item, place, dev_ctx, out_item); if (platform::is_same_place(feed_item.place(), place)) {
out_item->ShareDataWith(feed_item);
} else {
framework::Copy(feed_item, place, dev_ctx, out_item);
}
out_item->set_lod(feed_item.lod()); out_item->set_lod(feed_item.lod());
} }
}; };
......
...@@ -30,11 +30,12 @@ using Tensor = framework::Tensor; ...@@ -30,11 +30,12 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx, inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle; math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace()); dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src); row_shuffle(ctx, src, index_lod, *dst, indexed_src);
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.state_weight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0; Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (h0) { if (h0) {
// Since the batch computing for GRU reorders the input sequences // Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs // according to their length. The initialized cell state also needs
...@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
Tensor ordered_h0, ordered_h0_grad; Tensor ordered_h0, ordered_h0_grad;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (h0) { if (h0) {
ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0, ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
true); true);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/label_smooth_op.h"
namespace paddle {
namespace operators {
class LabelSmoothOp : public framework::OperatorWithKernel {
public:
LabelSmoothOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LabelSmoothOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LabelSmoothOp should not be null.");
auto in_dims = ctx->GetInputDim("X");
if (ctx->HasInput("PriorDist")) {
auto noise_dims = ctx->GetInputDim("PriorDist");
auto noise_numel = paddle::framework::product(noise_dims);
PADDLE_ENFORCE(
in_dims[1] == noise_numel,
"The number of elements in Input(PriorDist) must be equal to the "
"dimension of each label.");
}
ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("Out", in_dims);
}
};
class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(LoDTensor) The input labels of LabelSmooth operator. This "
"input can be batched labels in one-hot encoding or output from "
"softmax, with shape [N x K], where N is the batch size and K is "
"the number of classes");
AddInput("PriorDist",
"(Tensor, optional)"
"The prior distribution to be added to the smoothed label. It is "
"fixed during training and the number of elements should be equal "
"to the dimension K of each label. Default is uniform "
"distribution and each element will be set to 1/K if not provided "
"in input.")
.AsDispensable();
AddOutput("Out",
"(loDTensor) The smoothed label of LabelSmooth operator. It has"
"the same shape and LoD with the Input(LoDTensor).");
AddAttr<float>("epsilon",
"(float, default 0.0f)"
"The smoothing parameter of LabelSmooth operator.")
.SetDefault(0.0f);
AddComment(R"DOC(
LabelSmooth Operator.
Label smoothing is a mechanism to regularize the classifier layer. In machine
learning, optimizing the log-likelihood of the correct label directly may
cause two problems. First, it may result in overfitting: if the model learns
to assign full probability to the ground-truth label for each training example,
it is not guaranteed to generalize. Second, it encourages the differences
between the largest logit and all others to become large, reducing the ability
of the model to adapt. Label smoothing is proposed to encourage the model to
be less confident, which replaces the ground-truth label $y$ with the weighted
sum of itself and some fixed distribution $\mu$, i.e.
$$
\tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
$$
where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and
$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for
$\mu$. This change in the ground-truth label is called label-smoothing
regularization or LSR.
See more details about label smoothing in https://arxiv.org/abs/1512.00567.
)DOC");
}
};
class LabelSmoothGradOp : public framework::OperatorWithKernel {
public:
LabelSmoothGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
label_smooth_grad, ops::LabelSmoothGradOp);
REGISTER_OP_CPU_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
label_smooth_grad,
ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/label_smooth_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
label_smooth_grad,
ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class LabelSmoothKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* out_t = ctx.Output<framework::LoDTensor>("Out");
auto* in_t = ctx.Input<framework::LoDTensor>("X");
auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
auto label_dim = in_t->dims()[1];
out_t->mutable_data<T>(ctx.GetPlace());
auto epsilon = ctx.Attr<float>("epsilon");
auto out = framework::EigenVector<T>::Flatten(*out_t);
auto in = framework::EigenVector<T>::Flatten(*in_t);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
if (dist_t) {
auto dist = framework::EigenVector<T>::Flatten(*dist_t);
out.device(dev) =
static_cast<T>(1 - epsilon) * in +
epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
} else {
out.device(dev) = static_cast<T>(1 - epsilon) * in +
static_cast<T>(epsilon / label_dim);
}
}
};
template <typename DeviceContext, typename T>
class LabelSmoothGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_in_t->mutable_data<T>(ctx.GetPlace());
auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);
auto epsilon = ctx.Attr<float>("epsilon");
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/layer_norm_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenMatrixMapRowMajor = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
template <typename T>
using ConstEigenMatrixMapRowMajor = Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
class LayerNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"),
"Output(Y) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Mean"),
"Output(Mean) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Variance"),
"Output(Variance) of LayerNormOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
"'begin_norm_axis' must be less than the rank of X.");
auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
}
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
ctx->SetOutputDim("Mean", {left});
ctx->SetOutputDim("Variance", {left});
ctx->ShareLoD("X", "Y");
}
};
class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor) The input tensor.");
AddInput("Scale",
"(Tensor, optional) Scale is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.")
.AsDispensable();
AddInput("Bias",
"(Tensor, optional) Bias is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.")
.AsDispensable();
AddOutput("Y", "(LoDTensor) Result after normalization.");
AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
.AsIntermediate();
AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
.AsIntermediate();
AddAttr<float>("epsilon",
"(float, default 1e-5) Constant for "
"numerical stability")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
"'epsilon' should be between 0.0 and 0.001.");
});
AddAttr<int>("begin_norm_axis",
"(int default:1), the "
"axis of `begin_norm_axis ... Rank(X) - 1` will be "
"normalized. `begin_norm_axis` splits the tensor(`X`) to a "
"matrix [N,H].")
.SetDefault(1)
.AddCustomChecker([](const int &begin_norm_axis) {
PADDLE_ENFORCE_GT(begin_norm_axis, 0,
"'begin_norm_axis' should be greater than zero.");
});
AddComment(R"DOC(
Layer Normalization.
Layer Norm has been implemented as discussed in the paper:
https://arxiv.org/abs/1607.06450
...
)DOC");
}
};
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto *output = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
output->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
auto squre = [](T ele) { return ele * ele; };
auto add_epslion = [epsilon](T ele) { return ele + epsilon; };
mean_map = input_map.rowwise().mean();
var_map = (input_map - mean_map.replicate(1, right))
.unaryExpr(squre)
.rowwise()
.mean()
.unaryExpr(add_epslion);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): Some thinking about output_map, is it appropriate that
// `output_map` and `input_map` point to the same memory.
auto inv_std = var_map.unaryExpr(inv_std_func);
if (scale && bias) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1)) +
bias_map.replicate(left, 1);
} else if (scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1));
} else if (bias) {
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right)) +
bias_map.replicate(left, 1);
} else {
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right));
}
}
};
class LayerNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"),
"Input(Mean) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Variance"),
"Input(Variance) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) of LayerNormOp should not be null.");
// check output
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"),
ctx->GetInputDim("Scale"));
}
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.GetPlace());
}
};
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
const auto *var = ctx.Input<Tensor>("Variance");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
d_bias_map = d_y_map.colwise().sum();
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
auto d_scale_map =
EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// There are two equation to compute d_scale. One uses "Y" and the other
// does not use "Y"
d_scale_map =
((x_map - mean_map.replicate(1, right))
.cwiseProduct(
var_map.unaryExpr(inv_std_func).replicate(1, right))
.cwiseProduct(d_y_map))
.colwise()
.sum();
}
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
auto triple_product_func = [](T ele) { return ele * ele * ele; };
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): these code can be refined
if (d_scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1));
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1))
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
} else {
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map);
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OP_CPU_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/executor.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/grpc_server.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
var->GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW(
"VariableMessage type %d is not in "
"[LoDTensor, SelectedRows]",
var_type);
}
}
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
void Stop() override {
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_service_->ShutDown();
server_thread_->join();
}
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
}
return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
}
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with lazy mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit";
exit_flag = true;
break;
} else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << grad_var_name;
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
rpc_service_->ShutDown();
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
};
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddComment(R"DOC(
ListenAndServ operator
This operator will start a RPC server which can receive variables
from send_op and send back variables to recv_op.
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(kOptimizeBlock,
"BlockID to run on server side.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<int>("Fanin", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp,
ops::ListenAndServOpMaker);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
class LoadCombineOp : public framework::OperatorBase {
public:
LoadCombineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path");
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
auto out_var_names = Outputs("Out");
PADDLE_ENFORCE_GT(
static_cast<int>(out_var_names.size()), 0,
"The number of output variables should be greater than 0.");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < out_var_names.size(); i++) {
auto *out_var = scope.FindVar(out_var_names[i]);
PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
out_var_names[i]);
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
filename);
// Get data from fin to tensor
DeserializeFromStream(fin, tensor, dev_ctx);
if (platform::is_gpu_place(place)) {
// copy CPU to GPU
framework::LoDTensor cpu_tensor;
cpu_tensor.ShareDataWith(*tensor);
cpu_tensor.set_lod(tensor->lod());
// reset tensor
out_var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->set_lod(cpu_tensor.lod());
Copy(cpu_tensor, place, dev_ctx, tensor);
}
}
}
};
class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput(
"Out",
"(vector) The output LoDTensors that will be read from the input file.")
.AsDuplicable();
AddAttr<std::string>("file_path",
"(string) "
"LoDTensors will be loaded from \"file_path\".")
.AddCustomChecker(
[](const std::string &path) { return !path.empty(); });
AddComment(R"DOC(
LoadCombine Operator.
LoadCombine operator loads LoDTensor variables from a file. The file should
contain one or more LoDTensors serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
with the SaveCombine operator, and can only deserialize one or more LoDTensors
that were saved using the SaveCombine operator.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
ops::LoadCombineOpProtoMaker);
...@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
new_rows.resize(ids_dim[0]); new_rows.resize(ids_dim[0]);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
ids_dim[0] * sizeof(int64_t), stream); ids_data, ids_dim[0] * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
......
...@@ -27,11 +27,12 @@ using Tensor = framework::Tensor; ...@@ -27,11 +27,12 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx, inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle; math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace()); dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src); row_shuffle(ctx, src, index_lod, *dst, indexed_src);
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> {
} }
lstm_value.prev_state_value = nullptr; lstm_value.prev_state_value = nullptr;
Tensor ordered_c0; Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (cell_t0) { if (cell_t0) {
// Since the batch computing for LSTM reorders the input sequence // Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized cell state also needs // according to their length. The initialized cell state also needs
...@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
// ordered_h0_g/c0_g is the reordered gradient of hidden/cell // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
// initialization. // initialization.
Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
const size_t* order = batch_gate->lod()[2].data(); framework::Vector<size_t> order(batch_gate->lod()[2]);
if (c0) { if (c0) {
ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0, ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
true); true);
......
...@@ -34,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; ...@@ -34,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx, inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Vector<size_t> index,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle; math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace()); dst->mutable_data<T>(src.dims(), ctx.GetPlace());
...@@ -109,7 +110,9 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -109,7 +110,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
} }
lstmp_value.prev_state_value = nullptr; lstmp_value.prev_state_value = nullptr;
Tensor ordered_c0; Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (cell_t0) { if (cell_t0) {
// Since the batch computing for LSTMP reorders the input sequence // Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized cell state also needs // according to their length. The initialized cell state also needs
...@@ -275,7 +278,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -275,7 +278,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
// ordered_h0_g/c0_g is the reordered gradient of hidden/cell // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
// initialization. // initialization.
Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (c0) { if (c0) {
ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0, ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
true); true);
......
...@@ -8,6 +8,7 @@ if(WITH_GPU) ...@@ -8,6 +8,7 @@ if(WITH_GPU)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
......
/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/depthwise_conv.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
// A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format.
template <typename T>
__global__ void KernelDepthwiseConv(
const int nthreads, const T* const input_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, T* const output_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / output_channels / output_height / output_width;
const int c_out = (index / output_height / output_width) % output_channels;
const int h_out = (index / output_width) % output_height;
const int w_out = index % output_width;
const int c_in = c_out / filter_multiplier;
const T* weight = filter_data + c_out * filter_height * filter_width;
T value = 0;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
value +=
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
input_data[offset];
}
}
output_data[index] = value;
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T>
__global__ void KernelDepthwiseConvInputGrad(
const int nthreads, const T* const output_grad_data,
const T* const filter_data, const int batch_size, const int output_channels,
const int output_height, const int output_width, const int input_channels,
const int input_height, const int input_width, const int filter_multiplier,
const int filter_height, const int filter_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* const input_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / input_channels / input_height / input_width;
const int c_in = (index / input_height / input_width) % input_channels;
const int h_in = (index / input_width) % input_height;
const int w_in = index % input_width;
const int c_out_start = c_in * filter_multiplier;
int h_out_start =
(h_in - filter_height + padding_height + stride_height) / stride_height;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
int h_out_end = (h_in + padding_height) / stride_height;
h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
int w_out_start =
(w_in - filter_width + padding_width + stride_width) / stride_width;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
int w_out_end = (w_in + padding_width) / stride_width;
w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
T value = 0;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
c_out++) {
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
const int filter_h = h_in + padding_height - h_out * stride_height;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
const int filter_w = w_in + padding_width - w_out * stride_width;
const int filter_offset = c_out * filter_height * filter_width +
filter_h * filter_width + filter_w;
const int output_grad_offset =
((batch * output_channels + c_out) * output_height + h_out) *
output_width +
w_out;
value +=
output_grad_data[output_grad_offset] * filter_data[filter_offset];
}
}
}
input_grad_data[index] += value;
}
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T>
__global__ void KernelDepthwiseConvFilterGrad(
const int nthreads, const T* const output_grad_data,
const T* const input_data, const int num, const int output_channels,
const int output_height, const int output_width, const int input_channels,
const int input_height, const int input_width, const int filter_multiplier,
const int filter_height, const int filter_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* const filter_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int w_out = index % output_width;
const int h_out = (index / output_width) % output_height;
const int c_out = (index / output_width / output_height) % output_channels;
const int batch = (index / output_width / output_height / output_channels);
const int c_in = c_out / filter_multiplier;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end =
-padding_height + h_out * stride_height + filter_height;
const int w_in_end = -padding_width + w_out * stride_width + filter_width;
const int in_offset =
(batch * input_channels + c_in) * input_height * input_width;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + (h_in - h_in_start) * filter_width +
(w_in - w_in_start);
paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
}
}
}
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
template <class T>
class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = filter.dims()[2];
const int ksize_width = filter.dims()[3];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
output_data);
}
};
template <typename T>
class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output_grad.dims()[1];
const int output_height = output_grad.dims()[2];
const int output_width = output_grad.dims()[3];
const int ksize_height = filter.dims()[2];
const int ksize_width = filter.dims()[3];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
input_grad_data);
}
};
template <typename T>
class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* filter_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output_grad.dims()[1];
const int output_height = output_grad.dims()[2];
const int output_width = output_grad.dims()[3];
const int ksize_height = filter_grad->dims()[2];
const int ksize_width = filter_grad->dims()[3];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, input_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
output_channels / input_channels, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
filter_grad_data);
}
};
template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
float>;
template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
double>;
template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
float>;
template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext, typename T>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad);
};
template <typename DeviceContext, typename T>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* filter_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, input2.height()); PADDLE_ENFORCE_EQ(in1_height, input2.height());
output->set_height(in1_height); output->set_height(in1_height);
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = input2.rows(); auto& in2_rows = input2.rows();
std::vector<int64_t> out_rows; std::vector<int64_t> out_rows;
out_rows.reserve(in1_rows.size() + in2_rows.size()); out_rows.reserve(in1_rows.size() + in2_rows.size());
...@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
...@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
SelectedRowsAddTensorKernel< SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.data(), out_data, in1_row_numel); in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output); auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2); auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
...@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
...@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
...@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
SelectedRowsAddToTensorKernel< SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.data(), in2_data, in1_row_numel); in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
} }
}; };
...@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
framework::SelectedRows operator()(const platform::CUDADeviceContext& context, framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input) { const framework::SelectedRows& input) {
framework::SelectedRows out; framework::SelectedRows out;
auto input_rows = input.rows(); framework::Vector<int64_t> input_rows(input.rows());
std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
...@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
MergeAddKernel< MergeAddKernel<
T, 256><<<grid1, threads, 0, T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input_data, input.rows().data(), out_data, .stream()>>>(input_data, input_rows.cuda_data(), out_data,
out.rows().data(), out.rows().size(), out.mutable_rows()->cuda_data(),
input_width); out.rows().size(), input_width);
return out; return out;
} }
}; };
...@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> { ...@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<< UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op, grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
in2_data, in1_row_numel); op, in2_data, in1_row_numel);
} }
}; };
} // namespace scatter } // namespace scatter
......
...@@ -23,8 +23,10 @@ template <typename T> ...@@ -23,8 +23,10 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> { class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Tensor& dst, bool is_src_index) { framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) {
size_t* index = index_lod.data();
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
......
...@@ -42,8 +42,10 @@ template <typename T> ...@@ -42,8 +42,10 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> { class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Tensor& dst, bool is_src_index) { framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) {
size_t* index = index_lod.cuda_data();
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2, PADDLE_ENFORCE_EQ(src_dims.size(), 2,
......
...@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor { ...@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor {
// copy the input src to the indexed rows of output dst. // copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const DeviceContext& context, const framework::Tensor& src, void operator()(const DeviceContext& context, const framework::Tensor& src,
const size_t* index, framework::Tensor& dst, framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index); bool is_src_index);
}; };
...@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor { ...@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor {
PADDLE_ENFORCE_EQ(lods[1].size(), PADDLE_ENFORCE_EQ(lods[1].size(),
static_cast<size_t>(lod_tensor.dims()[0])); static_cast<size_t>(lod_tensor.dims()[0]));
CopyMatrixRowsFunctor<DeviceContext, T> to_batch; CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, lods[1].data(), batch, true); to_batch(context, lod_tensor, lods[1], batch, true);
return; return;
} }
...@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor { ...@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor {
batch.set_lod(batch_lods); batch.set_lod(batch_lods);
CopyMatrixRowsFunctor<DeviceContext, T> to_batch; CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, seq2batch_idx, batch, true); to_batch(context, lod_tensor, batch_lods[1], batch, true);
} }
}; };
...@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor { ...@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor {
PADDLE_ENFORCE_EQ(in_lod[1].size(), PADDLE_ENFORCE_EQ(in_lod[1].size(),
static_cast<size_t>(lod_tensor.dims()[0])); static_cast<size_t>(lod_tensor.dims()[0]));
CopyMatrixRowsFunctor<DeviceContext, T> to_seq; CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
size_t* index = in_lod[1].data(); to_seq(context, batch, in_lod[1], lod_tensor, false);
to_seq(context, batch, index, lod_tensor, false);
} }
}; };
......
...@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* padding_data = padding.data<T>(); T* padding_data = padding.data<T>();
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(), padding_data, const_cast<T*>(seq_data),
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} else { } else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(), padding_data, const_cast<T*>(seq_data),
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} }
} }
}; };
...@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* seq_data = seq.data<T>(); T* seq_data = seq.data<T>();
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(), const_cast<T*>(padding_data), seq_data,
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} else { } else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(), const_cast<T*>(padding_data), seq_data,
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} }
} }
}; };
......
...@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> { ...@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
dim3 grid(num_seq, 1); dim3 grid(num_seq, 1);
auto stream = context.stream(); auto stream = context.stream();
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>( KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
in_data, starts.data(), out_data, max_index, num_seq, dim); in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
} }
}; };
......
...@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<< SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data, abs_offset_lod[level].data(), scales, seq_width); seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
enum MiningType { kNone = 0, kMaxNegative, kHardExample };
template <typename T>
bool SortScoreDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}
inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
const float match_dist,
const float neg_dist_threshold) {
if (mining_type == MiningType::kMaxNegative) {
return match_idx == -1 && match_dist < neg_dist_threshold;
} else if (mining_type == MiningType::kHardExample) {
return true;
} else {
return false;
}
}
inline MiningType GetMiningType(std::string str) {
if (str == "max_negative") {
return MiningType::kMaxNegative;
} else if (str == "hard_example") {
return MiningType::kHardExample;
} else {
return MiningType::kNone;
}
}
template <typename DeviceContext, typename T>
class MineHardExamplesKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
T neg_dist_threshold =
static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
int sample_size = ctx.Attr<int>("sample_size");
MiningType mining_type =
GetMiningType(ctx.Attr<std::string>("mining_type"));
auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
auto out_match_indices =
ctx.Output<framework::Tensor>("UpdatedMatchIndices");
framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
int batch_size = in_matched_indices->dims()[0];
int prior_num = in_matched_indices->dims()[1];
auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
auto match_indices_et =
framework::EigenMatrix<int>::From(*out_match_indices);
auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
const T* cls_loss = in_cls_loss->data<T>();
const T* loc_loss = nullptr;
if (in_loc_loss) {
loc_loss = in_loc_loss->data<T>();
}
std::vector<std::vector<int>> all_neg_indices;
std::vector<size_t> batch_starts = {0};
for (int n = 0; n < batch_size; ++n) {
std::vector<std::pair<T, size_t>> loss_idx;
int neg_sel = 0;
for (int m = 0; m < prior_num; ++m) {
if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
neg_dist_threshold)) {
T loss = cls_loss[n * prior_num + m];
if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
}
loss_idx.push_back(std::make_pair(loss, m));
++neg_sel;
}
}
if (mining_type == MiningType::kMaxNegative) {
int num_pos = 0;
for (int m = 0; m < prior_num; ++m) {
if (match_indices(n, m) != -1) ++num_pos;
}
neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
} else if (mining_type == MiningType::kHardExample) {
neg_sel = std::min(sample_size, neg_sel);
}
std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
std::set<int> sel_indices;
std::vector<int> neg_indices;
std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
std::inserter(sel_indices, sel_indices.begin()),
[](std::pair<T, size_t>& l) -> int {
return static_cast<int>(l.second);
});
if (mining_type == MiningType::kHardExample) {
for (int m = 0; m < prior_num; ++m) {
if (match_indices(n, m) > -1) {
if (sel_indices.find(m) == sel_indices.end()) {
match_indices_et(n, m) = -1;
}
} else {
if (sel_indices.find(m) != sel_indices.end()) {
neg_indices.push_back(m);
}
}
}
} else {
neg_indices.resize(sel_indices.size());
std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
}
all_neg_indices.push_back(neg_indices);
batch_starts.push_back(batch_starts.back() + neg_indices.size());
}
framework::LoD out_neg_indices_lod;
out_neg_indices_lod.emplace_back(batch_starts);
int neg_offset = 0;
auto neg_data = out_neg_indices->mutable_data<int>(
framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
ctx.GetPlace());
for (auto neg_indices : all_neg_indices) {
std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
neg_offset += neg_indices.size();
}
out_neg_indices->set_lod(out_neg_indices_lod);
return;
}
};
class MineHardExamplesOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
"Input(ClsLoss) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("MatchIndices"),
"Input(MatchIndices) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("MatchDist"),
"Input(MatchDist) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("NegIndices"),
"Output(NegIndices) of MineHardExamplesOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
"Output(UpdatedMatchIndices) of MineHardExamplesOp should "
"not be null.");
auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
auto idx_dims = ctx->GetInputDim("MatchIndices");
auto dis_dims = ctx->GetInputDim("MatchDist");
PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
"The shape of ClsLoss is [N, Np].");
PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
"The shape of MatchIndices is [N, Np].");
PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
"The shape of MatchDist is [N, Np].");
if (ctx->HasInput("LocLoss")) {
auto loc_loss_dims = ctx->GetInputDim("LocLoss");
PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
"The shape of LocLoss is [N, Np].");
PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
"Batch size of ClsLoss and LocLoss must be the same.");
PADDLE_ENFORCE_EQ(
cls_loss_dims[1], loc_loss_dims[1],
"Prior box number of ClsLoss and LocLoss must be the same.");
}
PADDLE_ENFORCE_EQ(
cls_loss_dims[0], idx_dims[0],
"Batch size of ClsLoss and MatchIndices must be the same.");
PADDLE_ENFORCE_EQ(
cls_loss_dims[1], idx_dims[1],
"Prior box number of ClsLoss and MatchIndices must be the same.");
PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
"Batch size of ClsLoss and MatchDist must be the same.");
PADDLE_ENFORCE_EQ(
cls_loss_dims[1], idx_dims[1],
"Prior box number of ClsLoss and MatchDist must be the same.");
auto mining_type =
GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
"mining_type must be hard_example or max_negative");
if (mining_type == MiningType::kMaxNegative) {
auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
PADDLE_ENFORCE_GT(
neg_pos_ratio, 0.0f,
"neg_pos_ratio must greater than zero in max_negative mode");
PADDLE_ENFORCE_GT(
neg_dist_threshold, 0.0f,
"neg_dist_threshold must greater than zero in max_negative mode");
} else if (mining_type == MiningType::kHardExample) {
auto sample_size = ctx->Attrs().Get<int>("sample_size");
PADDLE_ENFORCE_GT(
sample_size, 0,
"sample_size must greater than zero in hard_example mode");
}
ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
ctx.device_context());
}
};
class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"ClsLoss",
"(Tensor, default Tensor<float>), The classification loss with shape "
"[N, Np], N is the batch size and Np is the number of prior box.");
AddInput("LocLoss",
"(Tensor, optional, default Tensor<float>), The localization loss "
"with shape [N, Np], N is the batch size and Np is the number of "
"prior box.")
.AsDispensable();
AddInput("MatchIndices",
"(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
"the batch size and Np is the number of prior box. "
"MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
"instance does not match any entity, otherwise means it is "
"matched to row.");
AddInput("MatchDist",
"(Tensor, default Tensor<float>) Matched indices with shape [N, "
"Np], N is the batch size and Np is the number of prior box.");
AddAttr<float>("neg_pos_ratio",
"(float) The ratio of the negative box to the positive "
"box. Use only when mining_type is max_negative.")
.SetDefault(1.0);
AddAttr<float>("neg_dist_threshold",
"(float) The negative overlap upper bound for the unmatched "
"predictions. Use only when mining_type is max_negative.")
.SetDefault(0.5);
AddAttr<int>("sample_size",
"(float) The max sample size of negative box. Use only when "
"mining_type is hard_example.")
.SetDefault(0);
AddAttr<std::string>("mining_type",
"(float) The mining algorithm name, the value is "
"hard_example or max_negative.")
.SetDefault("max_negative")
.InEnum({"hard_example", "max_negative"});
AddOutput(
"NegIndices",
"(LoDTensor<int>) The output of negative example indices. a LoDTensor "
"with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
"and each element is the prior box index. "
"For example, the batch size is 2, the lod is [[0, 1, 2]], "
"the sample 0's box 1(MatchIndices[0][1]) is selected, "
"and sample 1's box 0 is selected. The output NegIndices is "
"[[1], [0]].");
AddOutput("UpdatedMatchIndices",
"(Tensor<int>) The output of updated MatchIndices, a tensor with "
"shape [N, Np]. Only update when mining_type is "
"hard_example. The input MatchIndices elements will be update to "
"-1 when it is not in the candidate high loss list of negative "
"examples.");
AddComment(R"DOC(
Mine hard examples Operator.
This operator implements hard example mining to select a subset of negative box indices.
For each image, selects the box with highest losses. subject to the condition that the
box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative.
The selected number is min(sample_size, max_negative_box_number) when mining_type is
hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number)
when mining_type is max_negative, where the max_negative_box_number is the count of
MatchIndices elements with value -1.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
ops::MineHardExamplesOpMaker);
REGISTER_OP_CPU_KERNEL(
mine_hard_examples,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
constexpr int64_t kOutputDim = 6;
constexpr int64_t kBBoxSize = 4;
class MultiClassNMSOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
"Input(BBoxes) of MultiClassNMS should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scores"),
"Input(Scores) of MultiClassNMS should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MultiClassNMS should not be null.");
auto box_dims = ctx->GetInputDim("BBoxes");
auto score_dims = ctx->GetInputDim("Scores");
PADDLE_ENFORCE_EQ(box_dims.size(), 2,
"The rank of Input(BBoxes) must be 2.");
PADDLE_ENFORCE_EQ(score_dims.size(), 3,
"The rank of Input(Scores) must be 3.");
PADDLE_ENFORCE_EQ(box_dims[1], 4,
"The 2nd dimension of Input(BBoxes) must be 4, "
"represents the layout of coordinate "
"[xmin, ymin, xmax, ymax]");
PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
"The 1st dimensiong of Input(BBoxes) must be equal to "
"3rd dimension of Input(Scores), which represents the "
"predicted bboxes.");
// Here the box_dims[0] is not the real dimension of output.
// It will be rewritten in the computing kernel.
ctx->SetOutputDim("Out", {box_dims[0], 6});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("Scores")->type()),
ctx.device_context());
}
};
template <class T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}
template <class T>
static inline void GetMaxScoreIndex(
const std::vector<T>& scores, const T threshold, int top_k,
std::vector<std::pair<T, int>>* sorted_indices) {
for (size_t i = 0; i < scores.size(); ++i) {
if (scores[i] > threshold) {
sorted_indices->push_back(std::make_pair(scores[i], i));
}
}
// Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
SortScorePairDescend<int>);
// Keep top_k scores if needed.
if (top_k > -1 && top_k < sorted_indices->size()) {
sorted_indices->resize(top_k);
}
}
template <class T>
static inline T BBoxArea(const T* box, const bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static inline T JaccardOverlap(const T* box1, const T* box2,
const bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <typename T>
class MultiClassNMSKernel : public framework::OpKernel<T> {
public:
void NMSFast(const Tensor& bbox, const Tensor& scores,
const T score_threshold, const T nms_threshold, const T eta,
const int64_t top_k, std::vector<int>* selected_indices) const {
// The total boxes for each instance.
int64_t num_boxes = bbox.dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox.dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices;
GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
selected_indices->clear();
T adaptive_threshold = nms_threshold;
const T* bbox_data = bbox.data<T>();
while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second;
bool keep = true;
for (int k = 0; k < selected_indices->size(); ++k) {
if (keep) {
const int kept_idx = (*selected_indices)[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, true);
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
selected_indices->push_back(idx);
}
sorted_indices.erase(sorted_indices.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
void MultiClassNMS(const framework::ExecutionContext& ctx,
const Tensor& scores, const Tensor& bboxes,
std::map<int, std::vector<int>>& indices,
int& num_nmsed_out) const {
int64_t background_label = ctx.Attr<int>("background_label");
int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
int64_t class_num = scores.dims()[0];
int64_t predict_dim = scores.dims()[1];
int num_det = 0;
for (int64_t c = 0; c < class_num; ++c) {
if (c == background_label) continue;
Tensor score = scores.Slice(c, c + 1);
NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
&(indices[c]));
num_det += indices[c].size();
}
num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (const auto& it : indices) {
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& label_indices = it.second;
for (int j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j];
PADDLE_ENFORCE_LT(idx, predict_dim);
score_index_pairs.push_back(
std::make_pair(sdata[idx], std::make_pair(label, idx)));
}
}
// Keep top k results per image.
std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
SortScorePairDescend<std::pair<int, int>>);
score_index_pairs.resize(keep_top_k);
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
for (int j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx);
}
new_indices.swap(indices);
num_nmsed_out = keep_top_k;
}
}
void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
std::map<int, std::vector<int>>& selected_indices,
Tensor* outs) const {
int predict_dim = scores.dims()[1];
auto* scores_data = scores.data<T>();
auto* bboxes_data = bboxes.data<T>();
auto* odata = outs->data<T>();
int count = 0;
for (const auto& it : selected_indices) {
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& indices = it.second;
for (int j = 0; j < indices.size(); ++j) {
int idx = indices[j];
const T* bdata = bboxes_data + idx * kBBoxSize;
odata[count * kOutputDim] = label; // label
odata[count * kOutputDim + 1] = sdata[idx]; // score
// xmin, ymin, xmax, ymax
std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
count++;
}
}
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* boxes = ctx.Input<Tensor>("BBoxes");
auto* scores = ctx.Input<Tensor>("Scores");
auto* outs = ctx.Output<LoDTensor>("Out");
auto score_dims = scores->dims();
int64_t batch_size = score_dims[0];
int64_t class_num = score_dims[1];
int64_t predict_dim = score_dims[2];
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0};
for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
std::map<int, std::vector<int>> indices;
int num_nmsed_out = 0;
MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
all_indices.push_back(indices);
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
int num_kept = batch_starts.back();
if (num_kept == 0) {
T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
od[0] = -1;
} else {
outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
int64_t s = batch_starts[i];
int64_t e = batch_starts[i + 1];
if (e > s) {
Tensor out = outs->Slice(s, e);
MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
}
}
}
framework::LoD lod;
lod.emplace_back(batch_starts);
outs->set_lod(lod);
}
};
class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("BBoxes",
"(Tensor) A 2-D Tensor with shape [M, 4] represents the "
"predicted locations of M bounding bboxes. Each bounding box "
"has four coordinate values and the layout is "
"[xmin, ymin, xmax, ymax].");
AddInput("Scores",
"(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
"predicted confidence predictions. N is the batch size, C is the "
"class number, M is number of bounding boxes. For each category "
"there are total M scores which corresponding M bounding boxes. "
" Please note, M is equal to the 1st dimension of BBoxes. ");
AddAttr<int>(
"background_label",
"(int64_t, defalut: 0) "
"The index of background label, the background label will be ignored. "
"If set to -1, then all categories will be considered.")
.SetDefault(0);
AddAttr<float>("score_threshold",
"(float) "
"Threshold to filter out bounding boxes with low "
"confidence score. If not provided, consider all boxes.");
AddAttr<int>("nms_top_k",
"(int64_t) "
"Maximum number of detections to be kept according to the "
"confidences aftern the filtering detections based on "
"score_threshold");
AddAttr<float>("nms_threshold",
"(float, defalut: 0.3) "
"The threshold to be used in NMS.")
.SetDefault(0.3);
AddAttr<float>("nms_eta",
"(float) "
"The parameter for adaptive NMS.")
.SetDefault(1.0);
AddAttr<int>("keep_top_k",
"(int64_t) "
"Number of total bboxes to be kept per image after NMS "
"step. -1 means keeping all bboxes after NMS step.");
AddOutput("Out",
"(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
"detections. Each row has 6 values: "
"[label, confidence, xmin, ymin, xmax, ymax], No is the total "
"number of detections in this mini-batch. For each instance, "
"the offsets in first dimension are called LoD, the number of "
"offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
"no detected bbox.");
AddComment(R"DOC(
This operator is to do multi-class non maximum suppression (NMS) on a batched
of boxes and scores.
In the NMS step, this operator greedily selects a subset of detection bounding
boxes that have high scores larger than score_threshold, if providing this
threshold, then selects the largest nms_top_k confidences scores if nms_top_k
is larger than -1. Then this operator pruns away boxes that have high IOU
(intersection over union) overlap with already selected boxes by adaptive
threshold NMS based on parameters of nms_threshold and nms_eta.
Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1.
This operator support multi-class and batched inputs. It applying NMS
independently for each class. The outputs is a 2-D LoDTenosr, for each
image, the offsets in first dimension of LoDTensor are called LoD, the number
of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
means there is no detected bbox for this image. If there is no detected boxes
for all images, all the elements in LoD are 0, and the Out only contains one
value which is -1.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
ops::MultiClassNMSOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
ops::MultiClassNMSKernel<double>);
...@@ -12,187 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,187 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream> #include <ostream>
#include <thread>
#include <unistd.h> #include "paddle/framework/data_type.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/grpc_server.h" #include <future>
#include "paddle/operators/detail/sendrecvop_utils.h" #include "paddle/operators/detail/grpc_client.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
var->GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW(
"VariableMessage type %d is not in "
"[LoDTensor, SelectedRows]",
var_type);
}
}
class RecvOp : public framework::OperatorBase { class RecvOp : public framework::OperatorBase {
public: public:
RecvOp(const std::string &type, const framework::VariableNameMap &inputs, RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap &attrs) const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) { : OperatorBase(type, inputs, outputs, attrs) {}
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint"); void Run(const framework::Scope& scope,
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); const platform::Place& place) const override {
server_thread_.reset(new std::thread(RunServer, rpc_service_)); auto outs = Outputs("Out");
} std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
void Stop() override { auto& ctx = *pool.Get(place);
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE; for (size_t i = 0; i < outs.size(); i++) {
rpc_service_->Push(term_msg); VLOG(3) << "getting " << outs[i];
rpc_service_->ShutDown(); client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
server_thread_->join();
}
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
} }
return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++); PADDLE_ENFORCE(client_.Wait());
} }
void Run(const framework::Scope &scope, private:
const platform::Place &dev_place) const override { mutable detail::RPCClient client_;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with laze mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit";
exit_flag = true;
break;
} else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << grad_var_name;
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
break;
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
}; };
class RecvOpMaker : public framework::OpProtoAndCheckerMaker { class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Recv operator Recv operator
This operator will recieve tensor from send_op This operator can get variables from server side.
)DOC"); )DOC");
AddAttr<std::string>("endpoint", AddAttr<std::vector<std::string>>("epmap",
"(string, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
"IP address to listen on.") "Server endpoints in the order of input "
.SetDefault("127.0.0.1:6164") "variables for mapping")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(
kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({}); .SetDefault({});
AddAttr<int>("Fanin", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
} }
}; };
......
...@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T> ...@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1]; int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1; int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0]; int future_context = Filter->dims()[0];
size_t *idx = batch_indices.data(); size_t *idx = batch_indices.cuda_data();
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
if (future_context <= 32) { if (future_context <= 32) {
...@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T> ...@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1]; int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1; int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0]; int future_context = Filter->dims()[0];
size_t *idx = batch_indices.data(); size_t *idx = batch_indices.cuda_data();
auto &device_ctx = context.cuda_device_context(); auto &device_ctx = context.cuda_device_context();
math::SetConstant<platform::CUDADeviceContext, T> zero; math::SetConstant<platform::CUDADeviceContext, T> zero;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveCombineOp : public framework::OperatorBase {
public:
SaveCombineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite");
bool is_present = FileExists(filename);
if (is_present && !overwrite) {
PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
filename, overwrite);
}
MkDirRecursively(DirName(filename).c_str());
std::ofstream fout(filename);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto inp_var_names = Inputs("X");
PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
"The number of input variables should be greater than 0");
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < inp_var_names.size(); i++) {
auto *var = scope.FindVar(inp_var_names[i]);
PADDLE_ENFORCE(var != nullptr,
"Cannot find variable %s for save_combine_op",
inp_var_names[i]);
PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
"SaveCombineOp only supports LoDTensor, %s has wrong type",
inp_var_names[i]);
auto &tensor = var->Get<framework::LoDTensor>();
// Serialize tensor
framework::SerializeToStream(fout, tensor, dev_ctx);
}
fout.close();
}
};
class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"X",
"(vector) Input LoDTensors that need to be saved together in a file.")
.AsDuplicable();
AddComment(R"DOC(
SaveCombine operator
This operator will serialize and write a list of input LoDTensor variables
to a file on disk.
)DOC");
AddAttr<bool>("overwrite",
"(boolean, default true)"
"Overwrite the output file if it exists.")
.SetDefault(true);
AddAttr<std::string>(
"file_path",
"(string)"
"The \"file_path\" where the LoDTensor variables will be saved.")
.AddCustomChecker(
[](const std::string &path) { return !path.empty(); });
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
ops::SaveCombineOpProtoMaker);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"
USE_NO_KERNEL_OP(save_combine);
USE_NO_KERNEL_OP(load_combine);
int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
std::string var_name,
paddle::platform::CPUPlace& place,
paddle::framework::Scope& scope,
paddle::framework::LoD& expect_lod) {
auto var = scope.Var(var_name);
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({x, y});
expect_lod.resize(1);
for (size_t i = 0; i < lod_info.size(); i++) {
expect_lod[0].push_back(lod_info[i]);
}
tensor->set_lod(expect_lod);
int* expect = tensor->mutable_data<int>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<int>(i);
}
return expect;
}
paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
const std::string out_var_name, paddle::framework::Scope& scope) {
auto load_var = scope.Var(out_var_name);
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
return target;
}
int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
paddle::framework::Scope& scope,
paddle::framework::LoD& actual_lod) {
int* actual = target->data<int>();
actual_lod = target->lod();
return actual;
}
void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
paddle::framework::LoD actual_lod, const int& numel) {
for (int64_t i = 0; i < numel; ++i) {
EXPECT_EQ(expect[i], actual[i]);
}
EXPECT_EQ(expect_lod.size(), actual_lod.size());
for (size_t i = 0; i < expect_lod.size(); ++i) {
for (size_t j = 0; j < expect_lod[i].size(); ++j) {
EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
}
}
}
// Here, we create 4 LoDTensors and use save_combine_op to first save these
// in a single file. Then, we use load_combine_op to load these sequentially
TEST(SaveLoadCombineOp, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
std::vector<int> lod1 = {0, 1, 2, 3, 10};
int numel1 = 100;
paddle::framework::LoD expect_lod1;
int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
expect_lod1);
std::vector<int> lod2 = {0, 2, 5, 10};
int numel2 = 200;
paddle::framework::LoD expect_lod2;
int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
expect_lod2);
std::vector<int> lod3 = {0, 2, 3, 20};
int numel3 = 4000;
paddle::framework::LoD expect_lod3;
int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
scope, expect_lod3);
std::vector<int> lod4 = {0, 1, 20};
int numel4 = 1000;
paddle::framework::LoD expect_lod4;
int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
expect_lod4);
// Set attributes
std::string filename = "check_tensor.ls";
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string(filename)});
// Run the save_combine_op
auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
"save_combine",
{{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
save_combine_op->Run(scope, place);
// Set up output vars
auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
// Run the load_combine_op
auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
"load_combine", {},
{{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
load_combine_op->Run(scope, place);
paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
}
// Test with original SaveLoadTest
TEST(SaveLoadTestWithCombineOp, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({3, 10});
paddle::framework::LoD expect_lod;
expect_lod.resize(1);
expect_lod[0].push_back(0);
expect_lod[0].push_back(1);
expect_lod[0].push_back(2);
expect_lod[0].push_back(3);
tensor->set_lod(expect_lod);
int* expect = tensor->mutable_data<int>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<int>(i);
}
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("check_t.save")});
auto save_op = paddle::framework::OpRegistry::CreateOp(
"save_combine", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, place);
auto load_var = scope.Var("out_var");
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp(
"load_combine", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, place);
int* actual = target->data<int>();
for (int64_t i = 0; i < tensor->numel(); ++i) {
EXPECT_EQ(expect[i], actual[i]);
}
auto& actual_lod = target->lod();
EXPECT_EQ(expect_lod.size(), actual_lod.size());
for (size_t i = 0; i < expect_lod.size(); ++i) {
for (size_t j = 0; j < expect_lod[i].size(); ++j) {
EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
}
}
}
...@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) { ...@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) {
auto var = scope.Var("test_var"); auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({10, 10}); tensor->Resize({3, 10});
paddle::framework::LoD expect_lod; paddle::framework::LoD expect_lod;
expect_lod.resize(1); expect_lod.resize(1);
expect_lod[0].push_back(0); expect_lod[0].push_back(0);
......
...@@ -42,28 +42,34 @@ class SendOp : public framework::OperatorBase { ...@@ -42,28 +42,34 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} }
PADDLE_ENFORCE(client_.Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep; VLOG(3) << "batch barrier, ep: " << ep;
client_.AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(client_.Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (size_t i = 0; i < outs.size(); i++) { if (outs.size() > 0) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; for (size_t i = 0; i < outs.size(); i++) {
client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
PADDLE_ENFORCE(rpc_client->Wait());
} }
PADDLE_ENFORCE(client_.Wait());
} }
private:
mutable detail::RPCClient client_;
}; };
class SendOpMaker : public framework::OpProtoAndCheckerMaker { class SendOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -73,11 +79,16 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -73,11 +79,16 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
AddOutput("Out", "(Tensor) Output tensor to be received from server") AddOutput("Out", "(Tensor) Output tensor to be received from server")
.AsDuplicable(); .AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
This operator will send tensor to recv_op at the parameter server. This operator will send tensor to recv_op at the parameter server.
)DOC"); )DOC");
// TODO(typhoonzero): remove this attr generate de-duplicated vector from
// epmap when initializing.
AddAttr<std::vector<std::string>>("endpoints", AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.") "Server endpoints to send variables to.")
......
...@@ -25,7 +25,7 @@ limitations under the License. */ ...@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(recv); USE_NO_KERNEL_OP(listen_and_serv);
USE_OP(sum); USE_OP(sum);
namespace f = paddle::framework; namespace f = paddle::framework;
...@@ -33,7 +33,7 @@ namespace p = paddle::platform; ...@@ -33,7 +33,7 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
// global for simplicity. // global for simplicity.
std::unique_ptr<f::OperatorBase> recv_op; std::unique_ptr<f::OperatorBase> listen_and_serv_op;
void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place); p::CPUDeviceContext ctx(place);
...@@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) { ...@@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) {
InitTensorsInScope(scope, place); InitTensorsInScope(scope, place);
} }
// sub program run in recv_op, for simple test we use sum // sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program; f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
// X for server side tensors, RX for received tensers, must be of same shape. // X for server side tensors, RX for received tensers, must be of same shape.
...@@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) { ...@@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) {
attrs.insert({"ParamList", std::vector<std::string>({"Out"})}); attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})}); attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", block}); attrs.insert({"OptimizeBlock", block});
recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs); listen_and_serv_op =
recv_op->Run(scope, place); f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
listen_and_serv_op->Run(scope, place);
} }
TEST(SendRecvOp, CPUDense) { TEST(SendRecvOp, CPUDense) {
...@@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) { ...@@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) {
for (int64_t i = 0; i < target->numel(); ++i) { for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]); EXPECT_EQ(expected[i] * 2, actual[i]);
} }
recv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
recv_op.reset(nullptr); listen_and_serv_op.reset(nullptr);
} }
TEST(SendRecvOp, CPUSparse) { TEST(SendRecvOp, CPUSparse) {
...@@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) {
EXPECT_EQ(expect_value->mutable_data<float>(place)[i], EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
actual->mutable_data<float>(place)[i]); actual->mutable_data<float>(place)[i]);
} }
recv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
recv_op.reset(); listen_and_serv_op.reset();
} }
...@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> { ...@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>( PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
// Set LoD for output // Set LoD for output
thrust::host_vector<size_t> out_lod0 = dev_out_lod; std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(out_lod0); out_lod.push_back(out_lod0);
out->set_lod(out_lod); out->set_lod(out_lod);
......
...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value(); auto& in_value = grad->value();
auto& in_rows = grad->rows(); framework::Vector<int64_t> in_rows(grad->rows());
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
...@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
dim3 grid(1, in_rows.size()); dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel< SparseSGDFunctorKernel<
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>( T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
in_data, in_rows.data(), learning_rate->data<T>(), out_data, in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
in_row_numel); in_row_numel);
} else { } else {
......
...@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> {
} }
} }
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now"); std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
auto &rows = in_sel0.rows();
#ifdef PADDLE_WITH_CUDA
std::vector<int64_t> rows_in_cpu;
rows_in_cpu.reserve(rows.size());
for (auto item : rows) {
rows_in_cpu.push_back(item);
}
in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
#else
in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
#endif
in0->mutable_value()->ShareDataWith(in_sel0.value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows & {
if (i == 0 && in0) {
return *in0.get();
} else {
return in_vars[i]->Get<SelectedRows>();
}
};
auto *out = context.Output<SelectedRows>("Out"); auto *out = context.Output<SelectedRows>("Out");
out->mutable_rows()->clear(); out->mutable_rows()->clear();
auto *out_value = out->mutable_value(); auto *out_value = out->mutable_value();
...@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape // Runtime InferShape
size_t first_dim = 0; size_t first_dim = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
first_dim += in_vars[i]->Get<SelectedRows>().rows().size(); auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
} }
auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims(); auto in_dim =
auto in_dim_vec = framework::vectorize(in_dim); framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim_vec[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
PADDLE_ENFORCE_EQ(out->height(), auto &sel_row = get_selected_row(i);
in_vars[i]->Get<SelectedRows>().height());
functor(context.template device_context<DeviceContext>(), PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
in_vars[i]->Get<SelectedRows>(), offset, out); functor(context.template device_context<DeviceContext>(), sel_row,
offset += in_vars[i]->Get<SelectedRows>().value().numel(); offset, out);
offset += sel_row.value().numel();
} }
} else if (out_var->IsType<framework::LoDTensorArray>()) { } else if (out_var->IsType<framework::LoDTensorArray>()) {
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>(); auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
......
...@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase { ...@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
auto step_scopes = auto step_scopes =
scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>(); scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
"Condition of while op must in CPU memory.");
while (cond.data<bool>()[0]) { while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope(); auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope); step_scopes->push_back(&current_scope);
...@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
...@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase {
sum_op->Run(cur_scope, dev_place); sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
} }
dev_ctx.Wait();
const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
} }
} }
}; };
......
...@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}; };
break; break;
default: default:
sorted_domain = "event end time"; sorted_domain = "event first end time";
} }
std::vector<std::vector<EventItem>> events_table; std::vector<std::vector<EventItem>> events_table;
......
...@@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) { ...@@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) {
.def( .def(
"__init__", "__init__",
[](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
#ifndef PADDLE_WITH_CUDA LoD new_lod;
new (&instance) LoDTensor(lod); new_lod.reserve(lod.size());
#else std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
LoD new_lod; new (&instance) LoDTensor(new_lod);
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
new (&instance) LoDTensor(new_lod);
#endif
}) })
.def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
.def("set_lod", .def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
#ifndef PADDLE_WITH_CUDA
self.set_lod(lod);
#else
LoD new_lod; LoD new_lod;
new_lod.reserve(lod.size()); new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
self.set_lod(new_lod); self.set_lod(new_lod);
#endif
}) })
.def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> { .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
#ifndef PADDLE_WITH_CUDA auto lod = self.lod();
return self.lod(); std::vector<std::vector<size_t>> new_lod;
#else new_lod.reserve(lod.size());
auto lod = self.lod(); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
std::vector<std::vector<size_t>> new_lod; return new_lod;
new_lod.reserve(lod.size());
std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
[](Vector<size_t> item) ->
std::vector<size_t> {
std::vector<size_t> v;
v.reserve(item.size());
std::copy(item.begin(), item.end(), std::back_inserter(v));
return v;
});
return new_lod;
#endif
}); });
py::class_<SelectedRows>(m, "SelectedRows") py::class_<SelectedRows>(m, "SelectedRows")
......
...@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" ...@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| ------ | -------- | ----------- | | ------ | -------- | ----------- |
| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. | | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | ON | Build unit tests binaries. | | `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
......
...@@ -32,7 +32,7 @@ function cmake_gen() { ...@@ -32,7 +32,7 @@ function cmake_gen() {
cat <<EOF cat <<EOF
======================================== ========================================
Configuring cmake in /paddle/build ... Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DOC=OFF -DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
...@@ -40,6 +40,7 @@ function cmake_gen() { ...@@ -40,6 +40,7 @@ function cmake_gen() {
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON} -DWITH_GOLANG=${WITH_GOLANG:-ON}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON -DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF} -DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_PYTHON=${WITH_PYTHON:-ON}
...@@ -54,7 +55,7 @@ EOF ...@@ -54,7 +55,7 @@ EOF
# docker environment is fully controlled by this script. # docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option. # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DOC=OFF \ -DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
...@@ -62,6 +63,7 @@ EOF ...@@ -62,6 +63,7 @@ EOF
-DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \ -DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \ -DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \
...@@ -77,6 +79,7 @@ function run_build() { ...@@ -77,6 +79,7 @@ function run_build() {
Building in /paddle/build ... Building in /paddle/build ...
============================================ ============================================
EOF EOF
make clean
make -j `nproc` make -j `nproc`
} }
......
#!/bin/bash
set -e
# the number of process to run tests
NUM_PROC=6
# calculate and set the memory usage for each process
MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
# get the CUDA device count
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
for (( i = 0; i < $NUM_PROC; i++ )); do
cuda_list=()
for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
s=$[i+j]
n=$[s%CUDA_DEVICE_COUNT]
if [ $j -eq 0 ]; then
cuda_list=("$n")
else
cuda_list="$cuda_list,$n"
fi
done
echo $cuda_list
# CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
# ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
done
wait
...@@ -22,12 +22,15 @@ limitations under the License. */ ...@@ -22,12 +22,15 @@ limitations under the License. */
int main(int argc, char** argv) { int main(int argc, char** argv) {
std::vector<char*> new_argv; std::vector<char*> new_argv;
std::string gflags_env; std::string gflags_env;
new_argv.push_back(argv[0]); for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
new_argv.push_back( new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
"warpctc_dir"));
#else #else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
#endif #endif
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
......
...@@ -26,6 +26,7 @@ import initializer ...@@ -26,6 +26,7 @@ import initializer
import layers import layers
import nets import nets
import optimizer import optimizer
import learning_rate_decay
import backward import backward
import regularizer import regularizer
from param_attr import ParamAttr from param_attr import ParamAttr
...@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler ...@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler
import clip import clip
from memory_optimization_transpiler import memory_optimize from memory_optimization_transpiler import memory_optimize
import profiler
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
'initializer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
'layers',
'nets',
'optimizer',
'backward',
'regularizer',
'LoDTensor',
'CPUPlace',
'CUDAPlace',
'Tensor',
'ParamAttr' 'ParamAttr'
'DataFeeder', 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
'clip', 'memory_optimize', 'profiler'
'SimpleDistributeTranspiler',
'DistributeTranspiler',
'memory_optimize',
] ]
...@@ -87,10 +77,10 @@ def __bootstrap__(): ...@@ -87,10 +77,10 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark' 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
] ]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync'] read_env_flags += ['fraction_of_gpu_memory_to_use']
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
_vartype2str_ = [
"UNK",
"LoDTensor",
"SelectedRows",
"FeedMinibatch",
"FetchList",
"StepScopes",
"LodRankTable",
"LoDTensorArray",
"PlaceList",
]
_dtype2str_ = [
"bool",
"int16",
"int32",
"int64",
"float16",
"float32",
"float64",
]
def repr_data_type(type):
return _dtype2str_[type]
def repr_tensor(proto):
return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
str(proto.dims))
reprtpl = "{ttype} {name} ({reprs})"
def repr_lodtensor(proto):
if not proto.lod_tensor: return
level = proto.lod_tensor.lod_level
reprs = repr_tensor(proto.lod_tensor.tensor)
return reprtpl.format(
ttype="LoDTensor" if level > 0 else "Tensor",
name=proto.name,
reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
def repr_selected_rows(proto):
if not proto.selected_rows: return
return reprtpl.format(
ttype="SelectedRows",
name=proto.name,
reprs=repr_tensor(proto.selected_rows))
def repr_tensor_array(proto):
if not proto.tensor_array: return
return reprtpl.format(
ttype="TensorArray",
name=proto.name,
reprs="level=%d, %s" % (proto.tensor_array.lod_level,
repr_tensor(proto.lod_tensor)))
type_handlers = [
repr_lodtensor,
repr_selected_rows,
repr_tensor_array,
]
def repr_var(vardesc):
for handler in type_handlers:
res = handler(vardesc)
if res:
return res
def pprint_program_codes(program_desc):
reprs = []
for block_idx in range(program_desc.num_blocks()):
block_desc = program_desc.block(block_idx)
block_repr = pprint_block_codes(block_desc)
reprs.append(block_repr)
return '\n'.join(reprs)
def pprint_block_codes(block_desc, show_backward=False):
def is_op_backward(op_desc):
if op_desc.type.endswith('_grad'): return True
def is_var_backward(var):
if "@GRAD" in var.parameter: return True
for arg in var.arguments:
if "@GRAD" in arg: return True
for var in op_desc.inputs:
if is_var_backward(var): return True
for var in op_desc.outputs:
if is_var_backward(var): return True
return False
def is_var_backward(var_desc):
return "@GRAD" in var_desc.name
if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string())
var_reprs = []
op_reprs = []
for var in block_desc.vars:
if not show_backward and is_var_backward(var):
continue
var_reprs.append(repr_var(var))
for op in block_desc.ops:
if not show_backward and is_op_backward(op): continue
op_reprs.append(repr_op(op))
tpl = "// block-{idx} parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
return tpl.format(
idx=block_desc.idx,
pidx=block_desc.parent_idx,
vars='\n'.join(var_reprs),
ops='\n'.join(op_reprs), )
def repr_attr(desc):
tpl = "{key}={value}"
valgetter = [
lambda attr: attr.i,
lambda attr: attr.f,
lambda attr: attr.s,
lambda attr: attr.ints,
lambda attr: attr.floats,
lambda attr: attr.strings,
lambda attr: attr.b,
lambda attr: attr.bools,
lambda attr: attr.block_idx,
lambda attr: attr.l,
]
key = desc.name
value = valgetter[desc.type](desc)
if key == "dtype":
value = repr_data_type(value)
return tpl.format(key=key, value=str(value)), (key, value)
def _repr_op_fill_constant(optype, inputs, outputs, attrs):
if optype == "fill_constant":
return "{output} = {data} [shape={shape}]".format(
output=','.join(outputs),
data=attrs['value'],
shape=str(attrs['shape']))
op_repr_handlers = [_repr_op_fill_constant, ]
def repr_op(opdesc):
optype = None
attrs = []
attr_dict = {}
is_target = None
inputs = []
outputs = []
tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
for var in opdesc.inputs:
key = var.parameter
value = args2value(var.arguments)
inputs.append("%s=%s" % (key, value))
for var in opdesc.outputs:
value = args2value(var.arguments)
outputs.append(value)
for attr in opdesc.attrs:
attr_repr, attr_pair = repr_attr(attr)
attrs.append(attr_repr)
attr_dict[attr_pair[0]] = attr_pair[1]
is_target = opdesc.is_target
for handler in op_repr_handlers:
res = handler(opdesc.type, inputs, outputs, attr_dict)
if res: return res
return tpl.format(
outputs=', '.join(outputs),
optype=opdesc.type,
inputs=', '.join(inputs),
attrs="{%s}" % ','.join(attrs),
is_target=", is_target" if is_target else "")
def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
'''
Generate a debug graph for block.
Args:
block(Block): a block.
'''
graph = GraphPreviewGenerator("some graph")
# collect parameters and args
protostr = block.desc.serialize_to_string()
desc = framework_pb2.BlockDesc.FromString(str(protostr))
def need_highlight(name):
if highlights is None: return False
for pattern in highlights:
assert type(pattern) is str
if re.match(pattern, name):
return True
return False
# draw parameters and args
vars = {}
for var in desc.vars:
shape = [str(i) for i in var.lod_tensor.tensor.dims]
if not shape:
shape = ['null']
# create var
if var.persistable:
varn = graph.add_param(
var.name, var.type, shape, highlight=need_highlight(var.name))
else:
varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
vars[var.name] = varn
def add_op_link_var(op, var, op2var=False):
for arg in var.arguments:
if arg not in vars:
# add missing variables as argument
vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
varn = vars[arg]
highlight = need_highlight(op.description) or need_highlight(
varn.description)
if op2var:
graph.add_edge(op, varn, highlight=highlight)
else:
graph.add_edge(varn, op, highlight=highlight)
for op in desc.ops:
opn = graph.add_op(op.type, highlight=need_highlight(op.type))
for var in op.inputs:
add_op_link_var(opn, var, False)
for var in op.outputs:
add_op_link_var(opn, var, True)
graph(path, show=True)
...@@ -153,11 +153,18 @@ class DistributeTranspiler: ...@@ -153,11 +153,18 @@ class DistributeTranspiler:
self.param_grad_ep_mapping[ep]["params"].append(param) self.param_grad_ep_mapping[ep]["params"].append(param)
self.param_grad_ep_mapping[ep]["grads"].append(grad) self.param_grad_ep_mapping[ep]["grads"].append(grad)
rpc_client_var = program.global_block().create_var(
name="RPC_CLIENT_VAR",
psersistable=True,
dtype='float32', # dtype and shape is not used in fact
shape=[0])
# create send_op # create send_op
send_op = program.global_block().append_op( send_op = program.global_block().append_op(
type="send", type="send",
inputs={"X": send_inputs}, inputs={"X": send_inputs},
outputs={"Out": send_outputs}, outputs={"Out": send_outputs,
"RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": eplist}) "epmap": eplist})
# step4 # step4
...@@ -471,9 +478,9 @@ class DistributeTranspiler: ...@@ -471,9 +478,9 @@ class DistributeTranspiler:
else: else:
self._append_pserver_non_opt_ops(optimize_sub_program, self._append_pserver_non_opt_ops(optimize_sub_program,
pserver_program, opt_op) pserver_program, opt_op)
# Append the recv op # Append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="recv", type="listen_and_serv",
inputs={}, inputs={},
outputs={}, outputs={},
attrs={ attrs={
......
...@@ -451,9 +451,8 @@ class Operator(object): ...@@ -451,9 +451,8 @@ class Operator(object):
if not given == need: if not given == need:
raise ValueError(("Incorrect setting for output(s) of " raise ValueError(("Incorrect setting for output(s) of "
"operator \"%s\". Need: [%s] Given: [%s]") % "operator \"%s\". Need: [%s] Given: [%s]") %
(type, ", ".join(str(e) (type, ", ".join(str(e) for e in need),
for e in need), ", ".join( ", ".join(str(e) for e in given)))
str(e) for e in given)))
for out_proto in proto.outputs: for out_proto in proto.outputs:
out_args = outputs[out_proto.name] out_args = outputs[out_proto.name]
...@@ -489,7 +488,8 @@ class Operator(object): ...@@ -489,7 +488,8 @@ class Operator(object):
no_kernel_op_set = { no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'feed', 'fetch', 'save', 'load', 'recurrent',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'parallel_do' 'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine'
} }
if type not in no_kernel_op_set: if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
import logging
def crepr(v):
if type(v) is str or type(v) is unicode:
return '"%s"' % v
return str(v)
class Rank(object):
def __init__(self, kind, name, priority):
'''
kind: str
name: str
priority: int
'''
self.kind = kind
self.name = name
self.priority = priority
self.nodes = []
def __str__(self):
if not self.nodes:
return ''
return '{' + 'rank={};'.format(self.kind) + \
','.join([node.name for node in self.nodes]) + '}'
class Graph(object):
rank_counter = 0
def __init__(self, title, **attrs):
self.title = title
self.attrs = attrs
self.nodes = []
self.edges = []
self.rank_groups = {}
def code(self):
return self.__str__()
def rank_group(self, kind, priority):
name = "rankgroup-%d" % Graph.rank_counter
Graph.rank_counter += 1
rank = Rank(kind, name, priority)
self.rank_groups[name] = rank
return name
def node(self, label, prefix, description="", **attrs):
node = Node(label, prefix, description, **attrs)
if 'rank' in attrs:
rank = self.rank_groups[attrs['rank']]
del attrs['rank']
rank.nodes.append(node)
self.nodes.append(node)
return node
def edge(self, source, target, **attrs):
edge = Edge(source, target, **attrs)
self.edges.append(edge)
return edge
def compile(self, dot_path):
file = open(dot_path, 'w')
file.write(self.__str__())
image_path = os.path.join(
os.path.dirname(__file__), dot_path[:-3] + "pdf")
cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
logging.warning("write block debug graph to {}".format(image_path))
return image_path
def show(self, dot_path):
image = self.compile(dot_path)
cmd = ["open", image]
subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
def _rank_repr(self):
ranks = sorted(
self.rank_groups.items(),
cmp=lambda a, b: a[1].priority > b[1].priority)
repr = []
for x in ranks:
repr.append(str(x[1]))
return '\n'.join(repr) + '\n'
def __str__(self):
reprs = [
'digraph G {',
'title = {}'.format(crepr(self.title)),
]
for attr in self.attrs:
reprs.append("{key}={value};".format(
key=attr, value=crepr(self.attrs[attr])))
reprs.append(self._rank_repr())
random.shuffle(self.nodes)
reprs += [str(node) for node in self.nodes]
for x in self.edges:
reprs.append(str(x))
reprs.append('}')
return '\n'.join(reprs)
class Node(object):
counter = 1
def __init__(self, label, prefix, description="", **attrs):
self.label = label
self.name = "%s_%d" % (prefix, Node.counter)
self.description = description
self.attrs = attrs
Node.counter += 1
def __str__(self):
reprs = '{name} [label={label} {extra} ];'.format(
name=self.name,
label=self.label,
extra=',' + ','.join("%s=%s" % (key, crepr(value))
for key, value in self.attrs.items())
if self.attrs else "")
return reprs
class Edge(object):
def __init__(self, source, target, **attrs):
'''
Link source to target.
:param source: Node
:param target: Node
:param graph: Graph
:param attrs: dic
'''
self.source = source
self.target = target
self.attrs = attrs
def __str__(self):
repr = "{source} -> {target} {extra}".format(
source=self.source.name,
target=self.target.name,
extra="" if not self.attrs else
"[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
for attr in self.attrs.items()) + "]")
return repr
class GraphPreviewGenerator(object):
'''
Generate a graph image for ONNX proto.
'''
def __init__(self, title):
# init graphviz graph
self.graph = Graph(
title,
layout="dot",
concentrate="true",
rankdir="TB", )
self.op_rank = self.graph.rank_group('same', 2)
self.param_rank = self.graph.rank_group('same', 1)
self.arg_rank = self.graph.rank_group('same', 0)
def __call__(self, path='temp.dot', show=False):
if not show:
self.graph.compile(path)
else:
self.graph.show(path)
def add_param(self, name, data_type, shape, highlight=False):
label = '\n'.join([
'<<table cellpadding="5">',
' <tr>',
' <td bgcolor="#2b787e">',
' <b>',
name,
' </b>',
' </td>',
' </tr>',
' <tr>',
' <td>',
str(data_type),
' </td>'
' </tr>',
' <tr>',
' <td>',
'[%s]' % 'x'.join(shape),
' </td>'
' </tr>',
'</table>>',
])
return self.graph.node(
label,
prefix="param",
description=name,
shape="none",
style="rounded,filled,bold",
width="1.3",
color="#148b97" if not highlight else "orange",
fontcolor="#ffffff",
fontname="Arial")
def add_op(self, opType, **kwargs):
highlight = False
if 'highlight' in kwargs:
highlight = kwargs['highlight']
del kwargs['highlight']
return self.graph.node(
"<<B>%s</B>>" % opType,
prefix="op",
description=opType,
shape="box",
style="rounded, filled, bold",
color="#303A3A" if not highlight else "orange",
fontname="Arial",
fontcolor="#ffffff",
width="1.3",
height="0.84", )
def add_arg(self, name, highlight=False):
return self.graph.node(
crepr(name),
prefix="arg",
description=name,
shape="box",
style="rounded,filled,bold",
fontname="Arial",
fontcolor="#999999",
color="#dddddd" if not highlight else "orange")
def add_edge(self, source, target, **kwargs):
highlight = False
if 'highlight' in kwargs:
highlight = kwargs['highlight']
del kwargs['highlight']
return self.graph.edge(
source,
target,
color="#00000" if not highlight else "orange",
**kwargs)
...@@ -46,6 +46,9 @@ def is_parameter(var): ...@@ -46,6 +46,9 @@ def is_parameter(var):
def is_persistable(var): def is_persistable(var):
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
return False
return var.persistable return var.persistable
...@@ -60,7 +63,12 @@ def _clone_var_in_block_(block, var): ...@@ -60,7 +63,12 @@ def _clone_var_in_block_(block, var):
persistable=True) persistable=True)
def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): def save_vars(executor,
dirname,
main_program=None,
vars=None,
predicate=None,
save_file_name=None):
""" """
Save variables to directory by executor. Save variables to directory by executor.
...@@ -69,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -69,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param main_program: program. If vars is None, then filter all variables in this :param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default default_main_program. program which fit `predicate`. Default default_main_program.
:param predicate: The Predicate describes a callable that returns a variable :param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be saved. as a bool. If it returns true, the corresponding input variable will be saved.
:param vars: variables need to be saved. If specify vars, program & predicate :param vars: variables need to be saved. If vars is specified, program & predicate
will be ignored will be ignored
:param save_file_name: The name of a single file that all vars are saved to.
If it is None, save variables to separate files.
:return: None :return: None
""" """
if vars is None: if vars is None:
...@@ -83,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -83,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
save_vars( save_vars(
executor, executor,
dirname=dirname, dirname=dirname,
vars=filter(predicate, main_program.list_vars())) vars=filter(predicate, main_program.list_vars()),
save_file_name=save_file_name)
else: else:
save_program = Program() save_program = Program()
save_block = save_program.global_block() save_block = save_program.global_block()
save_var_map = {}
for each_var in vars: for each_var in vars:
new_var = _clone_var_in_block_(save_block, each_var) new_var = _clone_var_in_block_(save_block, each_var)
if save_file_name is None:
save_block.append_op(
type='save',
inputs={'X': [new_var]},
outputs={},
attrs={'file_path': os.path.join(dirname, new_var.name)})
else:
save_var_map[new_var.name] = new_var
if save_file_name is not None:
save_var_list = []
for name in sorted(save_var_map.keys()):
save_var_list.append(save_var_map[name])
save_block.append_op( save_block.append_op(
type='save', type='save_combine',
inputs={'X': [new_var]}, inputs={'X': save_var_list},
outputs={}, outputs={},
attrs={'file_path': os.path.join(dirname, new_var.name)}) attrs={'file_path': os.path.join(dirname, save_file_name)})
executor.run(save_program) executor.run(save_program)
def save_params(executor, dirname, main_program=None): def save_params(executor, dirname, main_program=None, save_file_name=None):
""" """
Save all parameters to directory with executor. Save all parameters to directory with executor.
""" """
...@@ -106,10 +135,12 @@ def save_params(executor, dirname, main_program=None): ...@@ -106,10 +135,12 @@ def save_params(executor, dirname, main_program=None):
dirname=dirname, dirname=dirname,
main_program=main_program, main_program=main_program,
vars=None, vars=None,
predicate=is_parameter) predicate=is_parameter,
save_file_name=save_file_name)
def save_persistables(executor, dirname, main_program=None): def save_persistables(executor, dirname, main_program=None,
save_file_name=None):
""" """
Save all persistables to directory with executor. Save all persistables to directory with executor.
""" """
...@@ -118,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None): ...@@ -118,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None):
dirname=dirname, dirname=dirname,
main_program=main_program, main_program=main_program,
vars=None, vars=None,
predicate=is_persistable) predicate=is_persistable,
save_file_name=save_file_name)
def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): def load_vars(executor,
dirname,
main_program=None,
vars=None,
predicate=None,
load_file_name=None):
""" """
Load variables from directory by executor. Load variables from directory by executor.
:param executor: executor that save variable :param executor: executor that load variable
:param dirname: directory path :param dirname: directory path
:param main_program: program. If vars is None, then filter all variables in this :param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default default_main_program(). program which fit `predicate`. Default default_main_program().
:param predicate: The Predicate describes a callable that returns a variable :param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be loaded. as a bool. If it returns true, the corresponding input variable will be loaded.
:param vars: variables need to be loaded. If specify vars, program & :param vars: variables need to be loaded. If vars is specified, program &
predicate will be ignored predicate will be ignored
:param load_file_name: The name of the single file that all vars are loaded from.
If it is None, load variables from separate files.
:return: None :return: None
""" """
if vars is None: if vars is None:
...@@ -144,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -144,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
load_vars( load_vars(
executor, executor,
dirname=dirname, dirname=dirname,
vars=filter(predicate, main_program.list_vars())) vars=filter(predicate, main_program.list_vars()),
load_file_name=load_file_name)
else: else:
load_prog = Program() load_prog = Program()
load_block = load_prog.global_block() load_block = load_prog.global_block()
load_var_map = {}
for each_var in vars: for each_var in vars:
assert isinstance(each_var, Variable) assert isinstance(each_var, Variable)
new_var = _clone_var_in_block_(load_block, each_var) new_var = _clone_var_in_block_(load_block, each_var)
if load_file_name is None:
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [new_var]},
attrs={'file_path': os.path.join(dirname, new_var.name)})
else:
load_var_map[new_var.name] = new_var
if load_file_name is not None:
load_var_list = []
for name in sorted(load_var_map.keys()):
load_var_list.append(load_var_map[name])
load_block.append_op( load_block.append_op(
type='load', type='load_combine',
inputs={}, inputs={},
outputs={"Out": [new_var]}, outputs={"Out": load_var_list},
attrs={'file_path': os.path.join(dirname, new_var.name)}) attrs={'file_path': os.path.join(dirname, load_file_name)})
executor.run(load_prog) executor.run(load_prog)
def load_params(executor, dirname, main_program=None): def load_params(executor, dirname, main_program=None, load_file_name=None):
""" """
load all parameters from directory by executor. load all parameters from directory by executor.
""" """
...@@ -168,10 +225,12 @@ def load_params(executor, dirname, main_program=None): ...@@ -168,10 +225,12 @@ def load_params(executor, dirname, main_program=None):
executor, executor,
dirname=dirname, dirname=dirname,
main_program=main_program, main_program=main_program,
predicate=is_parameter) predicate=is_parameter,
load_file_name=load_file_name)
def load_persistables(executor, dirname, main_program=None): def load_persistables(executor, dirname, main_program=None,
load_file_name=None):
""" """
load all persistables from directory by executor. load all persistables from directory by executor.
""" """
...@@ -179,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None): ...@@ -179,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None):
executor, executor,
dirname=dirname, dirname=dirname,
main_program=main_program, main_program=main_program,
predicate=is_persistable) predicate=is_persistable,
load_file_name=load_file_name)
def get_inference_program(target_vars, main_program=None): def get_inference_program(target_vars, main_program=None):
...@@ -238,7 +298,8 @@ def save_inference_model(dirname, ...@@ -238,7 +298,8 @@ def save_inference_model(dirname,
feeded_var_names, feeded_var_names,
target_vars, target_vars,
executor, executor,
main_program=None): main_program=None,
save_file_name=None):
""" """
Build a model especially for inference, Build a model especially for inference,
and save it to directory by the executor. and save it to directory by the executor.
...@@ -249,6 +310,8 @@ def save_inference_model(dirname, ...@@ -249,6 +310,8 @@ def save_inference_model(dirname,
:param executor: executor that save inference model :param executor: executor that save inference model
:param main_program: original program, which will be pruned to build the inference model. :param main_program: original program, which will be pruned to build the inference model.
Default default_main_program(). Default default_main_program().
:param save_file_name: The name of a single file that all parameters are saved to.
If it is None, save parameters to separate files.
:return: None :return: None
""" """
...@@ -283,25 +346,7 @@ def save_inference_model(dirname, ...@@ -283,25 +346,7 @@ def save_inference_model(dirname,
with open(model_file_name, "wb") as f: with open(model_file_name, "wb") as f:
f.write(inference_program.desc.serialize_to_string()) f.write(inference_program.desc.serialize_to_string())
save_params(executor, dirname, main_program) save_persistables(executor, dirname, inference_program, save_file_name)
def load_persistables_if_exist(executor, dirname, main_program=None):
filenames = next(os.walk(dirname))[2]
filenames = set(filenames)
def _is_presistable_and_exist_(var):
if not is_persistable(var):
return False
else:
return var.name in filenames
load_vars(
executor,
dirname,
main_program=main_program,
vars=None,
predicate=_is_presistable_and_exist_)
def get_feed_targets_names(program): def get_feed_targets_names(program):
...@@ -322,13 +367,15 @@ def get_fetch_targets_names(program): ...@@ -322,13 +367,15 @@ def get_fetch_targets_names(program):
return fetch_targets_names return fetch_targets_names
def load_inference_model(dirname, executor): def load_inference_model(dirname, executor, load_file_name=None):
""" """
Load inference model from a directory Load inference model from a directory
:param dirname: directory path :param dirname: directory path
:param executor: executor that load inference model :param executor: executor that load inference model
:param load_file_name: The name of the single file that all parameters are loaded from.
If it is None, load parameters from separate files.
:return: [program, feed_target_names, fetch_targets] :return: [program, feed_target_names, fetch_targets]
program: program especially for inference. program: program especially for inference.
feed_target_names: Names of variables that need to feed data feed_target_names: Names of variables that need to feed data
...@@ -342,7 +389,7 @@ def load_inference_model(dirname, executor): ...@@ -342,7 +389,7 @@ def load_inference_model(dirname, executor):
program_desc_str = f.read() program_desc_str = f.read()
program = Program.parse_from_string(program_desc_str) program = Program.parse_from_string(program_desc_str)
load_persistables_if_exist(executor, dirname, program) load_persistables(executor, dirname, program, load_file_name)
feed_target_names = get_feed_targets_names(program) feed_target_names = get_feed_targets_names(program)
fetch_target_names = get_fetch_targets_names(program) fetch_target_names = get_fetch_targets_names(program)
...@@ -359,6 +406,7 @@ def get_parameter_value(para, executor): ...@@ -359,6 +406,7 @@ def get_parameter_value(para, executor):
:param executor: executor for retrieving the value :param executor: executor for retrieving the value
:param para: the given parameter :param para: the given parameter
:return: the LoDTensor for the parameter :return: the LoDTensor for the parameter
""" """
assert is_parameter(para) assert is_parameter(para)
...@@ -377,6 +425,7 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -377,6 +425,7 @@ def get_parameter_value_by_name(name, executor, program=None):
:param name: the name of the parameter :param name: the name of the parameter
:param program: the program where the variable is found :param program: the program where the variable is found
Default default_main_program(). Default default_main_program().
:return: the LoDTensor for the variable :return: the LoDTensor for the variable
""" """
if program is None: if program is None:
......
...@@ -108,7 +108,7 @@ class ListenAndServ(object): ...@@ -108,7 +108,7 @@ class ListenAndServ(object):
""" """
def __init__(self, endpoint, fan_in=1, optimizer_mode=True): def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
self.helper = LayerHelper("recv") self.helper = LayerHelper("listen_and_serv")
self.inputs = [] self.inputs = []
self.outputs = [] self.outputs = []
self.endpoint = endpoint self.endpoint = endpoint
...@@ -158,7 +158,7 @@ class ListenAndServ(object): ...@@ -158,7 +158,7 @@ class ListenAndServ(object):
param_names = [p.name for p in params] param_names = [p.name for p in params]
grad_names = [g.name for g in grads] grad_names = [g.name for g in grads]
parent_block.append_op( parent_block.append_op(
type='recv', type='listen_and_serv',
inputs={}, inputs={},
outputs={}, outputs={},
attrs={ attrs={
...@@ -196,3 +196,31 @@ def Send(endpoints, send_vars, get_vars): ...@@ -196,3 +196,31 @@ def Send(endpoints, send_vars, get_vars):
outputs={"Out": get_vars}, outputs={"Out": get_vars},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
def Recv(endpoints, get_vars):
"""
Recv layer
Args:
endpoints: comma seperated IP:PORT pairs in the order
of send_vars to send
send_vars: vars to send
get_vars: vars to get from server after send completes.
Send variables to the server side, and get vars from server
side when server have finished running server side program.
"""
assert (type(send_vars) == list)
assert (type(get_vars) == list)
epmap = endpoints.split(",")
endpoints = list(set(epmap))
helper = LayerHelper("Recv", **locals())
helper.append_op(
type="recv",
inputs={"X": get_vars},
outputs={"Out": get_vars},
attrs={"endpoints": endpoints,
"epmap": epmap})
...@@ -145,7 +145,9 @@ def monkey_patch_variable(): ...@@ -145,7 +145,9 @@ def monkey_patch_variable():
# a*b == b*a. Do not need to reverse explicitly # a*b == b*a. Do not need to reverse explicitly
("__rmul__", "elementwise_mul", False), ("__rmul__", "elementwise_mul", False),
("__div__", "elementwise_div", False), ("__div__", "elementwise_div", False),
("__rdiv__", "elementwise_div", True)): ("__rdiv__", "elementwise_div", True),
("__pow__", "elementwise_pow", False),
("__rpow__", "elementwise_pow", True)):
setattr(Variable, method_name, setattr(Variable, method_name,
_elemwise_method_creator_(method_name, op_type, reverse)) _elemwise_method_creator_(method_name, op_type, reverse))
......
...@@ -847,7 +847,35 @@ def cos_sim(X, Y, **kwargs): ...@@ -847,7 +847,35 @@ def cos_sim(X, Y, **kwargs):
return out return out
def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
"""
Computes dropout.
Drop or keep each element of `x` independently. Dropout is a regularization
technique for reducing overfitting by preventing neuron co-adaption during
training. The dropout operator randomly set (according to the given dropout
probability) the outputs of some units to zero, while others are remain
unchanged.
Args:
x(variable): The input tensor.
dropout_prob(float): Probability of setting units to zero.
is_test(bool): A flag indicating whether it is in test phrase or not.
seed(int): A Python integer used to create random seeds. If this
parameter is set to None, a random seed is used.
NOTE: If an integer seed is given, always the same output
units will be dropped. DO NOT use a fixed seed in training.
Returns:
Variable: A tensor variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
"""
helper = LayerHelper('dropout', **kwargs) helper = LayerHelper('dropout', **kwargs)
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
...@@ -856,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): ...@@ -856,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
inputs={'X': [x]}, inputs={'X': [x]},
outputs={'Out': [out], outputs={'Out': [out],
'Mask': [mask]}, 'Mask': [mask]},
attrs={'dropout_prob': dropout_prob, attrs={
'is_test': is_test, 'dropout_prob': dropout_prob,
'seed': seed}) 'is_test': is_test,
'fix_seed': seed is not None,
'seed': seed if seed is not None else 0
})
return out return out
...@@ -1200,10 +1231,17 @@ def conv2d(input, ...@@ -1200,10 +1231,17 @@ def conv2d(input,
""" """
if stride is None: if stride is None:
stride = [1, 1] stride = [1, 1]
helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype()
num_channels = input.shape[1] num_channels = input.shape[1]
l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv2d'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
if groups is None: if groups is None:
num_filter_channels = num_channels num_filter_channels = num_channels
else: else:
...@@ -1236,7 +1274,7 @@ def conv2d(input, ...@@ -1236,7 +1274,7 @@ def conv2d(input,
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type='conv2d', type=l_type,
inputs={ inputs={
'Input': input, 'Input': input,
'Filter': filter_param, 'Filter': filter_param,
...@@ -1447,7 +1485,9 @@ def batch_norm(input, ...@@ -1447,7 +1485,9 @@ def batch_norm(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_layout='NCHW', data_layout='NCHW',
name=None): name=None,
moving_mean_name=None,
moving_variance_name=None):
""" """
This function helps create an operator to implement This function helps create an operator to implement
the BatchNorm layer using the configurations from the input parameters. the BatchNorm layer using the configurations from the input parameters.
...@@ -1477,6 +1517,7 @@ def batch_norm(input, ...@@ -1477,6 +1517,7 @@ def batch_norm(input,
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_global_variable( mean = helper.create_global_variable(
name=moving_mean_name,
dtype=input.dtype, dtype=input.dtype,
shape=param_shape, shape=param_shape,
persistable=True, persistable=True,
...@@ -1484,6 +1525,7 @@ def batch_norm(input, ...@@ -1484,6 +1525,7 @@ def batch_norm(input,
helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
variance = helper.create_global_variable( variance = helper.create_global_variable(
name=moving_variance_name,
dtype=input.dtype, dtype=input.dtype,
shape=param_shape, shape=param_shape,
persistable=True, persistable=True,
......
...@@ -56,8 +56,10 @@ __all__ = [ ...@@ -56,8 +56,10 @@ __all__ = [
'elementwise_mul', 'elementwise_mul',
'elementwise_max', 'elementwise_max',
'elementwise_min', 'elementwise_min',
'elementwise_pow',
'clip', 'clip',
'clip_by_norm', 'clip_by_norm',
'softmax',
'sequence_softmax', 'sequence_softmax',
] + __activations__ ] + __activations__
......
...@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper ...@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable from ..framework import Variable
from ..initializer import Constant
from ..core import DataType from ..core import DataType
import numpy import numpy
__all__ = [ __all__ = [
'create_tensor', 'create_tensor',
'create_parameter', 'create_parameter',
'create_global_var',
'cast', 'cast',
'concat', 'concat',
'sums', 'sums',
...@@ -58,13 +60,22 @@ def create_parameter(shape, ...@@ -58,13 +60,22 @@ def create_parameter(shape,
Returns: Returns:
Parameter: the created parameter Parameter: the created parameter
""" """
helper = LayerHelper("create_parameter") helper = LayerHelper("create_parameter", **locals())
if attr is None: if attr is None:
attr = ParamAttr() attr = ParamAttr()
return helper.create_parameter(attr, shape, dtype, is_bias, return helper.create_parameter(attr, shape, dtype, is_bias,
default_initializer) default_initializer)
def create_global_var(shape, value, dtype, persistable=False, name=None):
helper = LayerHelper("global_var", **locals())
var = helper.create_global_variable(
dtype=dtype, shape=shape, persistable=persistable, name=name)
helper.set_variable_initializer(
var, initializer=Constant(value=float(value)))
return var
def cast(x, dtype): def cast(x, dtype):
""" """
This function takes in the input with input_dtype This function takes in the input with input_dtype
...@@ -284,7 +295,7 @@ def fill_constant_batch_size_like(input, ...@@ -284,7 +295,7 @@ def fill_constant_batch_size_like(input,
return out return out
def ones(shape, dtype): def ones(shape, dtype, force_cpu=False):
""" """
**ones** **ones**
...@@ -308,7 +319,7 @@ def ones(shape, dtype): ...@@ -308,7 +319,7 @@ def ones(shape, dtype):
return fill_constant(value=1.0, **locals()) return fill_constant(value=1.0, **locals())
def zeros(shape, dtype): def zeros(shape, dtype, force_cpu=False):
""" """
**zeros** **zeros**
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
from framework import Variable
__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
"""
When training a model, it's often useful to decay the
learning rate during training process, this is called
learning_rate_decay. There are many strategies to do
this, this module will provide some classical method.
User can also implement their own learning_rate_decay
strategy according to this module.
"""
def exponential_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
"""Applies exponential decay to the learning rate.
```python
decayed_learning_rate = learning_rate *
decay_rate ^ (global_step / decay_steps)
```
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
decay_rate: A Python `float` number.
staircase: Boolean. If set true, decay the learning rate every decay_steps.
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for exponential_decay.")
# update learning_rate
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
return learning_rate * (decay_rate**div_res)
def natural_exp_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
"""Applies natural exponential decay to the initial learning rate.
```python
if not staircase:
decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
else:
decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
```
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
decay_rate: A Python `float` number.
staircase: Boolean. If set true, decay the learning rate every decay_steps.
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for natural_exp_decay.")
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
def inverse_time_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
"""Applies inverse time decay to the initial learning rate.
```python
if staircase:
decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
else
decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
```
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
decay_rate: A Python `float` number.
staircase: Boolean. If set true, decay the learning rate every decay_steps.
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.")
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
return learning_rate / (1 + decay_rate * div_res)
...@@ -31,7 +31,7 @@ dtype_to_size = { ...@@ -31,7 +31,7 @@ dtype_to_size = {
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num): def __init__(self, Program, ops, forward_num, skip_opt):
self._program = Program self._program = Program
self._ops = ops self._ops = ops
self._forward_num = forward_num self._forward_num = forward_num
...@@ -41,6 +41,7 @@ class ControlFlowGraph(object): ...@@ -41,6 +41,7 @@ class ControlFlowGraph(object):
self._defs = defaultdict(set) self._defs = defaultdict(set)
self._live_in = defaultdict(set) self._live_in = defaultdict(set)
self._live_out = defaultdict(set) self._live_out = defaultdict(set)
self._skip_opt = skip_opt
def _add_connections(self, connections): def _add_connections(self, connections):
for node1, node2 in connections: for node1, node2 in connections:
...@@ -130,6 +131,10 @@ class ControlFlowGraph(object): ...@@ -130,6 +131,10 @@ class ControlFlowGraph(object):
block_desc, x, block_desc, x,
is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
return False return False
if x in self._skip_opt:
return False
if not self._find_var(block_desc, x, is_forward).shape():
return False
return True return True
self._build_graph() self._build_graph()
...@@ -140,6 +145,7 @@ class ControlFlowGraph(object): ...@@ -140,6 +145,7 @@ class ControlFlowGraph(object):
if op.type() == "while" or op.type() == "while_grad": if op.type() == "while" or op.type() == "while_grad":
continue continue
block_desc = op.block() block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num is_forward = i < self._forward_num
if self.pool: if self.pool:
defs_can_optimize = filter( defs_can_optimize = filter(
...@@ -197,28 +203,32 @@ def get_cfgs(input_program): ...@@ -197,28 +203,32 @@ def get_cfgs(input_program):
block_desc = pdesc.block(0) block_desc = pdesc.block(0)
op_size = block_desc.op_size() op_size = block_desc.op_size()
# Get global block ops # Get global block ops
ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size)) ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set()))
while_sub_block_ids = [] while_sub_block_ids = []
while_grad_sub_block_ids = [] while_grad_sub_block_ids = []
while_pair = [] while_op_output = set()
while_block_id_pair = []
for i in range(op_size): for i in range(op_size):
op = block_desc.op(i) op = block_desc.op(i)
if op.type() == "while": if op.type() == "while":
while_sub_block_ids.append(op.attr("sub_block").id) while_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
elif op.type() == "while_grad": elif op.type() == "while_grad":
while_grad_sub_block_ids.append(op.attr("sub_block").id) while_grad_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
# Find while/while_grad block pair # Find while/while_grad block pair
for grad_id in while_grad_sub_block_ids: for grad_id in while_grad_sub_block_ids:
parent_id = pdesc.block(grad_id).parent parent_id = pdesc.block(grad_id).parent
if parent_id in while_sub_block_ids: if parent_id in while_sub_block_ids:
while_pair.append((parent_id, grad_id)) while_block_id_pair.append((parent_id, grad_id))
while_sub_block_ids.remove(parent_id) while_sub_block_ids.remove(parent_id)
# Get while/while_grad block ops # Get while/while_grad block ops
for parent_id, grad_id in while_pair: for parent_id, grad_id in while_block_id_pair:
while_block_ops = [] while_block_ops = []
while_block = pdesc.block(parent_id) while_block = pdesc.block(parent_id)
while_block_op_size = while_block.op_size() while_block_op_size = while_block.op_size()
...@@ -230,7 +240,7 @@ def get_cfgs(input_program): ...@@ -230,7 +240,7 @@ def get_cfgs(input_program):
for i in range(while_grad_block_op_size): for i in range(while_grad_block_op_size):
while_block_ops.append(while_grad_block.op(i)) while_block_ops.append(while_grad_block.op(i))
ops_list.append((while_block_ops, while_block_op_size)) ops_list.append((while_block_ops, while_block_op_size, while_op_output))
# Process rest while block ops # Process rest while block ops
for parent_id in while_sub_block_ids: for parent_id in while_sub_block_ids:
...@@ -242,7 +252,7 @@ def get_cfgs(input_program): ...@@ -242,7 +252,7 @@ def get_cfgs(input_program):
ops_list.append((while_block_ops, while_block_op_size)) ops_list.append((while_block_ops, while_block_op_size))
cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list] cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
return cfgs return cfgs
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from collections import defaultdict from collections import defaultdict
import framework import framework
import layers
from backward import append_backward from backward import append_backward
from framework import unique_name, program_guard from framework import unique_name, program_guard
from initializer import Constant from initializer import Constant
...@@ -33,9 +34,11 @@ class Optimizer(object): ...@@ -33,9 +34,11 @@ class Optimizer(object):
but need to use one of it's implementation. but need to use one of it's implementation.
""" """
def __init__(self, global_step=None, regularization=None): def __init__(self, learning_rate, global_step=None, regularization=None):
assert learning_rate is not None
self._global_step = global_step self._global_step = global_step
self.regularization = regularization self.regularization = regularization
self._global_learning_rate = learning_rate
# Dictionary of accumulators. Some optimizer subclasses need to # Dictionary of accumulators. Some optimizer subclasses need to
# allocate and manage extra variables associated with the parameters # allocate and manage extra variables associated with the parameters
# to train. These variables are called accumulators. # to train. These variables are called accumulators.
...@@ -43,6 +46,28 @@ class Optimizer(object): ...@@ -43,6 +46,28 @@ class Optimizer(object):
self._accumulators = defaultdict(lambda: dict()) self._accumulators = defaultdict(lambda: dict())
self.helper = None self.helper = None
def _create_global_learning_rate(self):
if isinstance(self._global_learning_rate, float):
self._global_learning_rate = layers.create_global_var(
name=unique_name("learning_rate"),
shape=[1],
value=float(self._global_learning_rate),
dtype='float32',
persistable=True)
if not isinstance(self._global_learning_rate, framework.Variable):
raise ValueError("learning rate should be a Variable, "
"actual type is %s",
type(self._global_learning_rate))
@property
def global_learning_rate(self):
"""
get global decayed learning rate
:return:
"""
return self._global_learning_rate
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op """ append optimize operator to block and return all the added optimize_op
""" """
...@@ -52,17 +77,7 @@ class Optimizer(object): ...@@ -52,17 +77,7 @@ class Optimizer(object):
# create learning rate variable for every parameter # create learning rate variable for every parameter
param = param_and_grad[0] param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] param_lr = param.optimize_attr['learning_rate']
param_lr_shape = [1] return self._global_learning_rate * param_lr
param_lr_var = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=param_lr_shape,
lod_level=1,
persistable=True)
param_lr = param_lr * self._learning_rate
self.helper.set_variable_initializer(
var=param_lr_var, initializer=Constant(param_lr))
return param_lr_var
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters """Create all accumulators needed by the parameters
...@@ -163,7 +178,7 @@ class Optimizer(object): ...@@ -163,7 +178,7 @@ class Optimizer(object):
optimization. This will include parameter update ops, global step optimization. This will include parameter update ops, global step
update ops and any other custom ops required by subclasses to manage update ops and any other custom ops required by subclasses to manage
their internal state. their internal state.
:param startup_program: :param startup_program:
""" """
# This is a default implementation of create_optimization_pass that # This is a default implementation of create_optimization_pass that
# can be shared by most optimizers. This implementation assumes that # can be shared by most optimizers. This implementation assumes that
...@@ -178,6 +193,7 @@ class Optimizer(object): ...@@ -178,6 +193,7 @@ class Optimizer(object):
self.helper = LayerHelper(self.__class__.__name__) self.helper = LayerHelper(self.__class__.__name__)
self._create_accumulators(loss.block, self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads]) [p[0] for p in parameters_and_grads])
self._create_global_learning_rate()
optimize_ops = [] optimize_ops = []
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
...@@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer): ...@@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer):
def __init__(self, learning_rate, **kwargs): def __init__(self, learning_rate, **kwargs):
assert learning_rate is not None assert learning_rate is not None
super(SGDOptimizer, self).__init__(**kwargs) super(SGDOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "sgd" self.type = "sgd"
self._learning_rate = learning_rate
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer): ...@@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer):
def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
assert learning_rate is not None assert learning_rate is not None
assert momentum is not None assert momentum is not None
super(MomentumOptimizer, self).__init__(**kwargs) super(MomentumOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "momentum" self.type = "momentum"
self._learning_rate = learning_rate
self._momentum = momentum self._momentum = momentum
self._use_nesterov = bool(use_nesterov) self._use_nesterov = bool(use_nesterov)
...@@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer): ...@@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer):
def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
assert learning_rate is not None assert learning_rate is not None
assert epsilon is not None assert epsilon is not None
super(AdagradOptimizer, self).__init__(**kwargs) super(AdagradOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adagrad" self.type = "adagrad"
self._learning_rate = learning_rate
self._epsilon = epsilon self._epsilon = epsilon
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
...@@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer): ...@@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer):
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
assert epsilon is not None assert epsilon is not None
super(AdamOptimizer, self).__init__(**kwargs) super(AdamOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adam" self.type = "adam"
self._learning_rate = learning_rate
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
...@@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer): ...@@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer):
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
assert epsilon is not None assert epsilon is not None
super(AdamaxOptimizer, self).__init__(**kwargs) super(AdamaxOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adamax" self.type = "adamax"
self._learning_rate = learning_rate
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
...@@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer):
assert decay is not None assert decay is not None
assert epsilon is not None assert epsilon is not None
super(DecayedAdagradOptimizer, self).__init__(**kwargs) super(DecayedAdagradOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "decayed_adagrad" self.type = "decayed_adagrad"
self._learning_rate = learning_rate
self._decay = decay self._decay = decay
self._epsilon = epsilon self._epsilon = epsilon
......
...@@ -12,11 +12,11 @@ ...@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.fluid.core as core import core
from contextlib import contextmanager from contextlib import contextmanager
import os import os
__all__ = ['CudaProfiler'] __all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
NVPROF_CONFIG = [ NVPROF_CONFIG = [
"gpustarttimestamp", "gpustarttimestamp",
...@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None): ...@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
core.enable_profiler(prof_state) core.enable_profiler(prof_state)
yield yield
if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The state must be in 'calls', 'total', "
"'max', 'min', 'ave'")
sorted_key = 'default' if sorted_key is None else sorted_key sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', "
"'max', 'min' and 'ave'")
key_map = { key_map = {
'default': core.EventSortingKey.kDefault, 'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls, 'calls': core.EventSortingKey.kCalls,
......
...@@ -5,9 +5,11 @@ if(NOT WITH_DISTRIBUTE) ...@@ -5,9 +5,11 @@ if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_recv_op) list(REMOVE_ITEM TEST_OPS test_recv_op)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
endforeach() endforeach()
py_test(test_warpctc_op SRCS test_warpctc_op.py ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
add_subdirectory(book) add_subdirectory(book)
add_subdirectory(book_distribute) add_subdirectory(book_distribute)
......
recognize_digits_*.inference.model
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
py_test(test_recognize_digits_mlp_cpu
SRCS test_recognize_digits.py
ARGS mlp)
py_test(test_recognize_digits_mlp_cuda
SRCS test_recognize_digits.py
ARGS mlp --use_cuda)
py_test(test_recognize_digits_conv_cpu
SRCS test_recognize_digits.py
ARGS conv)
py_test(test_recognize_digits_conv_cuda
SRCS test_recognize_digits.py
ARGS conv --use_cuda)
py_test(test_recognize_digits_mlp_cpu_parallel
SRCS test_recognize_digits.py
ARGS mlp --parallel)
py_test(test_recognize_digits_mlp_cuda_parallel
SRCS test_recognize_digits.py
ARGS mlp --use_cuda --parallel)
py_test(test_recognize_digits_conv_cpu_parallel
SRCS test_recognize_digits.py
ARGS conv --parallel)
py_test(test_recognize_digits_conv_cuda_parallel
SRCS test_recognize_digits.py
ARGS conv --use_cuda --parallel)
# default test # default test
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
......
...@@ -12,44 +12,74 @@ ...@@ -12,44 +12,74 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import contextlib
import unittest
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None) def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
y = fluid.layers.data(name='y', shape=[1], dtype='float32') x = fluid.layers.data(name='x', shape=[13], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y) y_predict = fluid.layers.fc(input=x, size=1, act=None)
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) y = fluid.layers.data(name='y', shape=[1], dtype='float32')
sgd_optimizer.minimize(avg_cost)
BATCH_SIZE = 20 cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
train_reader = paddle.batch( sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
paddle.reader.shuffle( sgd_optimizer.minimize(avg_cost)
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace() BATCH_SIZE = 20
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=BATCH_SIZE)
PASS_NUM = 100 place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
for pass_id in range(PASS_NUM): feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
fluid.io.save_persistables(exe, "./fit_a_line.model/") exe = fluid.Executor(place)
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): exe.run(fluid.default_startup_program())
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), PASS_NUM = 100
fetch_list=[avg_cost]) for pass_id in range(PASS_NUM):
print(avg_loss_value) fluid.io.save_persistables(exe, "./fit_a_line.model/")
if avg_loss_value[0] < 10.0: fluid.io.load_persistables(exe, "./fit_a_line.model/")
exit(0) # if avg cost less than 10.0, we think our code is good. for data in train_reader():
exit(1) avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
print(avg_loss_value)
if avg_loss_value[0] < 10.0:
return
raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
avg_loss_value[0]))
class TestFitALine(unittest.TestCase):
def test_cpu(self):
with self.program_scope_guard():
main(use_cuda=False)
def test_cuda(self):
with self.program_scope_guard():
main(use_cuda=True)
@contextlib.contextmanager
def program_scope_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
unittest.main()
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
from __future__ import print_function from __future__ import print_function
import sys
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import unittest
import contextlib
def resnet_cifar10(input, depth=32): def resnet_cifar10(input, depth=32):
...@@ -89,56 +89,89 @@ def vgg16_bn_drop(input): ...@@ -89,56 +89,89 @@ def vgg16_bn_drop(input):
return fc2 return fc2
classdim = 10 def main(net_type, use_cuda):
data_shape = [3, 32, 32] if use_cuda and not fluid.core.is_compiled_with_cuda():
return
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') classdim = 10
data_shape = [3, 32, 32]
net_type = "vgg"
if len(sys.argv) >= 2: images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net_type = sys.argv[1] label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if net_type == "vgg": if net_type == "vgg":
print("train vgg net") print("train vgg net")
net = vgg16_bn_drop(images) net = vgg16_bn_drop(images)
elif net_type == "resnet": elif net_type == "resnet":
print("train resnet") print("train resnet")
net = resnet_cifar10(images, 32) net = resnet_cifar10(images, 32)
else: else:
raise ValueError("%s network is not supported" % net_type) raise ValueError("%s network is not supported" % net_type)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax') predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
opts = optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label) accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
BATCH_SIZE = 128 BATCH_SIZE = 128
PASS_NUM = 1 PASS_NUM = 1
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10), paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): loss = 0.0
accuracy.reset(exe) for pass_id in range(PASS_NUM):
for data in train_reader(): accuracy.reset(exe)
loss, acc = exe.run(fluid.default_main_program(), for data in train_reader():
feed=feeder.feed(data), loss, acc = exe.run(fluid.default_main_program(),
fetch_list=[avg_cost] + accuracy.metrics) feed=feeder.feed(data),
pass_acc = accuracy.eval(exe) fetch_list=[avg_cost] + accuracy.metrics)
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( pass_acc = accuracy.eval(exe)
pass_acc)) print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
# this model is slow, so if we can train two mini batch, we think it works properly. pass_acc))
exit(0) return
exit(1)
raise AssertionError(
"Image classification loss is too large, {0:2.2}".format(loss))
class TestImageClassification(unittest.TestCase):
def test_vgg_cuda(self):
with self.scope_prog_guard():
main('vgg', use_cuda=True)
def test_resnet_cuda(self):
with self.scope_prog_guard():
main('resnet', use_cuda=True)
def test_vgg_cpu(self):
with self.scope_prog_guard():
main('vgg', use_cuda=False)
def test_resnet_cpu(self):
with self.scope_prog_guard():
main('resnet', use_cuda=False)
@contextlib.contextmanager
def scope_prog_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
unittest.main()
...@@ -175,7 +175,7 @@ def main(): ...@@ -175,7 +175,7 @@ def main():
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192), paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
#place = fluid.CPUPlace() # place = fluid.CPUPlace()
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
feeder = fluid.DataFeeder( feeder = fluid.DataFeeder(
feed_list=[ feed_list=[
......
...@@ -11,21 +11,20 @@ ...@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import contextlib
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as pd import paddle.v2.fluid.layers as pd
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import unittest
dict_size = 30000 dict_size = 30000
source_dict_dim = target_dict_dim = dict_size source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32 hidden_dim = 32
word_dim = 16 word_dim = 16
IS_SPARSE = True
batch_size = 2 batch_size = 2
max_length = 8 max_length = 8
topk_size = 50 topk_size = 50
...@@ -34,10 +33,8 @@ beam_size = 2 ...@@ -34,10 +33,8 @@ beam_size = 2
decoder_size = hidden_dim decoder_size = hidden_dim
place = core.CPUPlace()
def encoder(is_sparse):
def encoder():
# encoder # encoder
src_word_id = pd.data( src_word_id = pd.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1) name="src_word_id", shape=[1], dtype='int64', lod_level=1)
...@@ -45,7 +42,7 @@ def encoder(): ...@@ -45,7 +42,7 @@ def encoder():
input=src_word_id, input=src_word_id,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
...@@ -54,7 +51,7 @@ def encoder(): ...@@ -54,7 +51,7 @@ def encoder():
return encoder_out return encoder_out
def decoder_train(context): def decoder_train(context, is_sparse):
# decoder # decoder
trg_language_word = pd.data( trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1) name="target_language_word", shape=[1], dtype='int64', lod_level=1)
...@@ -62,7 +59,7 @@ def decoder_train(context): ...@@ -62,7 +59,7 @@ def decoder_train(context):
input=trg_language_word, input=trg_language_word,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
rnn = pd.DynamicRNN() rnn = pd.DynamicRNN()
...@@ -82,10 +79,10 @@ def decoder_train(context): ...@@ -82,10 +79,10 @@ def decoder_train(context):
return rnn() return rnn()
def decoder_decode(context): def decoder_decode(context, is_sparse):
init_state = context init_state = context
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
counter = pd.zeros(shape=[1], dtype='int64') counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
# fill the first element with init_state # fill the first element with init_state
state_array = pd.create_array('float32') state_array = pd.create_array('float32')
...@@ -117,7 +114,7 @@ def decoder_decode(context): ...@@ -117,7 +114,7 @@ def decoder_decode(context):
input=pre_ids, input=pre_ids,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE) is_sparse=is_sparse)
# use rnn unit to update rnn # use rnn unit to update rnn
current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
...@@ -150,7 +147,7 @@ def decoder_decode(context): ...@@ -150,7 +147,7 @@ def decoder_decode(context):
def set_init_lod(data, lod, place): def set_init_lod(data, lod, place):
res = core.LoDTensor() res = fluid.LoDTensor()
res.set(data, place) res.set(data, place)
res.set_lod(lod) res.set_lod(lod)
return res return res
...@@ -165,15 +162,19 @@ def to_lodtensor(data, place): ...@@ -165,15 +162,19 @@ def to_lodtensor(data, place):
lod.append(cur_len) lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1]) flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor() res = fluid.LoDTensor()
res.set(flattened_data, place) res.set(flattened_data, place)
res.set_lod([lod]) res.set_lod([lod])
return res return res
def train_main(): def train_main(use_cuda, is_sparse):
context = encoder() if use_cuda and not fluid.core.is_compiled_with_cuda():
rnn_out = decoder_train(context) return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
rnn_out = decoder_train(context, is_sparse)
label = pd.data( label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = pd.cross_entropy(input=rnn_out, label=label) cost = pd.cross_entropy(input=rnn_out, label=label)
...@@ -212,9 +213,13 @@ def train_main(): ...@@ -212,9 +213,13 @@ def train_main():
batch_id += 1 batch_id += 1
def decode_main(): def decode_main(use_cuda, is_sparse):
context = encoder() if use_cuda and not fluid.core.is_compiled_with_cuda():
translation_ids, translation_scores = decoder_decode(context) return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
translation_ids, translation_scores = decoder_decode(context, is_sparse)
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -250,6 +255,60 @@ def decode_main(): ...@@ -250,6 +255,60 @@ def decode_main():
break break
class TestMachineTranslation(unittest.TestCase):
pass
@contextlib.contextmanager
def scope_prog_guard():
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
def inject_test_train(use_cuda, is_sparse):
f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
if is_sparse else 'dense')
def f(*args):
with scope_prog_guard():
train_main(use_cuda, is_sparse)
setattr(TestMachineTranslation, f_name, f)
def inject_test_decode(use_cuda, is_sparse, decorator=None):
f_name = 'test_{0}_{1}_decode'.format('cuda'
if use_cuda else 'cpu', 'sparse'
if is_sparse else 'dense')
def f(*args):
with scope_prog_guard():
decode_main(use_cuda, is_sparse)
if decorator is not None:
f = decorator(f)
setattr(TestMachineTranslation, f_name, f)
for _use_cuda_ in (False, True):
for _is_sparse_ in (False, True):
inject_test_train(_use_cuda_, _is_sparse_)
for _use_cuda_ in (False, True):
for _is_sparse_ in (False, True):
_decorator_ = None
if _use_cuda_:
_decorator_ = unittest.skip(
reason='Beam Search does not support CUDA!')
inject_test_decode(
is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
if __name__ == '__main__': if __name__ == '__main__':
# train_main() unittest.main()
decode_main()
...@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid ...@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
import sys import sys
import numpy import numpy
import unittest
def parse_arg(): def parse_arg():
...@@ -45,8 +46,9 @@ BATCH_SIZE = 64 ...@@ -45,8 +46,9 @@ BATCH_SIZE = 64
def loss_net(hidden, label): def loss_net(hidden, label):
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.cross_entropy(input=prediction, label=label)
return fluid.layers.mean(x=loss), fluid.layers.accuracy( avg_loss = fluid.layers.mean(x=loss)
input=prediction, label=label) acc = fluid.layers.accuracy(input=prediction, label=label)
return prediction, avg_loss, acc
def mlp(img, label): def mlp(img, label):
...@@ -73,25 +75,25 @@ def conv_net(img, label): ...@@ -73,25 +75,25 @@ def conv_net(img, label):
return loss_net(conv_pool_2, label) return loss_net(conv_pool_2, label)
def main(): def train(nn_type, use_cuda, parallel, save_dirname):
args = parse_arg() if use_cuda and not fluid.core.is_compiled_with_cuda():
print("recognize digits with args: {0}".format(" ".join(sys.argv[1:]))) return
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.nn_type == 'mlp': if nn_type == 'mlp':
net_conf = mlp net_conf = mlp
else: else:
net_conf = conv_net net_conf = conv_net
if args.parallel: if parallel:
places = fluid.layers.get_places() places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places) pd = fluid.layers.ParallelDo(places)
with pd.do(): with pd.do():
img_ = pd.read_input(img) img_ = pd.read_input(img)
label_ = pd.read_input(label) label_ = pd.read_input(label)
for o in net_conf(img_, label_): prediction, avg_loss, acc = net_conf(img_, label_)
for o in [avg_loss, acc]:
pd.write_output(o) pd.write_output(o)
avg_loss, acc = pd() avg_loss, acc = pd()
...@@ -99,14 +101,14 @@ def main(): ...@@ -99,14 +101,14 @@ def main():
avg_loss = fluid.layers.mean(x=avg_loss) avg_loss = fluid.layers.mean(x=avg_loss)
acc = fluid.layers.mean(x=acc) acc = fluid.layers.mean(x=acc)
else: else:
avg_loss, acc = net_conf(img, label) prediction, avg_loss, acc = net_conf(img, label)
test_program = fluid.default_main_program().clone() test_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -137,13 +139,85 @@ def main(): ...@@ -137,13 +139,85 @@ def main():
acc_val = numpy.array(acc_set).mean() acc_val = numpy.array(acc_set).mean()
avg_loss_val = numpy.array(avg_loss_set).mean() avg_loss_val = numpy.array(avg_loss_set).mean()
if float(acc_val) > 0.85: # test acc > 85% if float(acc_val) > 0.85: # test acc > 85%
exit(0) if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, ["img"],
[prediction], exe)
return
else: else:
print( print(
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1, format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val))) float(avg_loss_val), float(acc_val)))
raise AssertionError("Loss of recognize digits is too large")
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input's dimension of conv should be 4-D or 5-D.
tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
print("infer results: ", results[0])
def main(use_cuda, parallel, nn_type):
if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model"
else:
save_dirname = None
train(
nn_type=nn_type,
use_cuda=use_cuda,
parallel=parallel,
save_dirname=save_dirname)
infer(use_cuda=use_cuda, save_dirname=save_dirname)
class TestRecognizeDigits(unittest.TestCase):
pass
def inject_test_method(use_cuda, parallel, nn_type):
def __impl__(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
main(use_cuda, parallel, nn_type)
fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda'
if use_cuda else 'cpu', 'parallel'
if parallel else 'normal')
setattr(TestRecognizeDigits, fn, __impl__)
def inject_all_tests():
for use_cuda in (False, True):
for parallel in (False, True):
for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type)
inject_all_tests()
if __name__ == '__main__': if __name__ == '__main__':
main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -12,9 +12,36 @@ ...@@ -12,9 +12,36 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np import unittest
import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2 as paddle
import contextlib
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
hid_dim=32):
emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
conv_3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=hid_dim,
filter_size=3,
act="tanh",
pool_type="sqrt")
conv_4 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=hid_dim,
filter_size=4,
act="tanh",
pool_type="sqrt")
prediction = fluid.layers.fc(input=[conv_3, conv_4],
size=class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy
def stacked_lstm_net(data, def stacked_lstm_net(data,
...@@ -51,63 +78,77 @@ def stacked_lstm_net(data, ...@@ -51,63 +78,77 @@ def stacked_lstm_net(data,
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost) adam_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy, accuracy.metrics[0] return avg_cost, accuracy
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
BATCH_SIZE = 100
PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict()
print "load word dict successfully" def main(word_dict, net_method, use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
BATCH_SIZE = 128
PASS_NUM = 5
dict_dim = len(word_dict) dict_dim = len(word_dict)
class_dim = 2 class_dim = 2
data = fluid.layers.data( data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1) name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64") label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out = stacked_lstm_net( cost, acc_out = net_method(
data, label, input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=1000), paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
accuracy.reset(exe)
for data in train_data(): for data in train_data():
cost_val, acc_val = exe.run(fluid.default_main_program(), cost_val, acc_val = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[cost, acc_out]) fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val))
print("cost=" + str(cost_val) + " acc=" + str(acc_val) + if cost_val < 0.4 and acc_val > 0.8:
" pass_acc=" + str(pass_acc)) return
if cost_val < 1.0 and acc_val > 0.8: raise AssertionError("Cost is too large for {0}".format(
exit(0) net_method.__name__))
exit(1)
class TestUnderstandSentiment(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.word_dict = paddle.dataset.imdb.word_dict()
@contextlib.contextmanager
def new_program_scope(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
def test_conv_cpu(self):
with self.new_program_scope():
main(self.word_dict, net_method=convolution_net, use_cuda=False)
def test_stacked_lstm_cpu(self):
with self.new_program_scope():
main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False)
def test_conv_gpu(self):
with self.new_program_scope():
main(self.word_dict, net_method=convolution_net, use_cuda=True)
def test_stacked_lstm_gpu(self):
with self.new_program_scope():
main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True)
if __name__ == '__main__': if __name__ == '__main__':
main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
hid_dim=32):
emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
conv_3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=hid_dim,
filter_size=3,
act="tanh",
pool_type="sqrt")
conv_4 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=hid_dim,
filter_size=4,
act="tanh",
pool_type="sqrt")
prediction = fluid.layers.fc(input=[conv_3, conv_4],
size=class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
return avg_cost, accuracy, accuracy.metrics[0]
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
BATCH_SIZE = 100
PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out = convolution_net(
data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM):
accuracy.reset(exe)
for data in train_data():
cost_val, acc_val = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe)
print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
" pass_acc=" + str(pass_acc))
if cost_val < 1.0 and pass_acc > 0.8:
exit(0)
exit(1)
if __name__ == '__main__':
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
from paddle.v2.fluid.layer_helper import LayerHelper
def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
"""
This function helps create an operator for the LSTM (Long Short Term
Memory) cell that can be used inside an RNN.
"""
helper = LayerHelper('lstm_unit', **locals())
rnn = fluid.layers.StaticRNN()
with rnn.step():
c_pre = rnn.memory(init=c_pre_init)
x_t = rnn.step_input(x)
before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)
dtype = x.dtype
c = helper.create_tmp_variable(dtype)
h = helper.create_tmp_variable(dtype)
helper.append_op(
type='lstm_unit',
inputs={"X": after_fc,
"C_prev": c_pre},
outputs={"C": c,
"H": h},
attrs={"forget_bias": forget_bias})
rnn.update_memory(c_pre, c)
rnn.output(h)
return rnn()
def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
data = fluid.layers.data(
name="words",
shape=[seq_len * batch_size, 1],
append_batch_size=False,
dtype="int64",
lod_level=1)
label = fluid.layers.data(
name="label",
shape=[batch_size, 1],
append_batch_size=False,
dtype="int64")
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2])
c_pre_init = fluid.layers.fill_constant(
dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
c_pre_init.stop_gradient = False
layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
layer_1_out = fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2])
prediction = fluid.layers.fc(input=layer_1_out,
size=class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, acc
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def chop_data(data, chop_len=80, batch_size=50):
data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
return data[:batch_size]
def prepare_feed_data(data, place):
tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
label = np.array(map(lambda x: x[1], data)).astype("int64")
label = label.reshape([len(label), 1])
tensor_label = fluid.LoDTensor()
tensor_label.set(label, place)
return tensor_words, tensor_label
def main():
BATCH_SIZE = 100
PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict()
print "load word dict successfully"
dict_dim = len(word_dict)
class_dim = 2
cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM):
for data in train_data():
chopped_data = chop_data(data)
tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
outs = exe.run(fluid.default_main_program(),
feed={"words": tensor_words,
"label": tensor_label},
fetch_list=[cost, acc])
cost_val = np.array(outs[0])
acc_val = np.array(outs[1])
print("cost=" + str(cost_val) + " acc=" + str(acc_val))
if acc_val > 0.7:
exit(0)
exit(1)
if __name__ == '__main__':
main()
...@@ -12,76 +12,145 @@ ...@@ -12,76 +12,145 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import unittest
import os
PASS_NUM = 100
EMBED_SIZE = 32 def main(use_cuda, is_sparse, parallel):
HIDDEN_SIZE = 256 if use_cuda and not fluid.core.is_compiled_with_cuda():
N = 5 return
BATCH_SIZE = 32
IS_SPARSE = True PASS_NUM = 100
EMBED_SIZE = 32
word_dict = paddle.dataset.imikolov.build_dict() HIDDEN_SIZE = 256
dict_size = len(word_dict) N = 5
BATCH_SIZE = 32
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') IS_SPARSE = is_sparse
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') def __network__(words):
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') embed_first = fluid.layers.embedding(
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') input=words[0],
size=[dict_size, EMBED_SIZE],
embed_first = fluid.layers.embedding( dtype='float32',
input=first_word, is_sparse=IS_SPARSE,
size=[dict_size, EMBED_SIZE], param_attr='shared_w')
dtype='float32', embed_second = fluid.layers.embedding(
is_sparse=IS_SPARSE, input=words[1],
param_attr='shared_w') size=[dict_size, EMBED_SIZE],
embed_second = fluid.layers.embedding( dtype='float32',
input=second_word, is_sparse=IS_SPARSE,
size=[dict_size, EMBED_SIZE], param_attr='shared_w')
dtype='float32', embed_third = fluid.layers.embedding(
is_sparse=IS_SPARSE, input=words[2],
param_attr='shared_w') size=[dict_size, EMBED_SIZE],
embed_third = fluid.layers.embedding( dtype='float32',
input=third_word, is_sparse=IS_SPARSE,
size=[dict_size, EMBED_SIZE], param_attr='shared_w')
dtype='float32', embed_forth = fluid.layers.embedding(
is_sparse=IS_SPARSE, input=words[3],
param_attr='shared_w') size=[dict_size, EMBED_SIZE],
embed_forth = fluid.layers.embedding( dtype='float32',
input=forth_word, is_sparse=IS_SPARSE,
size=[dict_size, EMBED_SIZE], param_attr='shared_w')
dtype='float32',
is_sparse=IS_SPARSE, concat_embed = fluid.layers.concat(
param_attr='shared_w') input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = fluid.layers.fc(input=concat_embed,
concat_embed = fluid.layers.concat( size=HIDDEN_SIZE,
input=[embed_first, embed_second, embed_third, embed_forth], axis=1) act='sigmoid')
hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') predict_word = fluid.layers.fc(input=hidden1,
predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') size=dict_size,
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) act='softmax')
avg_cost = fluid.layers.mean(x=cost) cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer.minimize(avg_cost) return avg_cost
train_reader = paddle.batch( word_dict = paddle.dataset.imikolov.build_dict()
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) dict_size = len(word_dict)
place = fluid.CPUPlace() first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
exe = fluid.Executor(place) second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
feeder = fluid.DataFeeder( third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
feed_list=[first_word, second_word, third_word, forth_word, next_word], forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
place=place) next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
exe.run(fluid.default_startup_program()) if not parallel:
avg_cost = __network__(
for pass_id in range(PASS_NUM): [first_word, second_word, third_word, forth_word, next_word])
for data in train_reader(): else:
avg_cost_np = exe.run(fluid.default_main_program(), places = fluid.layers.get_places()
feed=feeder.feed(data), pd = fluid.layers.ParallelDo(places)
fetch_list=[avg_cost]) with pd.do():
if avg_cost_np[0] < 5.0: avg_cost = __network__(
exit(0) # if avg cost less than 10.0, we think our code is good. map(pd.read_input, [
exit(1) first_word, second_word, third_word, forth_word, next_word
]))
pd.write_output(avg_cost)
avg_cost = fluid.layers.mean(x=pd())
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(
feed_list=[first_word, second_word, third_word, forth_word, next_word],
place=place)
exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM):
for data in train_reader():
avg_cost_np = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
if avg_cost_np[0] < 5.0:
return
raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
FULL_TEST = os.getenv('FULL_TEST',
'0').lower() in ['true', '1', 't', 'y', 'yes', 'on']
SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
class W2VTest(unittest.TestCase):
pass
def inject_test_method(use_cuda, is_sparse, parallel):
fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
if is_sparse else "dense", "parallel"
if parallel else "normal")
def __impl__(*args, **kwargs):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel)
if use_cuda and is_sparse and parallel:
fn = __impl__
else:
# skip the other test when on CI server
fn = unittest.skipUnless(
condition=FULL_TEST, reason=SKIP_REASON)(__impl__)
setattr(W2VTest, fn_name, fn)
for use_cuda in (False, True):
for is_sparse in (False, True):
for parallel in (False, True):
inject_test_method(use_cuda, is_sparse, parallel)
if __name__ == '__main__':
unittest.main()
...@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod): ...@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod):
return match_indices, match_dist return match_indices, match_dist
class TestBipartiteMatchOpForWithLoD(OpTest): class TestBipartiteMatchOpWithLoD(OpTest):
def setUp(self): def setUp(self):
self.op_type = 'bipartite_match' self.op_type = 'bipartite_match'
lod = [[0, 5, 11, 23]] lod = [[0, 5, 11, 23]]
...@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest): ...@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest):
self.inputs = {'DistMat': (dist, lod)} self.inputs = {'DistMat': (dist, lod)}
self.outputs = { self.outputs = {
'ColToRowMatchIndices': (match_indices), 'ColToRowMatchIndices': (match_indices),
'ColToRowMatchDis': (match_dist), 'ColToRowMatchDist': (match_dist),
} }
def test_check_output(self): def test_check_output(self):
...@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): ...@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
self.inputs = {'DistMat': dist} self.inputs = {'DistMat': dist}
self.outputs = { self.outputs = {
'ColToRowMatchIndices': match_indices, 'ColToRowMatchIndices': match_indices,
'ColToRowMatchDis': match_dist, 'ColToRowMatchDist': match_dist,
} }
def test_check_output(self): def test_check_output(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
prior_box_x = (
(prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
prior_box_y = (
(prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
prior_box_width = (
(prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
prior_box_height = (
(prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
prior_box_var.shape[1])
if (code_type == "EncodeCenterSize"):
target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
target_box.shape[0], 1)
target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
target_box.shape[0], 1)
target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
target_box.shape[0], 1)
target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
target_box.shape[0], 1)
output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
prior_box_var[:,:,0]
output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
prior_box_var[:,:,1]
output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
prior_box_var[:,:,2]
output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
prior_box_var[:,:,3]
elif (code_type == "DecodeCenterSize"):
target_box = target_box.reshape(target_box.shape[0], 1,
target_box.shape[1])
target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
prior_box_width + prior_box_x
target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
prior_box_height + prior_box_y
target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
prior_box_width
target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
prior_box_height
output_box[:, :, 0] = target_box_x - target_box_width / 2
output_box[:, :, 1] = target_box_y - target_box_height / 2
output_box[:, :, 2] = target_box_x + target_box_width / 2
output_box[:, :, 3] = target_box_y + target_box_height / 2
def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
n = target_box.shape[0]
m = prior_box.shape[0]
output_box = np.zeros((n, m, 4), dtype=np.float32)
for i in range(len(lod) - 1):
box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var,
output_box[lod[i]:lod[i + 1], :, :], code_type)
return output_box
class TestBoxCoderOp(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "box_coder"
lod = [[0, 20]]
prior_box = np.random.random((10, 4)).astype('float32')
prior_box_var = np.random.random((10, 4)).astype('float32')
target_box = np.random.random((20, 4)).astype('float32')
code_type = "DecodeCenterSize"
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': target_box,
}
self.attrs = {'code_type': 'decode_center_size'}
self.outputs = {'OutputBox': output_box}
class TestBoxCoderOpWithLoD(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "box_coder"
lod = [[0, 4, 12, 20]]
prior_box = np.random.random((10, 4)).astype('float32')
prior_box_var = np.random.random((10, 4)).astype('float32')
target_box = np.random.random((20, 4)).astype('float32')
code_type = "EncodeCenterSize"
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': (target_box, lod),
}
self.attrs = {'code_type': 'encode_center_size'}
self.outputs = {'OutputBox': output_box}
if __name__ == '__main__':
unittest.main()
...@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1): ...@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1):
self.op_type = "conv2d" self.op_type = "conv2d"
class TestDepthwiseConv(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
# cudnn v5 does not support dilation conv. # cudnn v5 does not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # class TestCUDNNWithDilation(TestWithDilation):
# def init_op_type(self): # def init_op_type(self):
......
...@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest): ...@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {'dropout_prob': 0.0, 'is_test': False} self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
self.outputs = { self.outputs = {
'Out': self.inputs['X'], 'Out': self.inputs['X'],
'Mask': np.ones((32, 64)).astype('float32') 'Mask': np.ones((32, 64)).astype('float32')
...@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp): ...@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {'dropout_prob': 1.0, 'is_test': False} self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
self.outputs = { self.outputs = {
'Out': np.zeros((32, 64)).astype('float32'), 'Out': np.zeros((32, 64)).astype('float32'),
'Mask': np.zeros((32, 64)).astype('float32') 'Mask': np.zeros((32, 64)).astype('float32')
...@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp): ...@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
self.attrs = {'dropout_prob': 0.0, 'is_test': False} self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
self.outputs = { self.outputs = {
'Out': self.inputs['X'], 'Out': self.inputs['X'],
'Mask': np.ones((32, 64, 2)).astype('float32') 'Mask': np.ones((32, 64, 2)).astype('float32')
...@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest): ...@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {'dropout_prob': 0.35, 'is_test': True} self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
self.outputs = { self.outputs = {
'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
} }
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestElementwisePowOp(OpTest):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
def test_check_output(self):
self.check_output()
class TestElementwisePowOp_scalar(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.rand(2, 3, 4).astype('float32'),
'Y': np.random.rand(1).astype('float32')
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestLabelSmoothOp(OpTest):
def config(self):
self.op_type = "label_smooth"
self.epsilon = 0.1
batch_size, self.label_dim = 5, 10
self.label = np.zeros((batch_size, self.label_dim)).astype("float64")
nonzero_index = np.random.randint(self.label_dim, size=(batch_size))
self.label[np.arange(batch_size), nonzero_index] = 1
def setUp(self):
self.config()
smoothed_label = (1 - self.epsilon
) * self.label + self.epsilon / self.label_dim
self.inputs = {'X': self.label}
self.attrs = {'epsilon': self.epsilon}
self.outputs = {'Out': smoothed_label}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
def setUp(self):
self.config()
dist = np.random.random((1, self.label_dim))
smoothed_label = (1 - self.epsilon) * self.label + self.epsilon * dist
self.inputs = {'X': self.label, 'PriorDist': dist}
self.attrs = {'epsilon': self.epsilon}
self.outputs = {'Out': smoothed_label}
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from operator import mul
from op_test import OpTest
import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator
from paddle.v2.fluid.framework import grad_var_name
def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
x_shape = x.shape
N = reduce(mul, x_shape[0:begin_norm_axis], 1)
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
x.shape = [N, D]
mean = np.mean(x, axis=1)
var = np.var(x, axis=1) + epsilon
output = scale.reshape([1, D]) * np.divide(
(x - mean.reshape([N, 1])),
(np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
x.shape, output.shape = x_shape, x_shape
return output, mean, var
def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
x_shape = x.shape
scale_shape = scale.shape
N = reduce(mul, x_shape[0:begin_norm_axis], 1)
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
x.shape, grad_y.shape = [N, D], [N, D]
var.shape, mean.shape = [N, 1], [N, 1]
scale.shape = [1, D]
# d_bias
d_bias = np.sum(grad_y, axis=0).reshape([1, D])
# d_scale
d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
axis=0).reshape([1, D])
# dx
dx_end = scale * np.sqrt(1.0 / var) * grad_y
d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
[N, 1]) # the second part equals to zero.
d_mean = 1.0 / D * d_mean_0
d_std = np.sum(
-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
grad_x = dx_end + d_mean + d_std
grad_y.shape = x_shape
x.shape = x_shape
scale.shape = scale_shape
return grad_x, d_scale, d_bias
def get_backward_op(scope, op, no_grad_set):
backward_op = core.Operator.backward(op, no_grad_set)
for input in backward_op.input_vars():
var = scope.var(input)
var.get_tensor()
for output in backward_op.output_vars():
var = scope.var(output)
var.get_tensor()
return backward_op
def create_or_get_tensor(scope, var_name, var, place):
tensor = scope.var(var_name).get_tensor()
if var is not None:
assert isinstance(var, np.ndarray)
tensor.set_lod([[]])
tensor.set_dims(var.shape)
tensor.set(var, place)
return tensor
def set_output_grad(scope, outputs, place, feed_dict=None):
def __set_tensor__(name, data=None):
out_tensor = scope.find_var(name).get_tensor()
grad_tensor = scope.var(grad_var_name(name)).get_tensor()
out_dtype = out_tensor.dtype()
if data is None:
if out_dtype == core.DataType.FP64:
data = np.ones(out_tensor.shape(), dtype=np.float64)
elif out_dtype == core.DataType.FP32:
data = np.ones(out_tensor.shape(), dtype=np.float32)
else:
raise ValueError("Not supported data type " + str(out_dtype))
grad_tensor.set(data, place)
for output in outputs:
data = None
if output in feed_dict:
data = feed_dict[output]
__set_tensor__(output, data)
class TestLayerNormdOp(OpTest):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(
np.allclose(
np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
msg)
def __assert_grad_close(self,
tensor,
np_array,
name,
place,
max_relative_error=0.02):
a = np.array(tensor).reshape(np_array.shape)
b = np_array
abs_a = np.abs(a)
abs_a[abs_a < 1e-5] = 1
diff_mat = np.abs(a - b) / abs_a
max_diff = np.max(diff_mat)
def err_msg():
offset = np.argmax(diff_mat > max_relative_error)
return ("%s Variable %s max gradient diff %f over limit %f, "
"the first error element is %d, %f, %f") % (
"Gradient Check On %s" % str(place), name, max_diff,
max_relative_error, offset, a.flatten()[offset],
b.flatten()[offset])
self.assertLessEqual(max_diff, max_relative_error, err_msg())
def check_forward_backward(self, shape, begin_norm_axis):
def test_with_place(place, shape, begin_norm_axis=1):
# setUp
assert begin_norm_axis > 0 and begin_norm_axis < len(
shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
# attr
epsilon = 0.00001
x_shape = shape
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
scale_shape = [D]
np.random.random(123)
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
y_grad = np.random.random_sample(x_shape).astype(np.float32)
# run forward
y_out, saved_mean, var_ref = _reference_layer_norm_naive(
x_val, scale_val, bias_val, epsilon, begin_norm_axis)
naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
# get gradient
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
naive_grad = {
"X": x_grad_ref,
"Scale": scale_grad_ref,
"Bias": bias_grad_ref
}
scope = core.Scope()
# create input
input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
for i_name in input_map:
create_or_get_tensor(scope, i_name, input_map[i_name], place)
# create output
output_map = {"Y": None, "Mean": None, "Variance": None}
output_tensor = {}
for o_name in output_map:
output_tensor[o_name] = create_or_get_tensor(
scope, o_name, output_map[o_name], place)
layer_norm_op = Operator(
"layer_norm",
# inputs
X="X",
Scale="Scale",
Bias="Bias",
# outputs
Y="Y",
Mean="Mean",
Variance="Variance",
# attrs
epsilon=epsilon,
begin_norm_axis=begin_norm_axis)
layer_norm_op.run(scope, place)
# check forward result
atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
for o_tensor in output_tensor:
self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
o_tensor, atol)
# run backward
layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
set_output_grad(
scope, ["Y", "Mean", "Variance"],
place,
feed_dict={"Y": y_grad})
layer_norm_op_grad.run(scope, place)
# get output
grad_tensor = {}
for o_name in naive_grad:
grad_tensor[o_name] = x_ = create_or_get_tensor(
scope, grad_var_name(o_name), None, place)
# check gradient output
for o_grad in naive_grad:
self.__assert_grad_close(grad_tensor[o_grad],
naive_grad[o_grad], o_grad + "@GRAD",
place)
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(core.CUDAPlace(0))
for place in places:
test_with_place(place, shape, begin_norm_axis)
def test_check_forward_backward_with_scale_and_bias(self):
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
def test_check_forward_backward_with_scale(self):
pass # TODO(zcd)
def test_check_forward_backward_with_bias(self):
pass # TODO(zcd)
def test_check_forward_backward(self):
pass # TODO(zcd)
if __name__ == '__main__':
unittest.main()
...@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase): ...@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.sequence_softmax(x=seq)) self.assertIsNotNone(layers.sequence_softmax(x=seq))
print(str(program)) print(str(program))
def test_softmax(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[10], dtype='float32')
hid = layers.fc(input=data, size=20)
self.assertIsNotNone(layers.softmax(x=hid))
print(str(program))
def test_get_places(self): def test_get_places(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import math
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.learning_rate_decay as lr_decay
def exponential_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
exponent = float(global_step) / float(decay_steps)
if staircase:
exponent = math.floor(exponent)
return learning_rate * decay_rate**exponent
def natural_exp_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
exponent = float(global_step) / float(decay_steps)
if staircase:
exponent = math.floor(exponent)
return learning_rate * math.exp(-1 * decay_rate * exponent)
def inverse_time_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
temp = float(global_step) / float(decay_steps)
if staircase:
temp = math.floor(temp)
return learning_rate / (1 + decay_rate * temp)
class TestLearningRateDecay(unittest.TestCase):
def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
init_lr = 1.0
decay_steps = 5
decay_rate = 0.5
global_step = layers.create_global_var(
shape=[1], value=0.0, dtype='float32', persistable=True)
decayed_lr = fluid_decay_fn(
learning_rate=init_lr,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=decay_rate,
staircase=staircase)
layers.increment(global_step, 1.0)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for step in range(10):
step_val, lr_val = exe.run(fluid.default_main_program(),
feed=[],
fetch_list=[global_step, decayed_lr])
python_decayed_lr = python_decay_fn(
learning_rate=init_lr,
global_step=step,
decay_steps=decay_steps,
decay_rate=decay_rate,
staircase=staircase)
self.assertAlmostEqual(python_decayed_lr, lr_val[0])
def test_decay(self):
decay_fns = [
(exponential_decay, lr_decay.exponential_decay, True),
(exponential_decay, lr_decay.exponential_decay, False),
(natural_exp_decay, lr_decay.natural_exp_decay, True),
(natural_exp_decay, lr_decay.natural_exp_decay, False),
(inverse_time_decay, lr_decay.inverse_time_decay, True),
(inverse_time_decay, lr_decay.inverse_time_decay, False),
]
for py_decay_fn, fluid_decay_fn, staircase in decay_fns:
print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
staircase))
main_program = framework.Program()
startup_program = framework.Program()
with framework.program_guard(main_program, startup_program):
self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
class TestMineHardExamplesOp(OpTest):
def set_data(self):
self.init_test_data()
self.inputs = {
'ClsLoss': self.cls_loss,
'LocLoss': self.loc_loss,
'MatchIndices': self.match_indices,
'MatchDist': self.match_dis
}
self.attrs = {
'neg_pos_ratio': self.neg_pos_ratio,
'neg_overlap': self.neg_overlap,
'sample_size': self.sample_size,
'mining_type': self.mining_type
}
self.outputs = {
'NegIndices': (self.neg_indices, self.neg_indices_lod),
'UpdatedMatchIndices': self.updated_match_indices
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
return
def setUp(self):
self.op_type = "mine_hard_examples"
self.set_data()
def init_test_data(self):
self.neg_pos_ratio = 1.0
self.neg_overlap = 0.5
self.sample_size = 0
self.mining_type = "max_negative"
self.cls_loss = np.array([[0.1, 0.1, 0.3],
[0.3, 0.1, 0.1]]).astype('float32')
self.loc_loss = np.array([[0.1, 0.2, 0.3],
[0.3, 0.4, 0.1]]).astype('float32')
self.match_dis = np.array([[0.2, 0.4, 0.8],
[0.1, 0.9, 0.3]]).astype('float32')
self.match_indices = np.array([[0, -1, -1],
[-1, 0, -1]]).astype('int32')
self.updated_match_indices = self.match_indices
self.neg_indices_lod = [[0, 1, 2]]
self.neg_indices = np.array([[1], [0]]).astype('int32')
class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
def init_test_data(self):
super(TestMineHardExamplesOpHardExample, self).init_test_data()
self.mining_type = "hard_example"
self.sample_size = 2
self.cls_loss = np.array([[0.5, 0.1, 0.3],
[0.3, 0.1, 0.1]]).astype('float32')
self.loc_loss = np.array([[0.2, 0.2, 0.3],
[0.3, 0.1, 0.2]]).astype('float32')
self.match_indices = np.array([[0, -1, -1],
[-1, 0, -1]]).astype('int32')
self.updated_match_indices = np.array([[0, -1, -1],
[-1, -1, -1]]).astype('int32')
self.neg_indices_lod = [[0, 1, 3]]
self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import unittest
import numpy as np
import copy
from op_test import OpTest
def iou(box_a, box_b):
"""Apply intersection-over-union overlap between box_a and box_b
"""
xmin_a = min(box_a[0], box_a[2])
ymin_a = min(box_a[1], box_a[3])
xmax_a = max(box_a[0], box_a[2])
ymax_a = max(box_a[1], box_a[3])
xmin_b = min(box_b[0], box_b[2])
ymin_b = min(box_b[1], box_b[3])
xmax_b = max(box_b[0], box_b[2])
ymax_b = max(box_b[1], box_b[3])
area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
if area_a <= 0 and area_b <= 0:
return 0.0
xa = max(xmin_a, xmin_b)
ya = max(ymin_a, ymin_b)
xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
iou_ratio = inter_area / (area_a + area_b - inter_area)
return iou_ratio
def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
"""Apply non-maximum suppression at test time to avoid detecting too many
overlapping bounding boxes for a given object.
Args:
boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
scores: (tensor) The class predscores for the img, Shape:[num_priors].
score_threshold: (float) The confidence thresh for filtering low
confidence boxes.
nms_threshold: (float) The overlap thresh for suppressing unnecessary
boxes.
top_k: (int) The maximum number of box preds to consider.
eta: (float) The parameter for adaptive NMS.
Return:
The indices of the kept boxes with respect to num_priors.
"""
all_scores = copy.deepcopy(scores)
all_scores = all_scores.flatten()
selected_indices = np.argwhere(all_scores > score_threshold)
selected_indices = selected_indices.flatten()
all_scores = all_scores[selected_indices]
sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
sorted_scores = all_scores[sorted_indices]
if top_k > -1 and top_k < sorted_indices.shape[0]:
sorted_indices = sorted_indices[:top_k]
sorted_scores = sorted_scores[:top_k]
selected_indices = []
adaptive_threshold = nms_threshold
for i in range(sorted_scores.shape[0]):
idx = sorted_indices[i]
keep = True
for k in range(len(selected_indices)):
if keep:
kept_idx = selected_indices[k]
overlap = iou(boxes[idx], boxes[kept_idx])
keep = True if overlap <= adaptive_threshold else False
else:
break
if keep:
selected_indices.append(idx)
if keep and eta < 1 and adaptive_threshold > 0.5:
adaptive_threshold *= eta
return selected_indices
def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
nms_top_k, keep_top_k):
class_num = scores.shape[0]
priorbox_num = scores.shape[1]
selected_indices = {}
num_det = 0
for c in range(class_num):
if c == background: continue
indices = nms(boxes, scores[c], score_threshold, nms_threshold,
nms_top_k)
selected_indices[c] = indices
num_det += len(indices)
if keep_top_k > -1 and num_det > keep_top_k:
score_index = []
for c, indices in selected_indices.iteritems():
for idx in indices:
score_index.append((scores[c][idx], c, idx))
sorted_score_index = sorted(
score_index, key=lambda tup: tup[0], reverse=True)
sorted_score_index = sorted_score_index[:keep_top_k]
selected_indices = {}
for _, c, _ in sorted_score_index:
selected_indices[c] = []
for s, c, idx in sorted_score_index:
selected_indices[c].append(idx)
num_det = keep_top_k
return selected_indices, num_det
def batched_multiclass_nms(boxes, scores, background, score_threshold,
nms_threshold, nms_top_k, keep_top_k):
batch_size = scores.shape[0]
det_outs = []
lod = [0]
for n in range(batch_size):
nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
score_threshold, nms_threshold,
nms_top_k, keep_top_k)
lod.append(lod[-1] + nmsed_num)
if nmsed_num == 0: continue
for c, indices in nmsed_outs.iteritems():
for idx in indices:
xmin, ymin, xmax, ymax = boxes[idx][:]
det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
return det_outs, lod
class TestMulticlassNMSOp(OpTest):
def set_argument(self):
self.score_threshold = 0.01
def setUp(self):
self.set_argument()
N = 7
M = 1200
C = 21
BOX_SIZE = 4
background = 0
nms_threshold = 0.3
nms_top_k = 400
keep_top_k = 200
score_threshold = self.score_threshold
scores = np.random.random((N * M, C)).astype('float32')
def softmax(x):
shiftx = x - np.max(x).clip(-64.)
exps = np.exp(shiftx)
return exps / np.sum(exps)
scores = np.apply_along_axis(softmax, 1, scores)
scores = np.reshape(scores, (N, M, C))
scores = np.transpose(scores, (0, 2, 1))
boxes = np.random.random((M, BOX_SIZE)).astype('float32')
boxes[:, 0:2] = boxes[:, 0:2] * 0.5
boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
score_threshold, nms_threshold,
nms_top_k, keep_top_k)
nmsed_outs = [-1] if not nmsed_outs else nmsed_outs
nmsed_outs = np.array(nmsed_outs).astype('float32')
self.op_type = 'multiclass_nms'
self.inputs = {'BBoxes': boxes, 'Scores': scores}
self.outputs = {'Out': (nmsed_outs, [lod])}
self.attrs = {
'background_label': 0,
'nms_threshold': nms_threshold,
'nms_top_k': nms_top_k,
'keep_top_k': keep_top_k,
'score_threshold': score_threshold,
'nms_eta': 1.0,
}
def test_check_output(self):
self.check_output()
class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
def set_argument(self):
# Here set 2.0 to test the case there is no outputs.
# In practical use, 0.0 < score_threshold < 1.0
self.score_threshold = 2.0
class TestIOU(unittest.TestCase):
def test_iou(self):
box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
expt_output = np.array([2.0 / 16.0]).astype('float32')
calc_output = np.array([iou(box1, box2)]).astype('float32')
self.assertTrue(np.allclose(calc_output, expt_output))
if __name__ == '__main__':
unittest.main()
...@@ -19,6 +19,7 @@ import paddle.v2.fluid.layers as layers ...@@ -19,6 +19,7 @@ import paddle.v2.fluid.layers as layers
import numpy import numpy
from multiprocessing import Process from multiprocessing import Process
import os, sys import os, sys
import time
class TestRecvOp(unittest.TestCase): class TestRecvOp(unittest.TestCase):
...@@ -28,6 +29,7 @@ class TestRecvOp(unittest.TestCase): ...@@ -28,6 +29,7 @@ class TestRecvOp(unittest.TestCase):
p = Process(target=self.init_serv, args=(place, )) p = Process(target=self.init_serv, args=(place, ))
p.daemon = True p.daemon = True
p.start() p.start()
time.sleep(1)
self.init_client(place) self.init_client(place)
# FIXME(typhoonzero): find a way to gracefully shutdown the server. # FIXME(typhoonzero): find a way to gracefully shutdown the server.
os.system("kill -9 %d" % p.pid) os.system("kill -9 %d" % p.pid)
......
...@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase): ...@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase):
scope = core.Scope() scope = core.Scope()
place = core.CPUPlace() place = core.CPUPlace()
lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor(lod_py) lod_tensor = core.LoDTensor()
lod_tensor.set_dims([5, 2, 3, 4]) lod_tensor.set_dims([5, 2, 3, 4])
lod_tensor.set_lod(lod_py)
lod_tensor.alloc_float(place)
tensor_array = numpy.array(lod_tensor)
tensor_array[0, 0, 0, 0] = 1.0
tensor_array[0, 0, 0, 1] = 2.0
lod_tensor.set(tensor_array, place)
lod_v = numpy.array(lod_tensor)
self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod())
def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda():
return
scope = core.Scope()
place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor()
lod_tensor.set_dims([5, 2, 3, 4])
lod_tensor.set_lod(lod_py)
lod_tensor.alloc_float(place) lod_tensor.alloc_float(place)
tensor_array = numpy.array(lod_tensor) tensor_array = numpy.array(lod_tensor)
tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 0] = 1.0
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册