Commit a529d790 authored by typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gen_nccl_id_op

@@ -16,34 +16,14 @@ env:
- JOB=check_style
- JOB=build_android
addons:
apt:
packages:
- gcc-4.8
- g++-4.8
- git
- build-essential
- python
- python-pip
- python2.7-dev
- python-wheel
- libboost-dev
- curl
- swig
- graphviz
- clang-format-3.8
- automake
- libtool
- ccache
ssh_known_hosts: 13.229.163.131
before_install:
- sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
# 43min timeout
-if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
+paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
......
@@ -75,19 +75,19 @@ We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/g
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
You can run distributed training jobs on MPI clusters.
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
You can also run distributed training jobs on Kubernetes clusters.
-- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
Our new API enables much shorter programs.
-- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
We appreciate your contributions!
......
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
@@ -200,18 +200,19 @@ def main():
num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size)
print(
-"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-% (pass_id, iters, loss, acc,
-len(data) / (time.time() - ts))
+"Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+"Speed = %.2f img/s " % (args.task_index, pass_id, iters,
+loss, acc,
len(data) / (time.time() - ts))
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = train_pass_acc.eval()
pass_test_acc = test(exe)
-print(
-"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-% (pass_id, num_samples / pass_elapsed, pass_train_acc,
-pass_test_acc))
+print("Task:%d Pass = %d, Training performance = %f imgs/s, "
+"Train accuracy = %f, Test accuracy = %f\n" %
+(args.task_index, pass_id, num_samples / pass_elapsed,
+pass_train_acc, pass_test_acc))
if args.local:
# Parameter initialization
@@ -239,8 +240,6 @@ def main():
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
trainer_id=args.task_index,
pservers=args.ps_hosts,
trainers=trainers)
......
# float16 benchmark
## Description
We want to compare the inference performance of float16 vs. float32 on the "image_classification" example on an Nvidia Tesla V100 GPU, where tensor core computation can be enabled in float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add the inference benchmark of Vgg16 and Resnet50 on the imagenet data set tested on an Nvidia GeForce GTX 1080 Ti GPU.
For more details about tensor core, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
## Test environment
- GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti
- CUDNN: 7.1.1
- CUDA: 9.0
- Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (Tensor core is enabled in float16 mode)
## Benchmark on V100
All times are in ms (milliseconds), averaged over 1000 iterations, tested on a single Nvidia V100 GPU with respect to different mini-batch (mb) sizes.
### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
Total inference time for one batch:
| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 |
|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
|Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 |
Total time spent on conv op for one batch:
| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 |
|float16| 1.78 | 2.10 | 2.93 | 4.55 | 7.99 | 14.63 | 28.67 |
|Speedup| 6.71 | 3.31  | 6.37 | 4.71 | 5.18  | 4.14 | 4.54 |
### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
Total inference time for one batch:
|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 |
|float16| 6.13 | 6.32 | 6.24 | 7.40 | 10.90 | 18.18 | 33.20 | 64.52 |
|Speedup| 1.15 | 1.17  | 1.47  | 1.70 | 1.94  | 2.11 | 2.05 | 1.97 |
Total time spent on conv op for one batch:
|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44 |
|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63 | 8.77 | 15.24 | 28.40 |
|Speedup| 1.30 | 1.27  | 1.64  | 1.99 | 2.45  | 2.79 | 2.70 | 2.59 |
### Vgg16 on cifar10 (image.shape = [3, 32, 32]):
Total inference time for one batch:
| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
|float32| 3.13 | 3.17 | 3.19 | 3.58 | 3.98 | 6.23 | 8.42 | 13.44 | 24.19 | 44.97 |
|float16| 2.72 | 2.77 | 2.76 | 2.88 | 2.96 | 3.24 | 4.01 | 5.78 | 9.65 | 17.37 |
|Speedup| 1.15 | 1.14 | 1.16 | 1.24 | 1.34 | 1.92  | 2.10 | 2.33  | 2.51 | 2.59 |
### Resnet32 on cifar10 (image.shape = [3, 32, 32]):
Total inference time for one batch:
| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
|float32| 3.11 | 3.14 | 2.99 | 3.04 | 3.10 | 3.28 | 4.47 | 6.86 | 11.63 | 21.16 |
|float16| 3.70 | 3.81 | 3.75 | 3.83 | 3.77 | 3.97 | 3.92 | 4.15 | 6.41 | 11.02 |
|Speedup|     |     |     |     |       | | 1.14  | 1.65 | 1.81 | 1.92 |
## Benchmark on 1080 Ti
All times are in ms (milliseconds), averaged over 1000 iterations, tested on a single Nvidia GeForce GTX 1080 Ti GPU with respect to different mini-batch (mb) sizes.
### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
Total inference time for one batch:
| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 |
|-------|-----: |-----: |-----: |-----: |------: |-------:|
|float32| 5.60 | 9.38 | 15.86 | 29.79 | 57.60 | 117.73 |
|float16| 4.99 | 7.79 | 13.47 | 26.02 | 52.30 | 102.34 |
|Speedup| 1.12 | 1.20  | 1.18 | 1.15 | 1.10  | 1.15 |
### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
Total inference time for one batch:
| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
|-------|-----: |-----: |-----: |-----: |------: |-------:|-------:|
|float32| 5.63 | 6.23 | 8.85 | 14.71 | 26.07 | 52.86 | 108.95 |
|float16| 5.89 | 6.44 | 7.94 | 12.57 | 22.03 | 45.06 | 92.68 |
|Speedup| |  | 1.12  | 1.17 | 1.18  | 1.17 | 1.18 |
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from float16_transpiler import Float16Transpiler
import argparse
import paddle
import paddle.fluid as fluid
import contextlib
import math
import sys
import numpy as np
import os
parser = argparse.ArgumentParser(
description='Float16 inference accuracy test and benchmark.')
parser.add_argument(
'--train_batch_size', type=int, default=16, help="Batch size for training.")
parser.add_argument(
'--inf_batch_size', type=int, default=32, help="Batch size for inference.")
parser.add_argument(
'--repeat', type=int, default=1, help="How many times to run the test.")
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'imagenet'],
help="Optional dataset for benchmark.")
parser.add_argument(
'--model',
type=str,
default='vgg',
choices=['vgg', 'resnet'],
help="Optional model for benchmark.")
parser.add_argument(
'--threshold',
type=float,
default=0.005,
help='Save inference model when test accuracy reach this threshold.')
parser.add_argument('--learning_rate', type=float, default=0.001)
args = parser.parse_args()
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act)
def shortcut(input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out:
return conv_bn_layer(input, ch_out, 1, stride, 0, None)
else:
return input
def basicblock(input, ch_out, stride):
short = shortcut(input, ch_out, stride)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride):
short = shortcut(input, ch_out * 4, stride)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input, depth=50):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
return pool2
def resnet_cifar10(input, depth=32):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
return pool
def vgg16(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
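# Train the selected model on the selected data set; once the test accuracy
# exceeds args.threshold, save a float32 inference model to save_dirname.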
def train(place, save_dirname):
if args.data_set == "cifar10":
class_dim = 10
data_shape = [3, 32, 32]
elif args.data_set == "imagenet":
class_dim = 102
data_shape = [3, 224, 224]
else:
raise ValueError("%s dataset is not supported" % args.data_set)
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.model == "vgg":
print("train vgg")
net = vgg16(images)
elif args.model == "resnet":
print("train resnet")
if args.data_set == "cifar10":
net = resnet_cifar10(images)
elif args.data_set == "imagenet":
net = resnet_imagenet(images)
else:
raise ValueError("%s dataset is not supported" % args.data_set)
else:
raise ValueError("%s network is not supported" % args.model)
predict = fluid.layers.fc(input=net, size=class_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=predict, label=label)
#Test program
test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
BATCH_SIZE = args.train_batch_size
PASS_NUM = 100
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.flowers.train()
if args.data_set == 'imagenet' else paddle.dataset.cifar.train10(),
buf_size=128 * 10),
batch_size=args.train_batch_size)
test_reader = paddle.batch(
paddle.dataset.flowers.test()
if args.data_set == 'imagenet' else paddle.dataset.cifar.test10(),
batch_size=args.inf_batch_size)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program())
main_program = fluid.default_main_program()
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
train_image = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype("float32")
train_label = np.array(map(lambda x: x[1], data)).astype("int64")
train_label = train_label.reshape([-1, 1])
exe.run(main_program,
feed={'pixel': train_image,
'label': train_label})
if (batch_id % 100) == 0:
acc_list = []
avg_loss_list = []
for tid, test_data in enumerate(test_reader()):
test_image = np.array(
map(lambda x: x[0].reshape(data_shape),
test_data)).astype("float32")
test_label = np.array(map(lambda x: x[1],
test_data)).astype("int64")
test_label = test_label.reshape([-1, 1])
loss_t, acc_t = exe.run(
program=test_program,
feed={"pixel": test_image,
"label": test_label},
fetch_list=[avg_cost, acc])
if math.isnan(float(loss_t)):
sys.exit("got NaN loss, training failed.")
acc_list.append(float(acc_t))
avg_loss_list.append(float(loss_t))
acc_value = np.array(acc_list).mean()
avg_loss_value = np.array(avg_loss_list).mean()
print(
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Accuracy {3:2.2}'.
format(pass_id, batch_id + 1,
float(avg_loss_value), float(acc_value)))
if acc_value > args.threshold:
print(
'Save inference model with test accuracy of {0} at {1}'.
format(float(acc_value), save_dirname))
fluid.io.save_inference_model(save_dirname, ["pixel"],
[predict], exe)
return
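# Measure the classification accuracy of the given inference program on the test set.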
def test_accuracy(executor, inference_program, feed_target_names,
fetch_targets):
if args.data_set == "cifar10":
data_shape = [3, 32, 32]
elif args.data_set == "imagenet":
data_shape = [3, 224, 224]
else:
raise ValueError("%s dataset is not supported" % args.data_set)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == "cifar10" else paddle.dataset.flowers.test(),
batch_size=args.inf_batch_size)
test_num = 0
correct_num = 0
for test_data in test_reader():
test_image = np.array(
map(lambda x: x[0].reshape(data_shape), test_data)).astype(
"float32")
test_label = np.array(map(lambda x: x[1], test_data)).astype("int64")
test_label = test_label.reshape([-1, 1])
results = executor.run(program=inference_program,
feed={feed_target_names[0]: test_image},
fetch_list=fetch_targets)
prediction = np.argmax(results[0], axis=1).reshape([-1, 1])
correct_num += np.sum(prediction == test_label)
test_num += test_label.size
print("{0} out of {1} predictions are correct.".format(correct_num,
test_num))
print("Test accuray is {0}.".format(float(correct_num) / float(test_num)))
def infer(place, save_dirname):
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
print("Load inference model from {0}".format(save_dirname))
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
print("The test set accuracy of inference in float mode is:")
test_accuracy(exe, inference_program, feed_target_names, fetch_targets)
float16_inference_program = inference_program.clone()
t = Float16Transpiler()
t.transpile(float16_inference_program, place)
print("The test set accuracy of inference in float16 mode is:")
test_accuracy(exe, float16_inference_program, feed_target_names,
fetch_targets)
fp16_save_dirname = "float16_" + save_dirname
fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
fetch_targets, exe,
float16_inference_program)
@contextlib.contextmanager
def scope_prog_guard():
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == "__main__":
if not fluid.core.is_compiled_with_cuda():
raise Exception("This test requires CUDA GPUs!")
place = fluid.CUDAPlace(0)
if not fluid.core.is_float16_supported(place):
raise Exception(
"This test requires compute capability of CUDA GPU >= 5.3!")
for i in range(args.repeat):
with scope_prog_guard():
save_dirname = "image_classification_" + args.data_set + "_" + args.model + ".inference.model"
train(place, save_dirname)
infer(place, save_dirname)
## Introduction
Working with deep neural networks (DNN) is a two-stage process. First we train a DNN using labeled examples of inputs and desired outputs to obtain the model parameters (weights); then we deploy the DNN along with the trained weights to run inference on unknown inputs. Typically, these weights are of float data type, and hence we run inference in float mode using them. This post discusses how to use the low-precision float16 data type to represent these trained weights and run inference in float16 mode, as well as the advantages of float16 inference over its float counterpart, as shown by some experiment results.
## What is float16?
float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over the 32-bit single-precision floating-point format (commonly known as the float data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy is only minimally affected in most cases. This gives us the opportunity to use the float16 data type to speed up inference.
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
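As a quick illustration (a minimal NumPy sketch, independent of the Fluid implementation), casting a float32 weight tensor to float16 halves its memory footprint at the cost of a small rounding error:
```python
import numpy as np

# A float32 weight tensor and its float16 copy: half the bytes, with a
# rounding error that is usually small enough for inference workloads.
w32 = np.random.rand(1024, 1024).astype(np.float32)
w16 = w32.astype(np.float16)

print(w32.nbytes, w16.nbytes)                       # 4194304 vs 2097152 bytes
print(np.abs(w32 - w16.astype(np.float32)).max())   # worst-case rounding error
```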
## Why float16?
The trend in today's deep learning community is to use bigger and deeper models. This translates to a larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float are correspondingly three-fold:
1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of float16 data type. This makes the whole memory footprint of float16 inference roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. Also, given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
2. Because float16 occupies less memory than float, in theory hardware devices can achieve much higher floating point operations per second (FLOPS) for float16 data than for float data. Right now, an outstanding example of hardware devices that actually deliver this advantage is Nvidia's latest Volta architecture GPUs, including Tesla V100 and Titan V. Moreover, float16 takes less time to read from or write to memory, so float16 can make inference more efficient, especially in memory-bound applications where performance is largely determined by how fast data can be read and written.
3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than for its float counterpart, which can significantly reduce the battery power consumption of mobile devices or the total cost of ownership (TCO) of data centers.
## Fluid implementation of float16 inference
### Overview
Fluid uses a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of a computation graph to describe a neural network model and the optimization procedure. A Fluid program is a Python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
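As a rough sketch of this structure (using the same pre-2.0 `fluid` API as the demo code in this directory; details may differ across Paddle versions), layer calls append variables and operators to the default program, which an executor then runs:
```python
import numpy as np
import paddle.fluid as fluid

# Each layer call appends variables and operators to the default main program.
x = fluid.layers.data(name='x', shape=[4], dtype='float32')
y = fluid.layers.fc(input=x, size=2, act='softmax')

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())   # initialize the fc parameters

out, = exe.run(fluid.default_main_program(),
               feed={'x': np.random.rand(1, 4).astype('float32')},
               fetch_list=[y])
print(out)
```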
### Basic requirement
When an operator is run by an executor, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types. The operator selects the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for the float data type that takes float inputs and generates float outputs.
This means that if we provide float input to the first operator in a program, then each operator will use its float kernel to compute a float output and send it as input to the next operator, triggering that operator's float kernel in turn. This chain effect makes the program run in float mode and gives us a final output of float data type.
The same principle applies if we want a program to run in float16 mode. We provide an input variable of float16 data type to the first operator, and every subsequent operator will invoke its float16 kernel until we get the final output in float16 data type. So the preliminary requirement for float16 inference is to add float16 kernels to the operators needed by a specific kind of neural network. Our current focus is on Convolutional Neural Networks (CNN), and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
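This chain effect can be mimicked with plain NumPy (an analogy only, not Fluid code): once the input and weights are float16, each operation produces float16 outputs that in turn feed float16 computation downstream:
```python
import numpy as np

x = np.random.rand(8, 16).astype(np.float16)   # float16 input
w = np.random.rand(16, 4).astype(np.float16)   # float16 weights

h = x.dot(w)              # float16 in, float16 out
y = np.maximum(h, 0)      # relu keeps the float16 dtype
print(h.dtype, y.dtype)   # float16 float16
```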
### float16 transpiler
Furthermore, we need a float16 transpiler to support the following usage:
```python
# Get the float32 inference program and load the associated float32 weights
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# Prepare the float input data
batch_size = 1
tensor_img = np.random.rand(batch_size, 3, 32, 32).astype(np.float32)
# Running inference_program in float mode
float_results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
# Use the float16 transpiler to speed it up
float16_inference_program = inference_program.clone()
t = Float16Transpiler()
t.transpile(float16_inference_program, GPUPlace)
# Running float16_inference_program in float16 mode using the same input data
float16_results = exe.run(float16_inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
# Do some tests to verify the correctness of float16 inference
...
np.testing.assert_almost_equal(float_results, float16_results, ...)
...
# Save the float16 inference program and float16 weights for future deployment
fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
fetch_targets, exe,
float16_inference_program)
```
In this scenario, we already have a float32 inference program and some associated float32 weights that can do float32 inference. We can easily use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance usability, we maintain a consistent API so that users can feed the same float32 input data to run the inference program in either float32 or float16 mode and obtain float32 output data in both cases. This requires us to add some cast operators to the program to convert between float16 and float32 tensors.
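A NumPy analogy of what the inserted cast operators do (an illustration only; in the real program these are cast operators added by the transpiler): the float32 input is cast down to float16 before the float16 computation, and the float16 result is cast back up, so callers still see float32 tensors on both ends:
```python
import numpy as np

def float16_compute_with_casts(x32, w16):
    x16 = x32.astype(np.float16)      # cast op: float32 input -> float16
    y16 = x16.dot(w16)                # the actual computation runs in float16
    return y16.astype(np.float32)     # cast op: float16 output -> float32

x32 = np.random.rand(1, 8).astype(np.float32)
w16 = np.random.rand(8, 4).astype(np.float16)
print(float16_compute_with_casts(x32, w16).dtype)   # float32
```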
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
### Experiment results
We provide demo codes that can be used to reproduce the experiment results by doing:
```bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# This line will generate a paddle development docker image with cuda 8 and cudnn 7
# If you want to test on cuda 9 instead, change line 5 in Paddle/Dockerfile
# from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
```
#### Correctness
As mentioned before, DNN inference has been found to be tolerant of the loss of precision and range incurred by float16, and we want to see how good this tolerance is.
We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
We repeat the test ten times and get the following results:
| | float16 | float32 |
|--------|--------:|--------: |
| # 1 | 62.75% | 62.72% |
| # 2 | 61.27% | 61.28% |
| # 3 | 62.24% | 62.23% |
| # 4 | 64.16% | 64.17% |
| # 5 | 60.75% | 60.77% |
| # 6 | 63.25% | 63.24% |
| # 7 | 62.15% | 62.13% |
| # 8 | 62.05% | 62.02% |
| # 9 | 65.19% | 65.20% |
| #10 | 62.53% | 62.48% |
| average| 62.63% | 62.62% |
We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over 10 tests.
#### Performance benchmark
Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart.
Nvidia started to support its native float16 data type (which has the same internal memory representation as the Fluid float16 class) in CUDA 7.5. Moreover, float16 speedups on common computationally intensive tasks, including GEMM (general matrix-matrix multiplication) and convolution, have been available since cuBLAS 7.5 and cuDNN 5.0.
Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7 make float16 truly superior to float in certain deep learning applications.
We thus benchmark the float16 inference performance on a single Nvidia Tesla V100 GPU (Volta architecture, with tensor cores) and compare it with its float32 counterpart. All the following results are in ms (milliseconds) averaged over 1000 mini-batches with respect to different mini-batch (mb) sizes.
Average inference time for one mini-batch on Vgg16 model tested on imagenet data set:
| total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 |
|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
|Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 |
We can see that float16 inference provides 2x ~ 4x speedup on different batch sizes.
The convolution operation is usually the computational bottleneck of a CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch, as follows:
|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 |
|float16| 1.78 | 2.10 | 2.93 | 4.55 | 7.99 | 14.63 | 28.67 |
|Speedup| 6.71 | 3.31  | 6.37 | 4.71 | 5.18  | 4.14 | 4.54 |
The Fluid convolution operator uses cuDNN 7 to implement its kernel, and we can see that with the help of tensor cores, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
Similarly, we also list the benchmark results of Resnet50 model tested on imagenet data set:
| total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 |
|float16| 6.13 | 6.32 | 6.24 | 7.40 | 10.90 | 18.18 | 33.20 | 64.52 |
|Speedup| 1.15 | 1.17  | 1.47  | 1.70 | 1.94  | 2.11 | 2.05 | 1.97 |
|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44 |
|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63 | 8.77 | 15.24 | 28.40 |
|Speedup| 1.30 | 1.27  | 1.64  | 1.99 | 2.45  | 2.79 | 2.70 | 2.59 |
We find that the speedup provided by float16 inference starts out relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now tensor cores are only utilized in the convolution operation when certain dimensional requirements are met by the input data and filter. The speedup of float16 inference for Resnet50 is smaller than for Vgg16 partially because the convolution operations in Resnet are much simpler than their Vgg counterparts, which leaves the tensor cores less utilized in Resnet than in Vgg.
We also ran the same benchmark on an Nvidia GeForce GTX 1080 Ti GPU, which does not have tensor cores. The results show that for Vgg16, float16 inference provides a consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart at small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on the 1080 Ti and the V100, we find that tensor cores, which are specialized for float16 computation, are a critical component of high-performance float16 inference.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for comprehensive benchmark results.
### Summary
1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
2. The accuracy of float16 inference is verified to be almost identical to the float32 counterpart at least on CNNs.
3. float16 inference provides significant speedup on the large and computationally intensive Vgg16 network on the imagenet data set. For the much smaller and simpler Resnet50, the speedup provided by float16 inference is less significant than on Vgg16 but still favorable, especially for large batch sizes.
4. We cannot achieve the superior float16 inference performance without the help of the newly introduced tensor cores on the Nvidia Volta architecture GPUs.
@@ -13,115 +13,13 @@
# limitations under the License.
import numpy as np
-from framework import Program
-from executor import global_scope
-from . import core
+import paddle.fluid.core as core
+from paddle.fluid.framework import Program
+from paddle.fluid.executor import global_scope
-class InferenceTranspiler:
+class Float16Transpiler:
def transpile(self, program, place, scope=None):
'''
Transpile the program. Support only fuse batch normalization now.
:param program: program to transpile
:type program: Program
:param place: inference place
:type place: Place
:param scope: inference scope
:type scope: Scope or None
'''
if not isinstance(program, Program):
raise TypeError("program should be as Program type")
if not isinstance(place, core.CPUPlace) and not isinstance(
place, core.CUDAPlace):
raise TypeError("place should be as CPUPlace/CUDAPlace type")
if scope is None:
scope = global_scope()
if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope)
def fuse_batch_norm(self, program, place, scope):
'''
Transpile the program by fused batch normalization.
The batch normalization followed the convolution or fully connected layer
can be integrated with them. Doing so will give us a forward acceleration,
especially in environments like mobile or embedded.
For input X:
- Conv process: X = input * W + bias
- Batch norm process: X' = (X - mean) / std
- Scale Process: Y = a * X' + b
After fuse into one operation:
Y = (input * W + bias - mean) / std * a + b
= input * a * W / std + ((bias - mean) / std * a + b)
The operator transformation is:
- before:
- conv->batch_norm->any_other_op (bias == 0)
- conv->elementwise_add->batch_norm->any_other_op (bias != 0)
- after:
- conv->elementwise_add->any_other_op
The transpile stages are:
1. insert elementwise_add op when bias == 0.
2. fuse the batch_norm's parameters to conv and elementwise_add operators.
3. remove batch_norm ops which are not used in any other ops.
4. adjust the input of any_other_op to be the output of elementwise_add operator.
5. remove unused variables.
:param program: program to transpile
:type program: Program
:param place: inference place
:type place: Place
:param scope: inference scope
:type scope: Scope
'''
self.scope = scope
self.place = place
self.block = program.block(0)
self.input_map = {} # store the input names should be adjusted
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
# TODO(luotao1): consider only conv2d now. fc would be dealt with later.
if current_op.type in ['conv2d']:
# TODO(luotao1): consider single chain network now.
# For branch networks, we couldn't use block.ops[i + 1] as
# the judgment condition.
next_op = self.block.ops[i + 1]
# conv2d without bias
if (next_op.type == 'batch_norm'):
# insert bias op
bias_op = self._insert_bias_op(i + 1, current_op, next_op)
# fuse batch_norm
self._fuse_param(current_op, next_op, bias_op, 0)
# remove batch_norm_op
self.block.remove_op(i + 2)
i = i + 1
# conv2d with bias, the next_op.type is elementwise_add
elif (next_op.type == 'elementwise_add'):
next_next_op = self.block.ops[i + 2]
if (next_next_op.type == 'batch_norm'):
# fuse batch_norm
self._fuse_param(current_op, next_next_op, next_op, 1)
# remove batch_norm_op
self.block.remove_op(i + 2)
i = i + 1
i = i + 1
self._adjust_input()
self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
def float16_transpile(self, program, place, scope=None):
'''
Transpile the program desc and cast the weights to float16 data type to
enable float16 inference.
@@ -155,8 +53,15 @@ class InferenceTranspiler:
:param scope: inference scope
:type scope: Scope
'''
if not isinstance(program, Program):
raise TypeError("program should be as Program type")
if not isinstance(place, core.CPUPlace) and not isinstance(
place, core.CUDAPlace):
raise TypeError("place should be as CPUPlace/CUDAPlace type")
if scope is None:
scope = global_scope()
if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None")
self.scope = scope
self.place = place
@@ -174,101 +79,6 @@
program = program.clone()
# ====================== private transpiler functions =====================
def _insert_bias_op(self, index, current_op, bn_op):
'''
Construct elementwise_add operator for adding bias
and insert it into program.
:param index: insert location of bias_op
:type index: Int
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:return: bias_op
:rtype: Operator
'''
# The input of bias_op is current_op's output and Bias of bn_op
# The output of bias_op is bn_op's output
x_var = self.block.var(current_op.output("Output")[0])
y_var = self.block.var(bn_op.input("Bias")[0])
out_var = self.block.var(bn_op.output("Y")[0])
bias_op = self.block.insert_op(
index,
type="elementwise_add",
inputs={"X": x_var,
"Y": y_var},
outputs={"Out": out_var},
attrs={"axis": 1}) # dim_start=1
return bias_op
def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
'''
fuse the batch_norm_op' parameters to current_op (conv or fc)
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:param bias_op: elementwise_add operator for adding bias
:type bias_op: Operator
:param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
:type with_bias: Int
'''
def _update_param(op, old_param_name, new_param):
# For the sake of remaining the original variables the same as before,
# create new variables in scope to store the new parameters.
old_param_name = old_param_name[0]
old_var = self.block.vars[old_param_name]
new_param_name = old_param_name + '_fuse_bn'
new_var = self.block.create_parameter(
name=new_param_name.encode('ascii'),
type=old_var.type,
dtype=old_var.dtype,
shape=old_var.shape)
op.rename_input(old_param_name, new_param_name)
self.scope.var(new_param_name)
tensor = self.scope.find_var(new_param_name).get_tensor()
tensor.set(np.array(new_param), self.place)
def _load_param(param_name):
return np.array(self.scope.find_var(param_name[0]).get_tensor())
bias_bn = _load_param(bn_op.input("Bias")) #Bias
scale_bn = _load_param(bn_op.input("Scale")) #Scale
mean_bn = _load_param(bn_op.input("Mean")) #Mean
var_bn = _load_param(bn_op.input("Variance")) #Variance
# TODO(luotao1): consider only conv2d now. fc would be dealt with later.
current_param = _load_param(current_op.input("Filter"))
std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
tmp = np.float32(np.divide(scale_bn, std_bn))
# add bias of batch_norm_op to conv2d
if with_bias:
bias = _load_param(bias_op.input("Y"))
else:
bias = np.zeros(bias_bn.shape)
bias = np.float32(
np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
# re-compute weight of conv2d
tmp = tmp.reshape(tmp.shape[0], -1)
dst_param = current_param.reshape((tmp.shape[0], -1))
dst_param = np.float32(np.multiply(dst_param, tmp))
dst_param = dst_param.reshape(current_param.shape)
# update parameters
_update_param(current_op, current_op.input("Filter"), dst_param)
_update_param(bias_op, bias_op.input("Y"), bias)
# collect the renamed input
self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
def _adjust_input(self, skip=False):
'''
Change the input variable name in operators.
......
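A side note on the `fuse_batch_norm` pass of the old `InferenceTranspiler` shown above (it is dropped from the new `Float16Transpiler`): its fusion algebra can be checked numerically with a small NumPy sketch, treating the convolution as a plain linear map (an illustration only, not transpiler code):
```python
import numpy as np

np.random.seed(0)
x = np.random.rand(5, 8).astype(np.float32)    # "input"
W = np.random.rand(8, 4).astype(np.float32)    # conv weight, viewed as a linear map
bias = np.random.rand(4).astype(np.float32)
mean, var = np.random.rand(4), np.random.rand(4) + 0.5
a, b = np.random.rand(4), np.random.rand(4)    # batch-norm scale and shift
std = np.sqrt(var + 1e-5)

# Original three steps: conv -> normalize -> scale and shift
y_ref = a * ((x.dot(W) + bias - mean) / std) + b

# Fused single step: the new weight and bias absorb the batch-norm parameters
W_fused = W * (a / std)                        # broadcast over output channels
bias_fused = (bias - mean) / std * a + b
y_fused = x.dot(W_fused) + bias_fused

print(np.allclose(y_ref, y_fused))             # True
```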
#!/bin/bash
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
DEMO_PATH=/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
# Build the PaddlePaddle Fluid wheel package and install it.
mkdir -p $BUILD_PATH && cd $BUILD_PATH
cmake .. -DWITH_AVX=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=ON \
-DWITH_TESTING=ON \
-DWITH_TIMER=ON \
-DWITH_PROFILER=ON \
-DWITH_FLUID_ONLY=ON
make -j `nproc`
pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
cd $DEMO_PATH
# Clear previous log results
rm -f *.log
# Test the float16 inference accuracy of resnet32 on cifar10 data set
stdbuf -oL python float16_inference_demo.py \
--data_set=cifar10 \
--model=resnet \
--threshold=0.6 \
--repeat=10 \
2>&1 | tee -a float16_inference_accuracy.log
# Sleep to cool down the GPU for consistent benchmarking
sleep 2m
# benchmarking parameters
REPEAT=1000
MAXIMUM_BATCH_SIZE=512
for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2));
do
# Test inference benchmark of vgg16 on imagenet
stdbuf -oL python float16_inference_demo.py \
--data_set=imagenet \
--model=vgg \
--threshold=0.001 \
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
--batch_size=$batch_size \
--skip_cpu=true \
2>&1 | tee -a imagenet_vgg16_benchmark.log
sleep 2m
# Test inference benchmark of resnet50 on imagenet
stdbuf -oL python float16_inference_demo.py \
--data_set=imagenet \
--model=resnet \
--threshold=0.001 \
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
--data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
--batch_size=$batch_size \
--skip_cpu=true \
2>&1 | tee -a imagenet_resnet50_benchmark.log
sleep 2m
# Test inference benchmark of vgg16 on cifar10
stdbuf -oL python float16_inference_demo.py \
--data_set=cifar10 \
--model=vgg \
--threshold=0.001 \
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
--batch_size=$batch_size \
--skip_cpu=true \
2>&1 | tee -a cifar10_vgg16_benchmark.log
sleep 1m
# Test inference benchmark of resnet32 on cifar10
stdbuf -oL python float16_inference_demo.py \
--data_set=cifar10 \
--model=resnet \
--threshold=0.001 \
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
--data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
--batch_size=$batch_size \
--skip_cpu=true \
2>&1 | tee -a cifar10_resnet32_benchmark.log
sleep 1m
done
# Parallelism, Asynchronous, Synchronous, Codistillation
For valuable models, it's worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results, and some recent research.
# Model Parallelism
In some situations, larger and more complex models can improve model quality. Sometimes such models cannot fit on one device; sometimes parts of the model can be executed in parallel to improve speed. Model parallelism addresses these issues by partitioning a single model and placing the shards on several devices for execution.
A common form of model parallelism is to partition the logic of "gradient application" onto parameter servers, while leaving the forward and backward computation on the training servers.
More flexible model parallelism is challenging. For example, a multi-layer single-direction LSTM can be partitioned by layers, while such a solution does not help for a bi-directional LSTM. Different models can have quite different ways of partitioning, and the benefits also depend on the underlying hardware. The framework needs to provide flexible APIs for users to define customized partition schemes. For example, in TensorFlow, users can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does something similar. Recent research proposes to automatically find the optimal partition scheme with reinforcement learning, which is essentially a solution-space search algorithm that can cost a lot of extra hardware resources.
# Data Parallelism
Data parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It is more commonly used, for a few reasons: it generally applies to common SGD mini-batch training, and compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API, and the speedup is more predictable.
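A tiny NumPy illustration (a pseudo-experiment, not PaddlePaddle API) of why this drops in so easily for SGD: splitting a mini-batch across devices and averaging the per-shard gradients reproduces the single-device mini-batch gradient:
```python
import numpy as np

np.random.seed(0)
X = np.random.rand(32, 4)   # one mini-batch of inputs
y = np.random.rand(32)      # targets
w = np.random.rand(4)       # parameters of a linear model

def grad(Xb, yb, w):
    # gradient of the mean squared error 0.5 * mean((Xb @ w - yb) ** 2)
    return Xb.T.dot(Xb.dot(w) - yb) / len(yb)

full = grad(X, y, w)                          # single-device gradient
shards = zip(np.split(X, 2), np.split(y, 2))  # two "devices", half a batch each
avg = np.mean([grad(Xs, ys, w) for Xs, ys in shards], axis=0)
print(np.allclose(full, avg))                 # True
```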
# Asynchronous Training
Asynchronous training usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of the shared parameters, while each trainer holds its own copy of the model and trains it independently. Each trainer pulls parameters from the parameter servers and sends gradients to them independently. Similarly, the parameter servers apply the gradients to the parameters as soon as the gradients are received and send parameters whenever they are requested.
In theory, asynchronous training is neither safe nor stable. Each trainer is very likely using a stale copy of the parameters, and the parameter servers are also likely to apply stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with the synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failures in the cluster.
Many production models, such as [3], are trained with distributed asynchronous solutions due to their scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually it is not as stable as synchronous training: a warm-up phase is sometimes needed, the learning rate is usually smaller than in synchronous training, and decay is also often needed. Normally, asynchronous training doesn't scale beyond 100 trainers; in other words, adding more trainers beyond that does not make the model converge faster.
# Synchronous Training
Unlike asynchronous training, synchronous training requires step barriers. Parameter servers need to wait for gradients from all trainers before applying them to the parameters, and trainers always pull the latest parameters.
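A schematic contrast of the two update rules on the parameter-server side (illustrative pseudo-code with hypothetical function names, not any framework's API):
```python
def async_apply(params, grad, lr):
    # Apply whatever gradient arrives, immediately; it may have been
    # computed from a stale copy of the parameters.
    for name in params:
        params[name] -= lr * grad[name]

def sync_apply(params, grads_from_all_trainers, lr):
    # Step barrier: wait until every trainer has reported, then apply the
    # averaged gradient once, so all trainers next pull identical parameters.
    n = len(grads_from_all_trainers)
    for name in params:
        avg = sum(g[name] for g in grads_from_all_trainers) / n
        params[name] -= lr * avg
```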
An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it is more stable than asynchronous training. The learning rate can be set larger, and for some vision tasks the final accuracy can be slightly higher. (In my practical experience, for some models it can actually be worse.)
Synchronous training usually faces scalability and performance issues if not carefully implemented or deployed. In [2], naive synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid the slowness, discussed in [1] and [2], is to have backups: N+M replicas are scheduled while only the first N are needed for the training step to proceed.
Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the model, increasing the number of trainers (effectively the batch size) beyond a point won't deliver faster convergence or better final model quality.
# Codistillation
Codistillation is a technique that tries to scale training further. A few training instances (each of which can itself be distributed) run during the same period. Each training instance has an extra loss term that comes from the predictions of the other training instances (like teacher and student). The training process converges faster and usually reaches better model quality [4].
# Reference
[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
[4] Large scale distributed neural network training through online distillation.
@@ -8,28 +8,28 @@ The user's cluster environment is not the same. To facilitate everyone's deploym
.. toctree::
:maxdepth: 1
-k8s_cn.md
+k8s_en.md
-k8s_distributed_cn.md
+k8s_distributed_en.md
`OpenMPI <https://www.open-mpi.org>`_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task:
.. toctree::
:maxdepth: 1
-openmpi_cn.md
+openmpi_en.md
`Fabric <http://www.fabfile.org>`_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines:
.. toctree::
:maxdepth: 1
-fabric_cn.md
+fabric_en.md
We also support the deployment of PaddlePaddle on AWS. Learn more about:
.. toctree::
:maxdepth: 1
-k8s_aws_cn.md
+k8s_aws_en.md
The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
\ No newline at end of file
@@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat,
paddle_error paddle_matrix_get_shape(paddle_matrix mat,
uint64_t* height,
uint64_t* width) {
-if (mat == nullptr) return kPD_NULLPTR;
+if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
if (height != nullptr) {
*height = cast(mat)->mat->getHeight();
}
......
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#ifndef HL_BASE_H_
+#pragma once
#define HL_BASE_H_
#include <cstddef>
@@ -207,8 +206,8 @@ typedef struct {
#ifdef __NVCC__
-#include "cuda_runtime.h"
+#include <cuda_runtime.h>
-#include "hl_cuda.h"
+#include "paddle/cuda/include/hl_cuda.h"
#include "paddle/utils/Logging.h"
extern __thread bool g_sync_flag;
@@ -230,6 +229,11 @@ extern __thread cudaStream_t default_stream;
// __shfl has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
template <typename T>
__forceinline__ __device__ T
__shfl_sync(unsigned, T val, int src_line, int width) {
@@ -243,6 +247,4 @@ __shfl_sync(unsigned, T val, int src_line, int width) {
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
-#endif /* __NVCC__ */
+#endif // __NVCC__
#endif /* HL_BASE_H_ */
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h" #include "paddle/cuda/include/hl_base.h"
#include "hl_sparse.ph" #include "paddle/cuda/include/hl_sparse.ph"
#include "hl_top_k.h" #include "paddle/cuda/include/hl_top_k.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
// using namespace hppl; // using namespace hppl;
...@@ -244,8 +244,9 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK, ...@@ -244,8 +244,9 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
if (--beamSize == 0) break; if (--beamSize == 0) break;
__syncthreads(); __syncthreads();
// NOTE(zcd): temporary solution
unsigned mask = 0u; unsigned mask = 0u;
// CREATE_SHFL_MASK(mask, tid < len); CREATE_SHFL_MASK(mask, true);
if (tid == maxId[0]) { if (tid == maxId[0]) {
if (beam < maxLength) { if (beam < maxLength) {
......
...@@ -143,7 +143,7 @@ OpDesc *BlockDesc::InsertOp(size_t index) { ...@@ -143,7 +143,7 @@ OpDesc *BlockDesc::InsertOp(size_t index) {
} }
void BlockDesc::RemoveOp(size_t s, size_t e) { void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) {
return; return;
} }
need_update_ = true; need_update_ = true;
......
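The bounds check above deserves a closer look: the old guard only rejected s or e when they landed exactly on ops_.end(), so an index past the end still reached vector::erase with an invalid range. A minimal standalone sketch of the corrected guard, using a plain std::vector<int> as a hypothetical stand-in for the op list (it mirrors the patched check, nothing more):

#include <cstddef>
#include <vector>

// Hypothetical analogue of BlockDesc::RemoveOp: ignore any request whose
// half-open range [s, e) does not lie entirely inside the container.
void RemoveRange(std::vector<int> *ops, size_t s, size_t e) {
  if (ops->begin() + s >= ops->end() || ops->begin() + e > ops->end()) {
    return;  // out-of-range indices are rejected instead of reaching erase()
  }
  ops->erase(ops->begin() + s, ops->begin() + e);
}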
...@@ -15,12 +15,14 @@ if(WITH_GPU) ...@@ -15,12 +15,14 @@ if(WITH_GPU)
dynload_cuda) dynload_cuda)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
else() else()
set(multi_devices_graph_builder_deps) set(multi_devices_graph_builder_deps)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
endif() endif()
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
......
...@@ -19,14 +19,12 @@ ...@@ -19,14 +19,12 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
// the input and output may have dummy var. if (places_.size() == 1) return;
VarHandle *in_var_handle;
// The input and output may have dummy vars.
VarHandle *in_var_handle;
{ {
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1, PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
...@@ -55,27 +53,97 @@ void BroadcastOpHandle::RunImpl() { ...@@ -55,27 +53,97 @@ void BroadcastOpHandle::RunImpl() {
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
for (auto *out : out_var_handles) { // NOTE: The tensors' Place of input and output must be all on GPU or all on
if (*out == *in_var_handle) { // CPU.
for (auto *out_var_handle : out_var_handles) {
if (out_var_handle->IsTheSameVar(*in_var_handle)) {
continue; continue;
} }
auto t_out_p = out_var_handle->place_;
auto &out_p = out->place_; auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_); ->FindVar(out_var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(), if (platform::is_gpu_place(in_tensor.place())) {
"Places must be all on CPU or all on CUDA."); PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
} else {
t_out_p = platform::CPUPlace();
}
VariableVisitor::ShareDimsAndLoD(*in_var, out_var); VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p, VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
in_tensor.type()); in_tensor.type());
}
if (platform::is_cpu_place(in_tensor.place())) {
for (auto *out_var_handle : out_var_handles) {
if (out_var_handle->IsTheSameVar(*in_var_handle)) {
continue;
}
auto &out_p = out_var_handle->place_;
auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
->FindVar(out_var_handle->name_);
RunAndRecordEvent(out_p, [in_tensor, out_var] {
paddle::framework::TensorCopy(
in_tensor, platform::CPUPlace(),
&VariableVisitor::GetMutableTensor(out_var));
});
}
} else {
#ifdef PADDLE_WITH_CUDA
VarHandle *out_handle = nullptr;
int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
std::vector<std::function<void()>> broadcast_calls;
for (auto out_var_handle : out_var_handles) {
Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
->FindVar(out_var_handle->name_);
int dst_id =
boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
auto &nccl_ctx = nccl_ctxs_->at(dst_id);
void *send_recv_buffer = nullptr;
if (root_id == dst_id) {
send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
out_handle = out_var_handle;
} else {
send_recv_buffer =
VariableVisitor::GetMutableTensor(out_var).mutable_data(
out_var_handle->place_);
}
int type = platform::ToNCCLDataType(in_tensor.type());
size_t numel = static_cast<size_t>(in_tensor.numel());
broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast(
send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
}
auto dev_ctx = dev_ctxes_.at(out_p); this->RunAndRecordEvent([&] {
RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] { {
paddle::framework::TensorCopy( platform::NCCLGroupGuard guard;
in_tensor, out_p, *(dev_ctx), for (auto &call : broadcast_calls) {
&VariableVisitor::GetMutableTensor(out_var)); call();
}
}
if (!out_handle->IsTheSameVar(*in_var_handle)) {
auto out_var = var_scopes.at(in_var_handle->scope_idx_)
->FindVar(out_var_handles[0]->name_);
paddle::framework::TensorCopy(
in_tensor, in_var_handle->place_,
*(dev_ctxes_.at(in_var_handle->place_)),
&VariableVisitor::GetMutableTensor(out_var));
}
}); });
#else
PADDLE_THROW("CUDA is not enabled.");
#endif
} }
} }
......
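The CUDA branch above queues one ncclBcast call per output device and only then fires them all inside a single NCCLGroupGuard, so the per-device operations are issued as one collective group. A minimal host-side sketch of that batch-then-run pattern; the guard and the per-device work are reduced to hypothetical stand-ins (the real code brackets the calls with ncclGroupStart()/ncclGroupEnd() and wraps ncclBcast on each device's comm and stream):

#include <functional>
#include <iostream>
#include <vector>

// Stand-in for platform::NCCLGroupGuard.
struct GroupGuard {
  GroupGuard() { std::cout << "group start\n"; }
  ~GroupGuard() { std::cout << "group end\n"; }
};

int main() {
  std::vector<std::function<void()>> broadcast_calls;
  for (int dev = 0; dev < 4; ++dev) {
    // Each lambda would wrap the broadcast for that device.
    broadcast_calls.emplace_back(
        [dev] { std::cout << "broadcast on device " << dev << "\n"; });
  }
  {
    GroupGuard guard;  // issue every queued call inside one group
    for (auto &call : broadcast_calls) call();
  }
  return 0;
}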
...@@ -24,14 +24,32 @@ ...@@ -24,14 +24,32 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct BroadcastOpHandle : public OpHandleBase { struct BroadcastOpHandle : public OpHandleBase {
public: public:
#ifdef PADDLE_WITH_CUDA
BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctxs)
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
}
}
}
#else
BroadcastOpHandle(const std::vector<Scope *> &local_scopes, BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places); const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
#endif
std::string Name() const override; std::string Name() const override;
...@@ -44,6 +62,9 @@ struct BroadcastOpHandle : public OpHandleBase { ...@@ -44,6 +62,9 @@ struct BroadcastOpHandle : public OpHandleBase {
private: private:
const std::vector<Scope *> &local_scopes_; const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_; const std::vector<platform::Place> &places_;
#ifdef PADDLE_WITH_CUDA
const platform::NCCLContextMap *nccl_ctxs_;
#endif
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -35,15 +35,25 @@ struct TestBroadcastOpHandle { ...@@ -35,15 +35,25 @@ struct TestBroadcastOpHandle {
std::unique_ptr<OpHandleBase> op_handle_; std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_; std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_; std::vector<p::Place> gpu_list_;
bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
void WaitAll() { void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) { for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait(); ctxs_[j]->Wait();
} }
#ifdef PADDLE_WITH_CUDA
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
#endif
} }
void InitCtxOnGpu(bool use_gpu) { void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) { use_gpu_ = use_gpu;
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount(); int count = p::GetCUDADeviceCount();
if (count <= 1) { if (count <= 1) {
...@@ -57,6 +67,7 @@ struct TestBroadcastOpHandle { ...@@ -57,6 +67,7 @@ struct TestBroadcastOpHandle {
gpu_list_.push_back(p); gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p)); ctxs_.emplace_back(new p::CUDADeviceContext(p));
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW("CUDA is not support.");
#endif #endif
...@@ -67,6 +78,9 @@ struct TestBroadcastOpHandle { ...@@ -67,6 +78,9 @@ struct TestBroadcastOpHandle {
gpu_list_.push_back(p); gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p)); ctxs_.emplace_back(new p::CPUDeviceContext(p));
} }
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_.reset(nullptr);
#endif
} }
} }
...@@ -82,7 +96,21 @@ struct TestBroadcastOpHandle { ...@@ -82,7 +96,21 @@ struct TestBroadcastOpHandle {
} }
param_scopes_[input_scope_idx]->Var("input"); param_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(
new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(
new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
#else
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
#endif
}
auto* in_var_handle = auto* in_var_handle =
new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]); new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
...@@ -97,7 +125,9 @@ struct TestBroadcastOpHandle { ...@@ -97,7 +125,9 @@ struct TestBroadcastOpHandle {
op_handle_->AddInput(dummy_var_handle); op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) { for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); if (!use_gpu_) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
}
VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]); VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
vars_.emplace_back(out_var_handle); vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle); op_handle_->AddOutput(out_var_handle);
......
...@@ -25,6 +25,7 @@ GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes, ...@@ -25,6 +25,7 @@ GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
: local_scopes_(local_scopes), places_(places) {} : local_scopes_(local_scopes), places_(places) {}
void GatherOpHandle::RunImpl() { void GatherOpHandle::RunImpl() {
if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
...@@ -35,7 +36,6 @@ void GatherOpHandle::RunImpl() { ...@@ -35,7 +36,6 @@ void GatherOpHandle::RunImpl() {
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one."); "The number of output should be one.");
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
...@@ -50,68 +50,62 @@ void GatherOpHandle::RunImpl() { ...@@ -50,68 +50,62 @@ void GatherOpHandle::RunImpl() {
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_); var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
PADDLE_ENFORCE_NOT_NULL(pre_in_var); PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(), PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows."); "Currently, gather_op only can gather SelectedRows.");
auto pre_place = in_0_handle->place_;
PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(),
"The place of input and output should be the same.");
// Wait input done, this Wait is asynchronous operation // Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated(in_var_handles); WaitInputVarGenerated(in_var_handles);
auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
std::vector<int64_t> out_rows; std::vector<int64_t> out_rows;
std::vector<Tensor> in_tensors; std::vector<Tensor> in_tensors;
std::vector<platform::Place> in_places;
auto &pre_in = pre_in_var->Get<framework::SelectedRows>(); // Gather the inputs
// gather the inputs
for (auto *in_handle : in_var_handles) { for (auto *in_handle : in_var_handles) {
auto in_p = in_handle->place_;
in_places.push_back(in_p);
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
auto *in_var = auto *in_var =
var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_); var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>(); PADDLE_ENFORCE_NOT_NULL(in_var);
VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), auto &in_sr_value = in_var->Get<framework::SelectedRows>();
"The type of input is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
"The height of inputs is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
"The dims of inputs is not consistent.");
auto &in_sr_rows = in_sr.rows(); auto &in_sr_rows = in_sr_value.rows();
out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
in_tensors.emplace_back(in_sr_value.value());
in_tensors.emplace_back(in_sr.value());
} }
// write the output // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
auto &out_place = out_var_handle->place_; platform::Place t_out_p = out_var_handle->place_;
auto out_scope_idx = out_var_handle->scope_idx_; if (platform::is_gpu_place(pre_in_value.place())) {
auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_); PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
} else {
t_out_p = platform::CPUPlace();
}
auto out = out_var->GetMutable<framework::SelectedRows>(); auto out_var =
out->set_height(pre_in.height()); var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
out->set_rows(out_rows); PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_value = out_var->GetMutable<framework::SelectedRows>();
out_value->set_height(pre_in_value.height());
out_value->set_rows(out_rows);
size_t rows = out_rows.size(); size_t rows = out_rows.size();
DDim out_dim = pre_in.GetCompleteDims(); DDim out_dim = pre_in_value.GetCompleteDims();
out_dim[0] = static_cast<int64_t>(rows); out_dim[0] = static_cast<int64_t>(rows);
out->mutable_value()->Resize(out_dim); out_value->mutable_value()->Resize(out_dim).mutable_data(
out->mutable_value()->mutable_data(out_place, pre_in.value().type()); t_out_p, pre_in_value.value().type());
Tensor *out_tensor = out->mutable_value(); Tensor *out_tensor = out_value->mutable_value();
// copy // copy
auto dev_ctx = dev_ctxes_[out_place]; auto dev_ctx = dev_ctxes_[out_var_handle->place_];
RunAndRecordEvent(out_place, [in_tensors, out_tensor, dev_ctx, out_place] { RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
t_out_p] {
int s = 0, e = 0; int s = 0, e = 0;
for (size_t j = 0; j < in_tensors.size(); ++j) { for (size_t j = 0; j < in_tensors.size(); ++j) {
e += in_tensors[j].dims()[0]; e += in_tensors[j].dims()[0];
auto sub_out = out_tensor->Slice(s, e); auto sub_out = out_tensor->Slice(s, e);
paddle::framework::TensorCopy(in_tensors[j], out_place, *(dev_ctx), paddle::framework::TensorCopy(in_tensors[j], t_out_p, *dev_ctx, &sub_out);
&sub_out);
s = e; s = e;
} }
}); });
......
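The copy loop at the end of GatherOpHandle::RunImpl walks the gathered inputs and writes each one into a contiguous slice of the output, advancing a running row offset. A small sketch of that slice-offset bookkeeping over plain row counts; the tensor types and TensorCopy are replaced by hypothetical stand-ins:

#include <cstdio>
#include <vector>

int main() {
  // Row counts of the per-device SelectedRows values being gathered.
  std::vector<int> in_rows = {3, 5, 2};
  int s = 0, e = 0;
  for (size_t j = 0; j < in_rows.size(); ++j) {
    e += in_rows[j];
    // Real code: out_tensor->Slice(s, e) receives a copy of input j.
    std::printf("input %zu -> output rows [%d, %d)\n", j, s, e);
    s = e;
  }
  return 0;
}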
...@@ -11,9 +11,11 @@ ...@@ -11,9 +11,11 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include <utility>
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/send_op_handle.h" #include "paddle/fluid/framework/details/send_op_handle.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -34,8 +36,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -34,8 +36,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, bool use_default_grad_scale, const std::vector<Scope *> &local_scopes,
platform::NCCLContextMap *nccl_ctxs) platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale)
: loss_var_name_(loss_var_name), : loss_var_name_(loss_var_name),
places_(places), places_(places),
local_scopes_(local_scopes), local_scopes_(local_scopes),
...@@ -105,6 +107,11 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op, ...@@ -105,6 +107,11 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const { const ProgramDesc &program) const {
std::unordered_map<std::string, proto::VarType::Type> var_types;
for (auto *var : program.Block(0).AllVars()) {
var_types[var->Name()] = var->GetType();
}
auto graph = new SSAGraph(); auto graph = new SSAGraph();
SSAGraph &result = *graph; SSAGraph &result = *graph;
std::unordered_set<std::string> og_has_been_broadcast; std::unordered_set<std::string> og_has_been_broadcast;
...@@ -133,12 +140,17 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -133,12 +140,17 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
is_forwarding = false; is_forwarding = false;
} else { } else {
CreateComputationalOps(&result, *op, places_.size()); CreateComputationalOps(&result, *op, places_.size());
if (!is_forwarding) { if (!is_forwarding && places_.size() > 1) {
// Currently, we assume that once gradient is generated, it can be // Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. // broadcast, and each gradient is only broadcast once.
for (auto &og : op->OutputArgumentNames()) { for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) { if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og); if (IsSparseGradient(var_types, og)) {
CreateReduceOp(&result, og, 0);
CreateBroadcastOp(&result, og, 0);
} else {
InsertNCCLAllReduceOp(&result, og);
}
} }
} }
} }
...@@ -165,6 +177,50 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -165,6 +177,50 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
return std::unique_ptr<SSAGraph>(graph); return std::unique_ptr<SSAGraph>(graph);
} }
bool MultiDevSSAGraphBuilder::IsSparseGradient(
const std::unordered_map<std::string, proto::VarType::Type> &var_types,
const std::string &og) const {
PADDLE_ENFORCE(var_types.count(og) != 0);
if (var_types.at(og) == proto::VarType::SELECTED_ROWS) {
return true;
}
return false;
}
void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
const std::string &p_name,
size_t src_dev_id) const {
#ifdef PADDLE_WITH_CUDA
auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_);
#else
auto *op_handle = new BroadcastOpHandle(local_scopes_, places_);
#endif
result->ops_.emplace_back(op_handle);
auto *in = result->vars_.at(src_dev_id).at(p_name).back().get();
op_handle->AddInput(in);
for (size_t i = 0; i < places_.size(); ++i) {
auto &vars = result->vars_.at(i).at(p_name);
auto &p = places_[i];
auto *out_var = new VarHandle(vars.size(), i, p_name, p);
vars.emplace_back(out_var);
op_handle->AddOutput(out_var);
#ifndef PADDLE_WITH_CUDA
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
#endif
}
}
void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
const OpDesc &op,
int dev_id) const {
result->ops_.emplace_back(
new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id]));
CreateOpHandleIOs(result, op, dev_id);
}
OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc( OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
const ProgramDesc &program) const { const ProgramDesc &program) const {
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
...@@ -174,7 +230,6 @@ OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc( ...@@ -174,7 +230,6 @@ OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
} }
return nullptr; return nullptr;
} }
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const { SSAGraph *result, const std::string &og) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -247,6 +302,36 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result, ...@@ -247,6 +302,36 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
} }
} }
VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
const std::string &og,
int dst_dev_id) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
#else
result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_));
#endif
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &vars = result->vars_[i][og];
#ifndef PADDLE_WITH_CUDA
auto &p = places_[i];
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
#endif
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
}
auto &vars = result->vars_[dst_dev_id][og];
auto var =
new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]);
vars.emplace_back(var);
op_handle->AddOutput(var);
return var;
}
void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
auto &p = places_[0]; auto &p = places_[0];
...@@ -263,6 +348,7 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { ...@@ -263,6 +348,7 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
return op.OutputArgumentNames().size() == 1 && return op.OutputArgumentNames().size() == 1 &&
op.OutputArgumentNames()[0] == GradVarName(loss_var_name_); op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
} }
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
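The gradient-aggregation choice introduced in this builder is worth stating compactly: a dense gradient still goes through NCCL all-reduce, while a SELECTED_ROWS gradient is first reduced onto one device and then broadcast back, since its row set differs per device. A minimal sketch of that dispatch, with the three graph-insertion helpers reduced to hypothetical stubs (names here are illustrative, not the Paddle API):

#include <iostream>
#include <string>
#include <unordered_map>

enum class VarType { LOD_TENSOR, SELECTED_ROWS };

// Hypothetical stubs standing in for the SSA-graph insertion helpers.
void CreateReduceOp(const std::string &og) { std::cout << "reduce " << og << "\n"; }
void CreateBroadcastOp(const std::string &og) { std::cout << "broadcast " << og << "\n"; }
void InsertAllReduceOp(const std::string &og) { std::cout << "allreduce " << og << "\n"; }

void HandleGradient(const std::unordered_map<std::string, VarType> &var_types,
                    const std::string &og) {
  if (var_types.at(og) == VarType::SELECTED_ROWS) {
    CreateReduceOp(og);     // gather sparse rows onto one device
    CreateBroadcastOp(og);  // then redistribute the result
  } else {
    InsertAllReduceOp(og);  // dense gradients keep the all-reduce path
  }
}

int main() {
  std::unordered_map<std::string, VarType> var_types = {
      {"emb@GRAD", VarType::SELECTED_ROWS}, {"fc_w@GRAD", VarType::LOD_TENSOR}};
  HandleGradient(var_types, "emb@GRAD");
  HandleGradient(var_types, "fc_w@GRAD");
  return 0;
}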
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/ssa_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h"
...@@ -27,6 +27,7 @@ class NCCLContextMap; ...@@ -27,6 +27,7 @@ class NCCLContextMap;
namespace framework { namespace framework {
class Scope; class Scope;
namespace details { namespace details {
class MultiDevSSAGraphBuilder : public SSAGraphBuilder { class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
public: public:
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -34,8 +35,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -34,8 +35,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &loss_var_name, const std::string &loss_var_name,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
bool skip_scale_loss, platform::NCCLContextMap *nccl_ctxs,
platform::NCCLContextMap *nccl_ctxs); bool use_default_grad_scale);
#else #else
MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places, MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
...@@ -74,6 +75,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -74,6 +75,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
size_t num_places) const; size_t num_places) const;
void CreateScaleLossGradOp(SSAGraph *result) const; void CreateScaleLossGradOp(SSAGraph *result) const;
VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
int dst_dev_id) const;
void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
int dev_id) const;
bool IsParameterGradientOnce( bool IsParameterGradientOnce(
const std::string &og, const std::string &og,
...@@ -81,11 +86,18 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -81,11 +86,18 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const;
/** /**
* Get send op in the global block of program. * Get send op in the global block of program.
* nullptr if not found. * nullptr if not found.
*/ */
OpDesc *GetSendOpDesc(const ProgramDesc &program) const; OpDesc *GetSendOpDesc(const ProgramDesc &program) const;
bool IsSparseGradient(
const std::unordered_map<std::string, proto::VarType::Type> &var_types,
const std::string &og) const;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -22,6 +22,7 @@ namespace framework { ...@@ -22,6 +22,7 @@ namespace framework {
namespace details { namespace details {
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
...@@ -51,44 +52,48 @@ void ReduceOpHandle::RunImpl() { ...@@ -51,44 +52,48 @@ void ReduceOpHandle::RunImpl() {
// Wait input done, this Wait is asynchronous operation // Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated(in_var_handles); WaitInputVarGenerated(in_var_handles);
auto pre_place = in_0_handle->place_;
std::vector<platform::Place> in_places;
auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var);
for (auto *in_handle : in_var_handles) {
auto in_p = in_handle->place_;
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
in_places.emplace_back(in_p);
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx
for (auto *in_handle : in_var_handles) {
in_places.emplace_back(in_handle->place_);
auto in_var = auto in_var =
var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_); var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(in_var);
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
auto in_tensor = VariableVisitor::GetMutableTensor(in_var);
PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(),
"The type of input is not consistent.");
} }
auto out_var = auto out_var =
var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_); var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var);
// NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place_),
"Places of input and output must be all on GPU.");
t_out_p = out_var_handle->place_;
} else {
t_out_p = platform::CPUPlace();
}
if (pre_in_var->IsType<framework::SelectedRows>()) { if (pre_in_var->IsType<framework::SelectedRows>()) {
std::vector<const SelectedRows *> in_selected_rows = std::vector<const SelectedRows *> in_selected_rows =
GetInputValues<SelectedRows>(in_var_handles, var_scopes); GetInputValues<SelectedRows>(in_var_handles, var_scopes);
GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
out_var_handle->place_,
out_var->GetMutable<framework::SelectedRows>()); out_var->GetMutable<framework::SelectedRows>());
} else { } else {
std::vector<const LoDTensor *> lod_tensors = std::vector<const LoDTensor *> lod_tensors =
GetInputValues<LoDTensor>(in_var_handles, var_scopes); GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(pre_place)) { if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
ReduceLoDTensor func(lod_tensors, ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>()); out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func); VisitDataType(ToDataType(lod_tensors[0]->type()), func);
} else if (paddle::platform::is_gpu_place(pre_place)) { } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto pre_in = pre_in_var->Get<framework::LoDTensor>(); auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
...@@ -96,7 +101,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -96,7 +101,7 @@ void ReduceOpHandle::RunImpl() {
out_var_handle->place_, pre_in.type()); out_var_handle->place_, pre_in.type());
auto out_p = out_var_handle->place_; auto out_p = out_var_handle->place_;
int root = boost::get<platform::CUDAPlace>(out_p).device; int root_id = boost::get<platform::CUDAPlace>(out_p).device;
std::vector<std::function<void()>> all_reduce_calls; std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < var_scopes.size(); ++i) { for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i]; auto &p = in_places[i];
...@@ -104,23 +109,23 @@ void ReduceOpHandle::RunImpl() { ...@@ -104,23 +109,23 @@ void ReduceOpHandle::RunImpl() {
int dev_id = boost::get<platform::CUDAPlace>(p).device; int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
void *buffer = const_cast<void *>(lod_tensor.data<void>()); void *buffer = const_cast<void *>(lod_tensor.data<void>());
void *recvbuffer = nullptr; void *recvbuffer = nullptr;
if (root == dev_id) { if (root_id == dev_id) {
recvbuffer = recvbuffer =
out_var->GetMutable<framework::LoDTensor>()->mutable_data( out_var->GetMutable<framework::LoDTensor>()->mutable_data(
out_var_handle->place_); out_var_handle->place_);
} }
int type = platform::ToNCCLDataType(lod_tensor.type()); int type = platform::ToNCCLDataType(lod_tensor.type());
all_reduce_calls.emplace_back([=] { size_t numel = static_cast<size_t>(lod_tensor.numel());
PADDLE_ENFORCE(platform::dynload::ncclReduce( all_reduce_calls.emplace_back(
buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()), [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
static_cast<ncclDataType_t>(type), ncclSum, root, comm, stream)); PADDLE_ENFORCE(platform::dynload::ncclReduce(
}); buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
} }
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
...@@ -130,7 +135,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -130,7 +135,7 @@ void ReduceOpHandle::RunImpl() {
} }
}); });
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW("CUDA is not enabled.");
#endif #endif
} else { } else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
......
...@@ -55,7 +55,7 @@ struct ReduceOpHandle : public OpHandleBase { ...@@ -55,7 +55,7 @@ struct ReduceOpHandle : public OpHandleBase {
std::string Name() const override; std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; }; bool IsMultiDeviceTransfer() override { return true; };
protected: protected:
void RunImpl() override; void RunImpl() override;
......
...@@ -62,7 +62,7 @@ struct VarHandle : public VarHandleBase { ...@@ -62,7 +62,7 @@ struct VarHandle : public VarHandleBase {
std::string name_; std::string name_;
platform::Place place_; platform::Place place_;
bool operator==(const VarHandle& o) const { bool IsTheSameVar(const VarHandle& o) const {
return o.generated_op_ == generated_op_ && o.name_ == name_ && return o.generated_op_ == generated_op_ && o.name_ == name_ &&
o.scope_idx_ == scope_idx_; o.scope_idx_ == scope_idx_;
} }
......
...@@ -88,6 +88,52 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) { ...@@ -88,6 +88,52 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
VisitVariable(src, &visitor); VisitVariable(src, &visitor);
} }
struct EnforceShapeAndDTypeEQVisitor {
const Variable* trg_;
void operator()(const LoDTensor& src) {
auto& tensor = trg_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
src.place().which(), tensor.place().which(),
"The Places of the two Variable must be all on CPU or all on GPU.");
PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
"The dtype of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
"The dims of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
"The lod of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
"The layout of the two Variable's tensor is not equal.");
}
void operator()(const SelectedRows& src) {
auto& selected_rows = trg_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(
src.place().which(), selected_rows.place().which(),
"The Places of the two Variable must be all on CPU or all on GPU.");
PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
"The dtype of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
"The layout of the two Variable's tensor is not equal.");
PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
"The height of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
"The dims of the two Variable is not equal.");
}
template <typename T>
void operator()(const T&) {
PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
typeid(T).name());
}
};
void VariableVisitor::EnforceShapeAndDTypeEQ(const Variable& var1,
const Variable& var2) {
EnforceShapeAndDTypeEQVisitor visitor{&var1};
VisitVariable(var2, &visitor);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -26,6 +26,9 @@ class VariableVisitor { ...@@ -26,6 +26,9 @@ class VariableVisitor {
static Tensor &GetMutableTensor(Variable *var); static Tensor &GetMutableTensor(Variable *var);
static void ShareDimsAndLoD(const Variable &src, Variable *trg); static void ShareDimsAndLoD(const Variable &src, Variable *trg);
static void EnforceShapeAndDTypeEQ(const Variable &var1,
const Variable &var2);
}; };
} // namespace details } // namespace details
......
...@@ -348,8 +348,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -348,8 +348,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
} }
} }
platform::DeviceContextPool::Instance().Get(place_)->Wait();
if (create_vars && create_local_scope) { if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} else {
// Delete the local scopes created in operators.
scope->DropKids();
} }
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
......
...@@ -93,6 +93,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -93,6 +93,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
RunImpl(scope, place); RunImpl(scope, place);
} }
bool OperatorBase::HasInputs(const std::string& name) const {
if (inputs_.find(name) != inputs_.end()) {
return true;
} else {
return false;
}
}
std::string OperatorBase::Input(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name); auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL, PADDLE_ENFORCE_LE(ins.size(), 1UL,
...@@ -109,6 +117,14 @@ const std::vector<std::string>& OperatorBase::Inputs( ...@@ -109,6 +117,14 @@ const std::vector<std::string>& OperatorBase::Inputs(
return it->second; return it->second;
} }
bool OperatorBase::HasOutputs(const std::string& name) const {
if (outputs_.find(name) != outputs_.end()) {
return true;
} else {
return false;
}
}
std::string OperatorBase::Output(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const {
auto& outs = Outputs(name); auto& outs = Outputs(name);
PADDLE_ENFORCE_LE(outs.size(), 1UL, PADDLE_ENFORCE_LE(outs.size(), 1UL,
...@@ -220,13 +236,18 @@ void OperatorBase::CheckAllInputOutputSet() const { ...@@ -220,13 +236,18 @@ void OperatorBase::CheckAllInputOutputSet() const {
if (op_info == nullptr || op_info->proto_ == nullptr) return; if (op_info == nullptr || op_info->proto_ == nullptr) return;
for (auto& in : op_info->Proto().inputs()) { for (auto& in : op_info->Proto().inputs()) {
PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), if (!in.dispensable()) {
"Type %s's input %s is not set", Type(), in.name()); PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
"Operator %s's input, %s, is not set", Type(), in.name());
}
} }
for (auto& out : op_info->Proto().outputs()) { for (auto& out : op_info->Proto().outputs()) {
PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), if (!out.dispensable()) {
"Type %s's output %s is not set", Type(), out.name()); PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
"Operator %s's output, %s, is not set", Type(),
out.name());
}
} }
} }
...@@ -332,6 +353,9 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -332,6 +353,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
: op_(op), scope_(scope) {} : op_(op), scope_(scope) {}
bool HasInput(const std::string& name) const override { bool HasInput(const std::string& name) const override {
if (!op_.HasInputs(name)) {
return false;
}
auto& ins = Inputs(name); auto& ins = Inputs(name);
size_t length = ins.size(); size_t length = ins.size();
if (length == 0) { if (length == 0) {
...@@ -345,6 +369,9 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -345,6 +369,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
bool HasOutput(const std::string& name) const override { bool HasOutput(const std::string& name) const override {
if (!op_.HasOutputs(name)) {
return false;
}
auto& outs = Outputs(name); auto& outs = Outputs(name);
size_t length = outs.size(); size_t length = outs.size();
if (length == 0) { if (length == 0) {
...@@ -358,6 +385,9 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -358,6 +385,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
bool HasInputs(const std::string& name) const override { bool HasInputs(const std::string& name) const override {
if (!op_.HasInputs(name)) {
return false;
}
auto inputs = op_.Inputs(name); auto inputs = op_.Inputs(name);
if (inputs.empty()) { if (inputs.empty()) {
return false; return false;
...@@ -371,6 +401,9 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -371,6 +401,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
bool HasOutputs(const std::string& name) const override { bool HasOutputs(const std::string& name) const override {
if (!op_.HasOutputs(name)) {
return false;
}
auto outputs = op_.Outputs(name); auto outputs = op_.Outputs(name);
if (outputs.empty()) { if (outputs.empty()) {
return false; return false;
......
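The pattern behind these operator changes is that presence queries must not throw for argument names an operator simply does not declare, so the map lookup is checked before Inputs()/Outputs() dereferences it. A tiny sketch of that guarded lookup over a hypothetical VariableNameMap (the real HasInput additionally checks that each bound variable exists in the scope):

#include <map>
#include <string>
#include <vector>

using VariableNameMap = std::map<std::string, std::vector<std::string>>;

// Returns true only if the argument slot exists and is bound to at least
// one variable; it never throws for unknown names.
bool HasArgument(const VariableNameMap &args, const std::string &name) {
  auto it = args.find(name);
  return it != args.end() && !it->second.empty();
}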
...@@ -105,6 +105,7 @@ class OperatorBase { ...@@ -105,6 +105,7 @@ class OperatorBase {
const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Inputs() const { return inputs_; }
const VariableNameMap& Outputs() const { return outputs_; } const VariableNameMap& Outputs() const { return outputs_; }
bool HasInputs(const std::string& name) const;
//! Get a input with argument's name described in `op_proto` //! Get a input with argument's name described in `op_proto`
std::string Input(const std::string& name) const; std::string Input(const std::string& name) const;
//! Get a input which has multiple variables. //! Get a input which has multiple variables.
...@@ -112,6 +113,7 @@ class OperatorBase { ...@@ -112,6 +113,7 @@ class OperatorBase {
//! Get all inputs variable names //! Get all inputs variable names
std::vector<std::string> InputVars() const; std::vector<std::string> InputVars() const;
bool HasOutputs(const std::string& name) const;
//! Get a output with argument's name described in `op_proto` //! Get a output with argument's name described in `op_proto`
std::string Output(const std::string& name) const; std::string Output(const std::string& name) const;
//! Get an output which has multiple variables. //! Get an output which has multiple variables.
......
...@@ -99,7 +99,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -99,7 +99,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
details::MultiDevSSAGraphBuilder builder( details::MultiDevSSAGraphBuilder builder(
member_->places_, loss_var_name, params, member_->local_scopes_, member_->places_, loss_var_name, params, member_->local_scopes_,
use_default_grad_scale, member_->nccl_ctxs_.get()); member_->nccl_ctxs_.get(), use_default_grad_scale);
#else #else
details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
params, member_->local_scopes_, params, member_->local_scopes_,
......
if(WITH_TESTING) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda) set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
endif() add_subdirectory(convert)
nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
nv_test(test_tensorrt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc
DEPS ${FLUID_CORE_MODULES} activation_op)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class ReluOpConverter : public OpConverter {
public:
ReluOpConverter() {}
void operator()(const framework::OpDesc& op) override {
LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
"type is Relu";
const nvinfer1::ITensor* input_tensor =
engine_->GetITensor(op.Input("X")[0]);
nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
nvinfer1::ActivationType::kRELU);
engine_->SetITensor(op.Output("Out")[0], layer->getOutput(0));
}
};
REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class Conv2dOpConverter : public OpConverter {
public:
Conv2dOpConverter() {}
void operator()(const framework::OpDesc& op) override {
LOG(INFO)
<< "convert a fluid conv2d op to tensorrt conv layer without bias";
}
};
REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class MulOpConverter : public OpConverter {
public:
MulOpConverter() {}
void operator()(const framework::OpDesc& op) override {
LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias";
}
};
REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* Convert Op from Fluid to TensorRT Engine.
*/
class OpConverter {
public:
OpConverter() {}
virtual void operator()(const framework::OpDesc& op) {}
void Execute(const framework::OpDesc& op, TensorRTEngine* engine) {
std::string type = op.Type();
auto it = converters_.find(type);
PADDLE_ENFORCE(it != converters_.end(), "no OpConverter for optype [%s]",
type);
it->second->SetEngine(engine);
(*it->second)(op);
}
static OpConverter& Global() {
static auto* x = new OpConverter;
return *x;
}
template <typename T>
void Register(const std::string& key) {
converters_[key] = new T;
}
// convert fluid op to tensorrt layer
void ConvertOp(const framework::OpDesc& op, TensorRTEngine* engine) {
OpConverter::Global().Execute(op, engine);
}
// convert fluid block to tensorrt network
void ConvertBlock(const framework::BlockDesc& block, TensorRTEngine* engine) {
for (auto op : block.AllOps()) {
OpConverter::Global().Execute(*op, engine);
}
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {}
// TensorRT engine
TensorRTEngine* engine_{nullptr};
private:
// registered op converter map, whose key is the fluid op type, and value is
// the pointer position of corresponding OpConverter class.
std::unordered_map<std::string, OpConverter*> converters_;
// fluid inference scope
framework::Scope* scope_{nullptr};
};
#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
struct trt_##op_type__##_converter { \
trt_##op_type__##_converter() { \
OpConverter::Global().Register<Converter__>(#op_type__); \
} \
}; \
trt_##op_type__##_converter trt_##op_type__##_converter__;
} // namespace tensorrt
} // namespace inference
} // namespace paddle
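REGISTER_TRT_OP_CONVERTER relies on a classic static-registration trick: expanding the macro defines a struct whose constructor runs before main() and inserts the converter into the global map, so ConvertOp can later dispatch purely on the op type string. A stripped-down, self-contained sketch of that mechanism; the names below are illustrative stand-ins, not the Paddle API:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Converter {
  virtual void operator()(const std::string &op) = 0;
  virtual ~Converter() = default;
};

// Global registry keyed by op type, mirroring OpConverter::Global().
std::unordered_map<std::string, std::unique_ptr<Converter>> &Registry() {
  static auto *m =
      new std::unordered_map<std::string, std::unique_ptr<Converter>>;
  return *m;
}

// Defines a struct whose constructor registers the converter at static-init time.
#define REGISTER_CONVERTER(op_type__, Converter__)               \
  struct op_type__##_registrar {                                 \
    op_type__##_registrar() {                                    \
      Registry()[#op_type__] = std::make_unique<Converter__>();  \
    }                                                            \
  } op_type__##_registrar_instance;

struct ReluConverter : Converter {
  void operator()(const std::string &op) override {
    std::cout << "converting " << op << "\n";
  }
};
REGISTER_CONVERTER(relu, ReluConverter);  // registration happens before main()

int main() {
  (*Registry().at("relu"))("relu");  // dispatch by op type string
  return 0;
}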
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
USE_OP(relu);
namespace paddle {
namespace inference {
namespace tensorrt {
void compare(float input, float expect) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
// init fluid op and variable
auto x_var = scope.Var("X");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
x_tensor->Resize({1, 1});
std::vector<float> init;
init.push_back(input);
framework::TensorFromVector(init, ctx, x_tensor);
auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<framework::LoDTensor>();
out_tensor->Resize({1, 1});
out_tensor->mutable_data<float>(place);
framework::OpDesc op_desc;
op_desc.SetType("relu");
op_desc.SetInput("X", {"X"});
op_desc.SetOutput("Out", {"Out"});
auto relu_op = framework::OpRegistry::CreateOp(op_desc);
// run fluid op
relu_op->Run(scope, place);
std::vector<float> out1;
framework::TensorToVector(*out_tensor, ctx, &out1);
// init tensorrt op
cudaStream_t stream;
ASSERT_EQ(0, cudaStreamCreate(&stream));
TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
engine->InitNetwork();
engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1});
OpConverter op_converter;
op_converter.ConvertOp(op_desc, engine);
engine->DeclareOutput("Out");
engine->FreezeNetwork();
engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
// run tensorrt op
engine->Execute(1);
float out2;
engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float));
ASSERT_EQ(out1[0], out2);
ASSERT_EQ(out1[0], expect);
delete engine;
cudaStreamDestroy(stream);
}
TEST(OpConverter, ConvertRelu) {
compare(1, 1); // relu(1) = 1
compare(-5, 0); // relu(-5) = 0
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(BlockConverter, ConvertBlock) {
framework::ProgramDesc prog;
auto* block = prog.MutableBlock(0);
auto* mul_op = block->AppendOp();
mul_op->SetType("mul");
auto* conv2d_op = block->AppendOp();
conv2d_op->SetType("conv2d");
OpConverter converter;
converter.ConvertBlock(*block, nullptr /*TensorRTEngine*/);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -80,8 +80,8 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, ...@@ -80,8 +80,8 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
PADDLE_ENFORCE(infer_network_ != nullptr, "should init network first"); PADDLE_ENFORCE(infer_network_ != nullptr, "should init network first");
auto* input = infer_network_->addInput(name.c_str(), dtype, dim); auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE(input, "infer network add input %s failed", name);
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim); buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
TensorRTEngine::SetITensor(name, input);
return input; return input;
} }
...@@ -99,6 +99,19 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, ...@@ -99,6 +99,19 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
buffer_sizes_[name] = 0; buffer_sizes_[name] = 0;
} }
void TensorRTEngine::DeclareOutput(const std::string& name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name);
auto* output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
infer_network_->markOutput(*output);
// output buffers' size can only be decided later; set zero here to mark this
// and it will be reset later.
buffer_sizes_[name] = 0;
}
void* TensorRTEngine::GetOutputInGPU(const std::string& name) { void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
return buffer(name); return buffer(name);
} }
...@@ -110,7 +123,6 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst, ...@@ -110,7 +123,6 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second); PADDLE_ENFORCE_GE(max_size, it->second);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second, PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
cudaMemcpyDeviceToHost, *stream_)); cudaMemcpyDeviceToHost, *stream_));
} }
...@@ -126,10 +138,24 @@ void*& TensorRTEngine::buffer(const std::string& name) { ...@@ -126,10 +138,24 @@ void*& TensorRTEngine::buffer(const std::string& name) {
void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data, void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
size_t size) { size_t size) {
void* buf = buffer(name); void* buf = buffer(name);
cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_)); 0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
} }
void TensorRTEngine::SetITensor(const std::string& name,
nvinfer1::ITensor* tensor) {
PADDLE_ENFORCE(tensor != nullptr);
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate itensor name %s",
name);
itensor_map_[name] = tensor;
}
nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no itensor %s", name);
return itensor_map_[name];
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -80,6 +80,8 @@ class TensorRTEngine : public EngineBase { ...@@ -80,6 +80,8 @@ class TensorRTEngine : public EngineBase {
// name. // name.
void DeclareOutput(const nvinfer1::ILayer* layer, int offset, void DeclareOutput(const nvinfer1::ILayer* layer, int offset,
const std::string& name); const std::string& name);
// Set the itensor_map_[name] as the network's output, and set its name.
void DeclareOutput(const std::string& name);
// GPU memory address for an ITensor with specific name. One can operate on // GPU memory address for an ITensor with specific name. One can operate on
// these memory directly for acceleration, for example, output the converted // these memory directly for acceleration, for example, output the converted
...@@ -98,6 +100,10 @@ class TensorRTEngine : public EngineBase { ...@@ -98,6 +100,10 @@ class TensorRTEngine : public EngineBase {
// LOW EFFICIENCY! Get output to CPU, this will trigger a memory copy from GPU // LOW EFFICIENCY! Get output to CPU, this will trigger a memory copy from GPU
// to CPU. // to CPU.
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size); void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
// Fill an ITensor into map itensor_map_.
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
// Get an ITensor called name.
nvinfer1::ITensor* GetITensor(const std::string& name);
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
...@@ -113,6 +119,8 @@ class TensorRTEngine : public EngineBase { ...@@ -113,6 +119,8 @@ class TensorRTEngine : public EngineBase {
std::vector<void*> buffers_; std::vector<void*> buffers_;
// max data size for the buffers. // max data size for the buffers.
std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_; std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
itensor_map_;
// TensorRT related internal members // TensorRT related internal members
template <typename T> template <typename T>
......
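Taken together with the engine.cc changes above, the new ITensor map lets a converter look up its inputs and publish its outputs by fluid variable name, so that DeclareOutput(name) can later mark the right TensorRT tensor. A rough sketch of that flow; SetITensor, GetITensor, DeclareOutput, FreezeNetwork and network() come from the engine interface above, while the converter body and the addActivation layer are assumptions for illustration:
// Inside a hypothetical converter body ("X" was already declared as an engine input):
nvinfer1::ITensor* x = engine->GetITensor("X");
auto* layer = engine->network()->addActivation(*x, nvinfer1::ActivationType::kRELU);
engine->SetITensor("Out", layer->getOutput(0));  // publish the op's output by name
// Later, whoever assembles the network marks the graph output and freezes it:
engine->DeclareOutput("Out");
engine->FreezeNetwork();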
...@@ -70,7 +70,6 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -70,7 +70,6 @@ TEST_F(TensorRTEngineTest, add_layer) {
engine_->Execute(1); engine_->Execute(1);
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
// void* y_v =
float y_cpu; float y_cpu;
engine_->GetOutputInCPU("y", &y_cpu, sizeof(float)); engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
......
...@@ -16,9 +16,12 @@ limitations under the License. */ ...@@ -16,9 +16,12 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(data_set, "cifar10", "Data set to test");
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_int32(repeat, 1, "Running the inference program repeat times");
DEFINE_bool(skip_cpu, false, "Skip the cpu test");
TEST(inference, image_classification) { TEST(inference, image_classification) {
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
...@@ -35,20 +38,31 @@ TEST(inference, image_classification) { ...@@ -35,20 +38,31 @@ TEST(inference, image_classification) {
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normalized image pixels as input data, // Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0]. // which should be in the range [0.0, 1.0].
SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32}, if (FLAGS_data_set == "cifar10") {
static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
static_cast<float>(0), static_cast<float>(1));
} else if (FLAGS_data_set == "imagenet") {
SetupTensor<float>(&input, {FLAGS_batch_size, 3, 224, 224},
static_cast<float>(0), static_cast<float>(1));
} else {
LOG(FATAL) << "Only cifar10 or imagenet is supported.";
}
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1; paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1; if (!FLAGS_skip_cpu) {
cpu_fetchs1.push_back(&output1); std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; // Run inference on CPU
TestInference<paddle::platform::CPUPlace, false, true>( LOG(INFO) << "--- CPU Runs: ---";
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << "Batch size is " << FLAGS_batch_size;
LOG(INFO) << output1.dims(); TestInference<paddle::platform::CPUPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims();
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2; paddle::framework::LoDTensor output2;
...@@ -57,24 +71,27 @@ TEST(inference, image_classification) { ...@@ -57,24 +71,27 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
LOG(INFO) << "Batch size is " << FLAGS_batch_size;
TestInference<paddle::platform::CUDAPlace, false, true>( TestInference<paddle::platform::CUDAPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); if (!FLAGS_skip_cpu) {
CheckError<float>(output1, output2);
}
// float16 inference requires cuda GPUs with >= 5.3 compute capability // float16 inference requires cuda GPUs with >= 5.3 compute capability
if (paddle::platform::GetCUDAComputeCapability(0) >= 53) { if (!FLAGS_fp16_dirname.empty() &&
paddle::platform::GetCUDAComputeCapability(0) >= 53) {
paddle::framework::LoDTensor output3; paddle::framework::LoDTensor output3;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs3; std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
cpu_fetchs3.push_back(&output3); cpu_fetchs3.push_back(&output3);
LOG(INFO) << "--- GPU Runs in float16 mode: ---"; LOG(INFO) << "--- GPU Runs in float16 mode: ---";
std::string fp16_dirname = dirname; LOG(INFO) << "Batch size is " << FLAGS_batch_size;
fp16_dirname.replace(fp16_dirname.find("book/"),
std::string("book/").size(), "book/float16_");
TestInference<paddle::platform::CUDAPlace, false, true>( TestInference<paddle::platform::CUDAPlace, false, true>(
fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat); FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
CheckError<float>(output2, output3); CheckError<float>(output2, output3);
} }
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class AdadeltaOp : public framework::OperatorWithKernel { class AdadeltaOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -55,6 +56,12 @@ class AdadeltaOp : public framework::OperatorWithKernel { ...@@ -55,6 +56,12 @@ class AdadeltaOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class AdagradOp : public framework::OperatorWithKernel { class AdagradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -56,6 +57,12 @@ class AdagradOp : public framework::OperatorWithKernel { ...@@ -56,6 +57,12 @@ class AdagradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("MomentOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class AdamOp : public framework::OperatorWithKernel { class AdamOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -69,6 +70,12 @@ class AdamOp : public framework::OperatorWithKernel { ...@@ -69,6 +70,12 @@ class AdamOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class AdamOpMaker : public framework::OpProtoAndCheckerMaker { class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class AdamaxOp : public framework::OperatorWithKernel { class AdamaxOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -63,6 +64,12 @@ class AdamaxOp : public framework::OperatorWithKernel { ...@@ -63,6 +64,12 @@ class AdamaxOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("MomentOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims);
ctx->SetOutputDim("InfNormOut", param_dims); ctx->SetOutputDim("InfNormOut", param_dims);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
namespace {
template <typename T>
struct bn_type_traits {
using op_type = T;
using op_desc = typename op_type::desc;
using op_prim = typename op_type::primitive_desc;
};
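// copy_to_weights (below) packs the scale and shift (bias) arrays back to back
// into one container, which is the layout MKL-DNN expects for its combined
// scale/shift weights memory. For example, with ic = 2, scale = {s0, s1} and
// shift = {b0, b1}, the packed buffer becomes {s0, s1, b0, b1}.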
template <typename T, typename Container>
void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
Container *c) {
auto it = std::begin(*c);
std::copy(scale_begin, scale_end, std::inserter(*c, it));
std::copy(
shift_begin, shift_end,
std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end))));
}
template <typename Op, typename... Args>
void run_batch_norm_op(Args &&... args) {
Op batch_norm_op{args...};
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(batch_norm_op);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T>
inline void *cast_const_to_void(const T *t) {
return static_cast<void *>(const_cast<T *>(t));
}
} // namespace
template <typename T>
class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_layout_str = ctx.Attr<std::string>("data_layout");
auto data_layout = framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
"MKLDNN batch normalization handles only NCHW data layout");
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
const auto *variance = ctx.Input<Tensor>("Variance");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *batch_mean = ctx.Output<Tensor>("SavedMean");
auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *shift = ctx.Input<Tensor>("Bias");
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<T>(ctx.GetPlace());
variance_out->mutable_data<T>(ctx.GetPlace());
if (!is_test) {
batch_mean->mutable_data<T>(ctx.GetPlace());
batch_variance->mutable_data<T>(ctx.GetPlace());
}
auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
auto dst = mkldnn::memory{dst_pd, y->data<T>()};
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc =
bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
auto batch_norm_fwd_pd =
bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
const unsigned int ic = dims[1];
// MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic;
std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
shift->data<T>() + ic, &scaleshift_data);
auto scaleshift_memory = mkldnn::memory{
batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
if (is_test) {
auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
cast_const_to_void(mean->data<T>())};
auto variance_memory =
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
cast_const_to_void(variance->data<T>())};
run_batch_norm_op<typename bn_fwd_types::op_type>(
batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
(const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
dst);
} else {
auto mean_memory =
mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
cast_const_to_void(batch_mean->data<T>())};
auto variance_memory =
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
cast_const_to_void(batch_variance->data<T>())};
run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
scaleshift_memory, dst,
mean_memory, variance_memory);
}
if (!is_test) {
const unsigned int in = dims[0];
const unsigned int sample_size = x->numel() / in / ic;
// saved_xx is used just for this batch of data
EigenVectorArrayMap<T> saved_mean_e(
batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
EigenVectorArrayMap<T> saved_variance_e(
batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
saved_mean_e.setZero();
saved_variance_e.setZero();
const unsigned int x_arr_size = in * ic;
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
saved_mean_e(nc % ic) += x_arr.col(nc).sum();
}
saved_mean_e /= in * sample_size;
for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
saved_variance_e(nc % ic) +=
(x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
}
saved_variance_e /= in * sample_size;
ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
EigenVectorArrayMap<T> running_mean_arr(
mean_out->mutable_data<T>(ctx.GetPlace()), ic);
EigenVectorArrayMap<T> running_var_arr(
variance_out->mutable_data<T>(ctx.GetPlace()), ic);
auto one_minus_momentum = 1. - momentum;
running_mean_arr =
mean_arr * momentum + saved_mean_e * one_minus_momentum;
running_var_arr =
variance_arr * momentum + saved_variance_e * one_minus_momentum;
}
}
};
template <typename T>
class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto data_layout_str = ctx.Attr<std::string>("data_layout");
auto data_layout = framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
"MKLDNN batch normalization handles only NCHW data layout");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
const float epsilon = ctx.Attr<float>("epsilon");
const auto *x = ctx.Input<Tensor>("X");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *shift = ctx.Input<Tensor>("Bias");
const auto *batch_mean = ctx.Input<Tensor>("SavedMean");
const auto *batch_variance = ctx.Input<Tensor>("SavedVariance");
const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
diff_x->mutable_data<T>(ctx.GetPlace());
diff_scale->mutable_data<T>(ctx.GetPlace());
diff_shift->mutable_data<T>(ctx.GetPlace());
auto dims = paddle::framework::vectorize2int(x->dims());
unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
auto src_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto diff_src_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto diff_dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
auto batch_norm_fwd_pd =
bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
auto src = mkldnn::memory{{src_md, mkldnn_engine},
cast_const_to_void(x->data<T>())};
auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
cast_const_to_void(batch_mean->data<T>())};
auto variance =
mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
cast_const_to_void(batch_variance->data<T>())};
auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
cast_const_to_void(diff_y->data<T>())};
const unsigned int ic = dims[1];
const size_t scaleshift_size = 2 * ic;
std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
shift->data<T>() + ic, &scaleshift_data);
auto scaleshift_memory = mkldnn::memory{
batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
std::vector<T> diff_scaleshift_data;
diff_scaleshift_data.reserve(scaleshift_size);
copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
diff_shift->data<T>(), diff_shift->data<T>() + ic,
&diff_scaleshift_data);
auto diff_scaleshift_memory =
mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
diff_scaleshift_data.data()};
auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
static_cast<void *>(diff_x->data<T>())};
run_batch_norm_op<bn_bwd_types::op_type>(
batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
diff_src, diff_scaleshift_memory);
auto it = std::begin(diff_scaleshift_data);
std::copy(it, std::next(it, ic), diff_scale->data<T>());
std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
diff_shift->data<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
ops::BatchNormMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
ops::BatchNormMKLDNNGradOpKernel<float>);
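These MKLDNN kernels are reached only when batch_norm's kernel-type lookup (see the batch_norm_op.cc diff below) resolves to LibraryType::kMKLDNN; that path is guarded by platform::CanMKLDNNBeUsed, which presumably consults the new use_mkldnn attribute added to the OpMaker. A minimal opt-in sketch, assuming OpDesc::SetAttr behaves like the SetType/SetInput/SetOutput setters used in the converter tests above:
framework::OpDesc op_desc;
op_desc.SetType("batch_norm");
// Hypothetical opt-in: use_mkldnn defaults to false per the OpMaker below.
op_desc.SetAttr("use_mkldnn", true);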
...@@ -15,6 +15,9 @@ limitations under the License. */ ...@@ -15,6 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,9 +90,13 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -87,9 +90,13 @@ class BatchNormOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
auto input_data_type = auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type()); framework::ToDataType(ctx.Input<Tensor>("X")->type());
// For float or float16 input tensor, the type of the scale, bias, mean, // By default, the type of the scale, bias, mean,
// and var tensors should both be float. // and var tensors should be float (for a float or float16 input tensor)
// or double (for a double input tensor).
auto bn_param_type = framework::proto::VarType::FP32; auto bn_param_type = framework::proto::VarType::FP32;
if (input_data_type == framework::proto::VarType::FP64) {
bn_param_type = framework::proto::VarType::FP64;
}
PADDLE_ENFORCE_EQ(bn_param_type, PADDLE_ENFORCE_EQ(bn_param_type,
framework::ToDataType(ctx.Input<Tensor>("Scale")->type()), framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
"Scale input should be of float type"); "Scale input should be of float type");
...@@ -102,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -102,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType( PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
ctx.Input<Tensor>("Variance")->type()), ctx.Input<Tensor>("Variance")->type()),
"Variance input should be of float type"); "Variance input should be of float type");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library_);
} }
}; };
...@@ -147,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -147,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
"Variance of the current mini batch, " "Variance of the current mini batch, "
"will apply to output when training") "will apply to output when training")
.AsIntermediate(); .AsIntermediate();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Batch Normalization. Batch Normalization.
...@@ -345,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel { ...@@ -345,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
if (t == nullptr) { if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD"); PADDLE_THROW("can't find Y@GRAD");
} }
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.GetPlace()); framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout, library_);
} }
}; };
...@@ -470,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { ...@@ -470,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetInput("Scale", Input("Scale")); op->SetInput("Scale", Input("Scale"));
op->SetInput("Bias", Input("Bias"));
op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance")); op->SetInput("SavedVariance", Output("SavedVariance"));
...@@ -492,8 +525,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ...@@ -492,8 +525,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
batch_norm, batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>); ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
batch_norm_grad, batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>); ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -287,6 +287,8 @@ namespace ops = paddle::operators; ...@@ -287,6 +287,8 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>, batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, double>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>); ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>); batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
...@@ -23,16 +23,54 @@ struct BeamSearchDecodeFunctor { ...@@ -23,16 +23,54 @@ struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, const LoDTensorArray& step_scores,
LoDTensor* id_tensor, LoDTensor* score_tensor) LoDTensor* id_tensor, LoDTensor* score_tensor)
: step_ids_(step_ids), : step_ids_origin_(step_ids),
step_scores_(step_scores), step_scores_origin_(step_scores),
id_tensor_(id_tensor), id_tensor_(id_tensor),
score_tensor_(score_tensor) {} score_tensor_(score_tensor) {
tensor_on_gpu_ = false;
// First make a copy of GPU data on CPU
if (platform::is_gpu_place(step_ids_origin_[0].place())) {
tensor_on_gpu_ = true;
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(step_ids_origin_[0].place());
// Copy all tensors in the input tensor array
for (auto& step_id : step_ids_origin_) {
framework::LoDTensor out;
dev_ctx->Wait();
framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
out.set_lod(step_id.lod());
step_ids_.push_back(out);
}
}
if (platform::is_gpu_place(step_scores_origin_[0].place())) {
tensor_on_gpu_ = true;
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(step_scores_origin_[0].place());
// Copy all tensors in the input tensor array
for (auto& step_score : step_scores_origin_) {
framework::LoDTensor out;
dev_ctx->Wait();
framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
out.set_lod(step_score.lod());
step_scores_.push_back(out);
}
}
}
template <typename T> template <typename T>
void operator()() const; void operator()() const;
const LoDTensorArray& step_ids_; bool tensor_on_gpu_;
const LoDTensorArray& step_scores_; const LoDTensorArray& step_ids_origin_;
const LoDTensorArray& step_scores_origin_;
LoDTensorArray step_ids_ = LoDTensorArray();
LoDTensorArray step_scores_ = LoDTensorArray();
LoDTensor* id_tensor_; LoDTensor* id_tensor_;
LoDTensor* score_tensor_; LoDTensor* score_tensor_;
}; };
...@@ -40,8 +78,14 @@ struct BeamSearchDecodeFunctor { ...@@ -40,8 +78,14 @@ struct BeamSearchDecodeFunctor {
template <typename T> template <typename T>
void BeamSearchDecodeFunctor::operator()() const { void BeamSearchDecodeFunctor::operator()() const {
BeamSearchDecoder<T> beam_search_decoder; BeamSearchDecoder<T> beam_search_decoder;
beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, // Check if the tensor is on GPU. If so, use the CPU copy instead
score_tensor_); if (tensor_on_gpu_) {
beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
score_tensor_);
} else {
beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_,
id_tensor_, score_tensor_);
}
} }
template <> template <>
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -61,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> { ...@@ -61,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
auto output_col_vec = output_mat.chip(i, 1); auto output_col_vec = output_mat.chip(i, 1);
Tensor weight_mat = Tensor weight_mat =
weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans, math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
batch_size, y_dim, x_dim, 1, x->data<T>(), CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
weight_mat.data<T>(), 0, left_mul.data<T>()); weight_mat.data<T>(), 0, left_mul.data<T>());
output_col_vec.device(place) = output_col_vec.device(place) =
(left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1)); (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
} }
...@@ -125,6 +125,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -125,6 +125,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
set_zero(dev_ctx, d_y, static_cast<T>(0)); set_zero(dev_ctx, d_y, static_cast<T>(0));
} }
auto blas = math::GetBlas<DeviceContext, T>(ctx);
// Calculate the Output(X@Grad) and Output(Y@Grad). // Calculate the Output(X@Grad) and Output(Y@Grad).
if (d_x || d_y) { if (d_x || d_y) {
Eigen::DSizes<int, 2> bcast_for_x(1, y_dim); Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
...@@ -138,18 +140,16 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -138,18 +140,16 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1)) output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_x) * .broadcast(bcast_for_x) *
y_mat; y_mat;
math::gemm<DeviceContext, T>( blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
} }
if (d_y) { if (d_y) {
x_scale_mat.device(place) = x_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1)) output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_y) * .broadcast(bcast_for_y) *
x_mat; x_mat;
math::gemm<DeviceContext, T>( blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
} }
} }
} }
...@@ -166,9 +166,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -166,9 +166,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1)) output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_weight) * .broadcast(bcast_for_weight) *
x_mat; x_mat;
math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim, blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
y_dim, batch_size, 1, x_scale.data<T>(), x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
y->data<T>(), 0, d_weight_i.data<T>());
} }
} }
......
...@@ -366,7 +366,8 @@ REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, ...@@ -366,7 +366,8 @@ REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>, paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>); paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>, paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>); paddle::operators::CUDNNConvGradOpKernel<double>);
...@@ -17,9 +17,9 @@ limitations under the License. */ ...@@ -17,9 +17,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
namespace paddle { namespace paddle {
...@@ -161,6 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -161,6 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col; math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
...@@ -186,8 +187,7 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -186,8 +187,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<DeviceContext, T>(dev_ctx, filter_slice, false, col_matrix, blas.MatMul(filter_slice, col_matrix, &out_slice);
false, T(1.0), &out_slice, T(0.0));
} }
} }
} }
...@@ -274,6 +274,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -274,6 +274,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero; math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (input_grad) { if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
...@@ -303,9 +304,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -303,9 +304,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
col_matrix.ShareDataWith(in_grad_slice); col_matrix.ShareDataWith(in_grad_slice);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
math::matmul<DeviceContext, T>(dev_ctx, filter_slice, true, blas.MatMul(filter_slice, true, out_grad_slice, false, &col_matrix);
out_grad_slice, false, T(1.0),
&col_matrix, T(0.0));
if (is_expand && data_dim == 2U) { if (is_expand && data_dim == 2U) {
col2im(dev_ctx, col, dilations, strides, col2im(dev_ctx, col, dilations, strides,
...@@ -352,9 +351,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -352,9 +351,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
// gemm // gemm
Tensor filter_grad_slice = Tensor filter_grad_slice =
filter_grad_.Slice(g * out_step, (g + 1) * out_step); filter_grad_.Slice(g * out_step, (g + 1) * out_step);
math::matmul<DeviceContext, T>(dev_ctx, out_grad_slice, false, blas.MatMul(out_grad_slice, false, col_matrix, true,
col_matrix, true, T(1.0), &filter_grad_slice);
&filter_grad_slice, T(1.0));
} }
} }
} }
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
namespace paddle { namespace paddle {
...@@ -118,6 +118,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -118,6 +118,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> set_zero; math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
set_zero(dev_ctx, output, static_cast<T>(0)); set_zero(dev_ctx, output, static_cast<T>(0));
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im; math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
...@@ -134,9 +135,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -134,9 +135,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
// col_matrix = filter * input_batch // col_matrix = filter * input_batch
// of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
math::matmul<DeviceContext, T>(dev_ctx, filter, true, input_batch, false, blas.MatMul(filter, true, input_batch, false, &col_matrix);
static_cast<T>(1.0), &col_matrix,
static_cast<T>(0.0));
if (data_dim == 2U) { if (data_dim == 2U) {
// col2im: col_matrix -> dy // col2im: col_matrix -> dy
...@@ -213,6 +212,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -213,6 +212,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
// im2col + gemm (similar to conv-forward) // im2col + gemm (similar to conv-forward)
// input need to compute gradient // input need to compute gradient
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (input_grad || filter_grad) { if (input_grad || filter_grad) {
Tensor col; Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace()); col.mutable_data<T>(col_shape, context.GetPlace());
...@@ -267,9 +267,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -267,9 +267,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
// or // or
// (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
// d, h, w) // d, h, w)
math::matmul<DeviceContext, T>( blas.MatMul(filter, false, col_matrix, false, &input_grad_batch);
dev_ctx, filter, false, col_matrix, false, static_cast<T>(1.0),
&input_grad_batch, static_cast<T>(0.0));
} }
if (filter_grad) { if (filter_grad) {
// input batch // input batch
...@@ -279,9 +277,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -279,9 +277,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
// or // or
// (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
// k_h * k_w) // k_h * k_w)
math::matmul<DeviceContext, T>(dev_ctx, in_batch, false, col_matrix, blas.MatMul(in_batch, false, col_matrix, true, &filter_grad_);
true, static_cast<T>(1.0),
&filter_grad_, static_cast<T>(1.0));
} }
} }
} }
......
...@@ -164,11 +164,13 @@ or not. But the output only shares the LoD information with input X. ...@@ -164,11 +164,13 @@ or not. But the output only shares the LoD information with input X.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>, REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
ops::CrossEntropyOpKernel<double>); ops::CrossEntropyOpKernel<CPUCtx, double>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad, REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<float>, ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
ops::CrossEntropyGradientOpKernel<double>); ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
...@@ -14,98 +14,11 @@ limitations under the License. */ ...@@ -14,98 +14,11 @@ limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/operators/cross_entropy_op.h"
namespace paddle {
namespace operators {
namespace {
template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const int64_t* label, const int N,
const int D) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
int idx = i * D + label[i];
dX[idx] = -dY[i] / X[idx];
}
}
template <typename T>
__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const T* label, const int N,
const int D) {
int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < N * D) {
int row_ids = ids / D;
dX[ids] = -label[ids] * dY[row_ids] / X[ids];
}
}
} // namespace
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* label = ctx.Input<Tensor>("Label");
Tensor* y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
ctx.Attr<bool>("soft_label"));
}
};
template <typename T>
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* label = ctx.Input<Tensor>("Label");
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
const T* dy_data =
ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
const T* x_data = x->data<T>();
int64_t batch_size = x->dims()[0];
int64_t class_num = x->dims()[1];
int block = 512;
int grid = (batch_size * class_num + block - 1) / block;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto stream = dev_ctx.stream();
if (ctx.Attr<bool>("soft_label")) {
auto* label_data = label->data<T>();
SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
dx_data, dy_data, x_data, label_data, batch_size, class_num);
} else {
math::SetConstant<platform::CUDADeviceContext, T> functor;
functor(dev_ctx, dx, 0);
auto* label_data = label->data<int64_t>();
grid = (batch_size + block - 1) / block;
CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
dx_data, dy_data, x_data, label_data, batch_size, class_num);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>, using CUDACtx = paddle::platform::CUDADeviceContext;
ops::CrossEntropyOpCUDAKernel<double>); REGISTER_OP_CUDA_KERNEL(cross_entropy,
ops::CrossEntropyOpKernel<CUDACtx, float>,
ops::CrossEntropyOpKernel<CUDACtx, double>);
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpCUDAKernel<float>, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpCUDAKernel<double>); ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
...@@ -17,69 +17,106 @@ limitations under the License. */ ...@@ -17,69 +17,106 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename DeviceContext, typename T>
class CrossEntropyOpKernel : public framework::OpKernel<T> { class CrossEntropyOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), auto* x = ctx.Input<Tensor>("X");
"This kernel only runs on CPU."); auto* labels = ctx.Input<Tensor>("Label");
const Tensor* x = ctx.Input<Tensor>("X"); auto* y = ctx.Output<Tensor>("Y");
const Tensor* labels = ctx.Input<Tensor>("Label");
Tensor* y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace()); y->mutable_data<T>(ctx.GetPlace());
math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()( math::CrossEntropyFunctor<DeviceContext, T>()(
ctx.template device_context<platform::CPUDeviceContext>(), y, x, labels, ctx.template device_context<DeviceContext>(), y, x, labels,
ctx.Attr<bool>("soft_label")); ctx.Attr<bool>("soft_label"));
} }
}; };
template <typename T> template <typename T>
class XeSoftlabelGradFunctor {
public:
XeSoftlabelGradFunctor(T* dx,
const T* dy, // NOLINT
const T* x, // NOLINT
const T* label, // NOLINT
size_t num_classes)
: dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
HOSTDEVICE void operator()(size_t i) {
auto row_ids = i / num_classes_;
dx_[i] = -label_[i] * dy_[row_ids] / x_[i];
}
private:
T* dx_;
const T* dy_;
const T* x_;
const T* label_;
size_t num_classes_;
};
template <typename T>
class XeGradFunctor {
public:
XeGradFunctor(T* dx,
const T* dy, // NOLINT
const T* x, // NOLINT
const int64_t* label, // NOLINT
size_t num_classes)
: dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
HOSTDEVICE void operator()(size_t sample_id) {
auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
for (size_t x_offset = sample_id * num_classes_;
x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
dx_[x_offset] = x_offset != x_is_true_offset
? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
}
}
private:
T* dx_;
const T* dy_;
const T* x_;
const int64_t* label_;
size_t num_classes_;
};
template <typename DeviceContext, typename T>
class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* label = ctx.Input<Tensor>("Label");
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());

    int64_t class_num = x->dims()[1];
    if (ctx.Attr<bool>("soft_label")) {
      XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
                                        label->data<T>(),
                                        static_cast<size_t>(class_num));
      platform::ForRange<DeviceContext> for_range(
          ctx.template device_context<DeviceContext>(),
          static_cast<size_t>(dx->numel()));
      for_range(functor);
    } else {
      XeGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
                               label->data<int64_t>(),
                               static_cast<size_t>(class_num));
      platform::ForRange<DeviceContext> for_range(
          ctx.template device_context<DeviceContext>(),
          static_cast<size_t>(dy->numel()));
      for_range(functor);
    }
  }
};
......
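As a side note on the hunk above: XeGradFunctor fills one row of dX per sample, and platform::ForRange simply invokes that body once per index on either the CPU or the CUDA device. Below is a minimal standalone sketch of the same hard-label gradient arithmetic (illustrative only, no Paddle dependencies, toy values assumed).

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hard-label cross-entropy gradient, the math XeGradFunctor applies per
  // sample: dX[i][j] = -dY[i] / X[i][j] if j == label[i], else 0.
  const int batch = 2, classes = 3;
  std::vector<float> x = {0.2f, 0.5f, 0.3f, 0.7f, 0.1f, 0.2f};  // batch x classes
  std::vector<float> dy = {1.0f, 1.0f};
  std::vector<std::int64_t> label = {1, 0};
  std::vector<float> dx(batch * classes, 0.0f);
  for (int i = 0; i < batch; ++i) {
    int hit = i * classes + static_cast<int>(label[i]);
    dx[hit] = -dy[i] / x[hit];
  }
  for (int i = 0; i < batch; ++i)
    std::printf("%g %g %g\n", dx[i * classes], dx[i * classes + 1],
                dx[i * classes + 2]);
  return 0;
}
```

The soft-label branch is analogous, except every entry of a row receives -label * dY / X, which is exactly what XeSoftlabelGradFunctor computes per element.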
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class DecayedAdagradOp : public framework::OperatorWithKernel { class DecayedAdagradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -51,6 +52,12 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { ...@@ -51,6 +52,12 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("MomentOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
......
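The GetExpectedKernelType override added in this hunk makes the optimizer pick its kernel from the dtype of the Param tensor rather than a default. A hypothetical standalone illustration of that dispatch idea follows (the names here are made up for the sketch, they are not Paddle's API).

```cpp
#include <cstdio>

// Stand-in for the idea: select the typed kernel from the runtime dtype of
// the Param tensor instead of assuming a fixed element type.
enum class DataType { kFP32, kFP64 };

template <typename T>
void ApplyDecayedAdagrad(/* param, grad, moment, ... */) {
  std::printf("running kernel for %zu-byte floats\n", sizeof(T));
}

void Run(DataType param_dtype) {
  switch (param_dtype) {
    case DataType::kFP32: ApplyDecayedAdagrad<float>(); break;
    case DataType::kFP64: ApplyDecayedAdagrad<double>(); break;
  }
}

int main() {
  Run(DataType::kFP64);  // a Param stored as double selects the FP64 kernel
  return 0;
}
```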
...@@ -211,6 +211,11 @@ void AsyncGRPCServer::WaitClientGet(int count) { ...@@ -211,6 +211,11 @@ void AsyncGRPCServer::WaitClientGet(int count) {
} }
} }
void AsyncGRPCServer::WaitServerReady() {
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
}
void AsyncGRPCServer::RunSyncUpdate() { void AsyncGRPCServer::RunSyncUpdate() {
::grpc::ServerBuilder builder; ::grpc::ServerBuilder builder;
builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(), builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
...@@ -244,6 +249,12 @@ void AsyncGRPCServer::RunSyncUpdate() { ...@@ -244,6 +249,12 @@ void AsyncGRPCServer::RunSyncUpdate() {
t_prefetch_.reset(new std::thread( t_prefetch_.reset(new std::thread(
std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
"cq_prefetch", prefetch_register))); "cq_prefetch", prefetch_register)));
{
std::lock_guard<std::mutex> lock(this->mutex_ready_);
ready_ = 1;
}
condition_ready_.notify_all();
// wait server // wait server
server_->Wait(); server_->Wait();
t_send_->join(); t_send_->join();
...@@ -261,8 +272,8 @@ void AsyncGRPCServer::ShutdownQueue() { ...@@ -261,8 +272,8 @@ void AsyncGRPCServer::ShutdownQueue() {
  // This URL explains why shutdown is complicated:
void AsyncGRPCServer::ShutDown() { void AsyncGRPCServer::ShutDown() {
is_shut_down_ = true; is_shut_down_ = true;
server_->Shutdown();
ShutdownQueue(); ShutdownQueue();
server_->Shutdown();
} }
void AsyncGRPCServer::TryToRegisterNewSendOne() { void AsyncGRPCServer::TryToRegisterNewSendOne() {
......
...@@ -45,10 +45,10 @@ class RequestBase; ...@@ -45,10 +45,10 @@ class RequestBase;
class AsyncGRPCServer final { class AsyncGRPCServer final {
public: public:
explicit AsyncGRPCServer(const std::string &address, bool sync_mode) explicit AsyncGRPCServer(const std::string &address, bool sync_mode)
: address_(address), sync_mode_(sync_mode) {} : address_(address), sync_mode_(sync_mode), ready_(0) {}
~AsyncGRPCServer() {} ~AsyncGRPCServer() {}
void WaitServerReady();
void RunSyncUpdate(); void RunSyncUpdate();
// functions to sync server barrier status. // functions to sync server barrier status.
...@@ -120,6 +120,10 @@ class AsyncGRPCServer final { ...@@ -120,6 +120,10 @@ class AsyncGRPCServer final {
framework::ProgramDesc *program_; framework::ProgramDesc *program_;
framework::Executor *executor_; framework::Executor *executor_;
int selected_port_; int selected_port_;
std::mutex mutex_ready_;
std::condition_variable condition_ready_;
int ready_;
}; };
}; // namespace detail }; // namespace detail
......
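The ready_/condition_ready_ members added above implement a standard condition-variable handshake, so a WaitServerReady() call cannot miss a notification that fires before the wait begins. A minimal standalone sketch of that pattern (plain C++, no gRPC):

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

int main() {
  std::mutex mutex_ready;
  std::condition_variable condition_ready;
  int ready = 0;

  // The "server" thread flips the flag under the mutex, then notifies.
  std::thread server([&] {
    // ... bind ports, start completion-queue threads ...
    {
      std::lock_guard<std::mutex> lock(mutex_ready);
      ready = 1;
    }
    condition_ready.notify_all();
  });

  // The caller waits with a predicate, so an early notification is not lost.
  {
    std::unique_lock<std::mutex> lock(mutex_ready);
    condition_ready.wait(lock, [&] { return ready == 1; });
  }
  std::printf("server is ready\n");
  server.join();
  return 0;
}
```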
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#ifdef __NVCC__ #ifdef __NVCC__
#include <cuda.h> #include <cuda.h>
#include <thrust/iterator/iterator_adaptor.h> #include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif #endif
...@@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, ...@@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
} }
#ifdef __NVCC__ #ifdef __NVCC__
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
const int warpSize = 32;
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::__shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
__syncthreads();
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::__shfl_down_sync(mask, val, offset);
}
return val;
}
template <typename T, typename DX_OP, typename DY_OP> template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel( static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w, const T* x, const T* y, const T* out, const T* dout, int h, int w,
...@@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( ...@@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
if (dy) { if (dy) {
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
...@@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( ...@@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
if (dy) { if (dy) {
int h = pre * post; int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
......
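The reduceSum removed here is now provided by paddle/fluid/platform/cuda_device_function.h as paddle::platform::reduceSum, which is why the include is added above. For reference, a simplified standalone sketch of the warp-shuffle reduction idea it builds on (compile with nvcc; this toy version only handles full warps and combines warp totals with atomicAdd, unlike the shared-memory combine in the original):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void WarpSumKernel(const float* in, float* out, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float val = tid < n ? in[tid] : 0.0f;
  // All 32 lanes of each warp participate, so the full mask is safe here.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  if ((threadIdx.x & 31) == 0) atomicAdd(out, val);  // lane 0 holds the warp sum
}

int main() {
  const int n = 1024;  // multiple of the warp size, so every warp is full
  float h_in[n], h_out = 0.0f;
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_out, &h_out, sizeof(float), cudaMemcpyHostToDevice);
  WarpSumKernel<<<n / 256, 256>>>(d_in, d_out, n);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum = %.0f (expected %d)\n", h_out, n);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```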
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class FTRLOp : public framework::OperatorWithKernel { class FTRLOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -53,6 +54,12 @@ class FTRLOp : public framework::OperatorWithKernel { ...@@ -53,6 +54,12 @@ class FTRLOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("SquaredAccumOut", param_dim); ctx->SetOutputDim("SquaredAccumOut", param_dim);
ctx->SetOutputDim("LinearAccumOut", param_dim); ctx->SetOutputDim("LinearAccumOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -71,7 +71,7 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -71,7 +71,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
void GetIdByServer(framework::Scope* scope, void GetIdByServer(framework::Scope* scope,
const platform::DeviceContext& dev_ctx) const { const platform::DeviceContext& dev_ctx) const {
std::string endpoint = Attr<std::string>("endpoint"); std::string endpoint = Attr<std::string>("endpoint");
    rpc_service_ = new detail::AsyncGRPCServer(endpoint, true);
framework::ProgramDesc empty_program; framework::ProgramDesc empty_program;
framework::Executor executor(dev_ctx.GetPlace()); framework::Executor executor(dev_ctx.GetPlace());
rpc_service_->SetScope(scope); rpc_service_->SetScope(scope);
...@@ -79,21 +79,21 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -79,21 +79,21 @@ class GenNCCLIdOp : public framework::OperatorBase {
rpc_service_->SetProgram(&empty_program); rpc_service_->SetProgram(&empty_program);
rpc_service_->SetExecutor(&executor); rpc_service_->SetExecutor(&executor);
    server_thread_.reset(new std::thread(
        std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, rpc_service_)));
rpc_service_->SetCond(0); rpc_service_->SetCond(0);
VLOG(3) << "start getting nccl id from trainer 0..."; VLOG(3) << "start getting nccl id from trainer 0...";
auto recv = rpc_service_->Get(); auto recv = rpc_service_->Get();
VLOG(3) << "got nccl id and stop server..."; VLOG(3) << "got nccl id and stop server...";
    rpc_service_->ShutDown();
VLOG(3) << "rpc server stopped"; VLOG(3) << "rpc server stopped";
// TODO(wuyi): reinit nccl communicators // TODO(wuyi): reinit nccl communicators
server_thread_->join();
delete rpc_service_;
} }
protected: protected:
  mutable detail::AsyncGRPCServer* rpc_service_ = nullptr;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
}; };
......
...@@ -14,11 +14,10 @@ limitations under the License. */ ...@@ -14,11 +14,10 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,10 +86,10 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -87,10 +86,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
const T* weight_data = weight->data<T>(); const T* weight_data = weight->data<T>();
T* gate_data = gate->data<T>(); T* gate_data = gate->data<T>();
T* reset_hidden_prev_data = reset_hidden_prev->data<T>(); T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
    auto blas = math::GetBlas<DeviceContext, T>(context);
    blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
              hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
              gate_data, frame_size * 3);
// calculate activited gate // calculate activited gate
Eigen::array<int, 2> extents({{batch_size, frame_size}}); Eigen::array<int, 2> extents({{batch_size, frame_size}});
...@@ -103,11 +102,10 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -103,11 +102,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
g.slice(r_offsets, extents), g.slice(r_offsets, extents)); g.slice(r_offsets, extents), g.slice(r_offsets, extents));
auto r = g.slice(r_offsets, extents); // reset gate auto r = g.slice(r_offsets, extents); // reset gate
r_h_p.device(place) = r * h_p; // reset previous hidden state r_h_p.device(place) = r * h_p; // reset previous hidden state
    blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
              reset_hidden_prev_data, frame_size,
              weight_data + frame_size * frame_size * 2, frame_size, 1,
              gate_data + frame_size * 2, frame_size * 3);
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}}); Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
ActCompute(context.Attr<int>("activation"), place, ActCompute(context.Attr<int>("activation"), place,
...@@ -188,11 +186,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -188,11 +186,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
ActGradCompute(context.Attr<int>("activation"), place, c, c, ActGradCompute(context.Attr<int>("activation"), place, c, c,
d_g.slice(c_offsets, extents), d_h * u); d_g.slice(c_offsets, extents), d_h * u);
// backward for reset_hidden_prev // backward for reset_hidden_prev
    auto blas = math::GetBlas<DeviceContext, T>(context);
    blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
              gate_grad_data + frame_size * 2, frame_size * 3,
              weight_data + frame_size * frame_size * 2, frame_size, 0,
              reset_hidden_prev_grad_data, frame_size);
// backward for unactivated reset gate // backward for unactivated reset gate
ActGradCompute(context.Attr<int>("gate_activation"), place, r, r, ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
d_g.slice(r_offsets, extents), d_r_h_p * h_p); d_g.slice(r_offsets, extents), d_r_h_p * h_p);
...@@ -200,18 +198,15 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -200,18 +198,15 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
if (weight_grad) { if (weight_grad) {
T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace()); T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
// backward for state_weight // backward for state_weight
      blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
                reset_hidden_prev_data, frame_size,
                gate_grad_data + frame_size * 2, frame_size * 3, 0,
                weight_grad_data + frame_size * frame_size * 2, frame_size);
      // backward for update_gate_weight and reset_gate_weight
      blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
                hidden_prev_data, frame_size, gate_grad_data, frame_size * 3, 0,
                weight_grad_data, frame_size * 2);
} }
// backward for hidden_prev // backward for hidden_prev
if (hidden_prev_grad) { if (hidden_prev_grad) {
...@@ -219,11 +214,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -219,11 +214,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
hidden_prev_grad->mutable_data<T>(context.GetPlace()); hidden_prev_grad->mutable_data<T>(context.GetPlace());
auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad); auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
                gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
                hidden_prev_grad_data, frame_size);
} }
// backward for input // backward for input
if (input_grad) { if (input_grad) {
......
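These GRU kernels now go through the boolean-transpose GEMM overload of the new Blas wrapper. A standalone reference of the row-major semantics those calls assume follows (illustrative only, not Paddle's implementation; toy sizes).

```cpp
#include <cstdio>
#include <vector>

// C(M x N) = alpha * op(A)(M x K) * op(B)(K x N) + beta * C, with explicit
// leading dimensions lda/ldb/ldc in row-major storage.
static void GemmRef(bool transA, bool transB, int M, int N, int K, float alpha,
                    const float* A, int lda, const float* B, int ldb,
                    float beta, float* C, int ldc) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float sum = 0.f;
      for (int p = 0; p < K; ++p) {
        float a = transA ? A[p * lda + i] : A[i * lda + p];
        float b = transB ? B[j * ldb + p] : B[p * ldb + j];
        sum += a * b;
      }
      C[i * ldc + j] = alpha * sum + beta * C[i * ldc + j];
    }
}

int main() {
  // A 1x2 hidden slice times a 2x2 weight, accumulated into the gate
  // buffer with beta = 1, mirroring the calls above (toy values).
  std::vector<float> h = {1.f, 2.f};            // 1 x 2
  std::vector<float> w = {1.f, 0.f, 0.f, 1.f};  // 2 x 2 identity
  std::vector<float> g = {0.5f, 0.5f};          // 1 x 2, accumulated into
  GemmRef(false, false, 1, 2, 2, 1.f, h.data(), 2, w.data(), 2, 1.f, g.data(), 2);
  std::printf("%g %g\n", g[0], g[1]);           // expected: 1.5 2.5
  return 0;
}
```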
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
...@@ -46,9 +46,9 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> { ...@@ -46,9 +46,9 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> {
} }
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
    math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
        false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
        out->data<T>());
} }
private: private:
...@@ -93,9 +93,9 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> { ...@@ -93,9 +93,9 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
    math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
        true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
        out->data<T>());
} }
private: private:
......
...@@ -66,12 +66,7 @@ static void ParallelExecuteBlocks( ...@@ -66,12 +66,7 @@ static void ParallelExecuteBlocks(
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
} }
std::atomic_int ListenAndServOp::selected_port_{0};
ListenAndServOp::ListenAndServOp(const std::string &type, ListenAndServOp::ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs, const framework::VariableNameMap &inputs,
...@@ -79,15 +74,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type, ...@@ -79,15 +74,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type,
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
int ListenAndServOp::GetSelectedPort() const {
return rpc_service_->GetSelectedPort();
}
void ListenAndServOp::Stop() { void ListenAndServOp::Stop() {
rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
server_thread_->join(); server_thread_->join();
} }
void ListenAndServOp::SavePort(const std::string &file_path) const {
// NOTE: default write file to /tmp/paddle.selected_port
selected_port_ = rpc_service_->GetSelectedPort();
std::ofstream port_file;
port_file.open(file_path);
port_file << selected_port_.load();
port_file.close();
VLOG(4) << "selected port written to " << file_path;
}
void ListenAndServOp::WaitServerReady() {
while (selected_port_.load() == 0) {
}
}
void ListenAndServOp::RunSyncLoop(framework::Executor *executor, void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program,
framework::Scope *recv_scope, framework::Scope *recv_scope,
...@@ -318,9 +325,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -318,9 +325,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// start the server listening after all member initialized. // start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_)); server_thread_.reset(new std::thread(RunServer, rpc_service_));
VLOG(3) << "wait server thread to become ready..."; VLOG(3) << "wait server thread to become ready...";
  rpc_service_->WaitServerReady();

  // Write to a file of server selected port for python use.
  std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port",
                                          static_cast<int>(::getpid()));
  SavePort(file_path);
if (sync_mode) { if (sync_mode) {
RunSyncLoop(&executor, program, &recv_scope, prefetch_block); RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
} else { } else {
......
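The sleep(5) is replaced by an explicit handshake: the listening thread publishes its port through the static atomic selected_port_, callers spin in WaitServerReady(), and SavePort() writes a pid-suffixed file for the Python side to read. A minimal standalone sketch of that flow (names here are illustrative, and POSIX getpid is assumed):

```cpp
#include <atomic>
#include <chrono>
#include <cstdio>
#include <fstream>
#include <sstream>
#include <thread>
#include <unistd.h>

std::atomic_int selected_port{0};

int main() {
  // The "server" thread publishes the chosen port once the socket is bound.
  std::thread server([] {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    selected_port.store(6174);
  });

  // WaitServerReady-style busy wait until the port is published.
  while (selected_port.load() == 0) {
  }

  // SavePort-style dump to a pid-suffixed file for the Python launcher.
  std::ostringstream path;
  path << "/tmp/paddle." << ::getpid() << ".selected_port";
  std::ofstream port_file(path.str());
  port_file << selected_port.load();
  port_file.close();
  std::printf("selected port %d written to %s\n", selected_port.load(),
              path.str().c_str());
  server.join();
  return 0;
}
```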
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
#include <atomic>
#include <ostream> #include <ostream>
#include <string> #include <string>
...@@ -39,8 +40,6 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -39,8 +40,6 @@ class ListenAndServOp : public framework::OperatorBase {
const framework::VariableNameMap& outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs); const framework::AttributeMap& attrs);
int GetSelectedPort() const;
void RunSyncLoop(framework::Executor* executor, void RunSyncLoop(framework::Executor* executor,
framework::ProgramDesc* program, framework::ProgramDesc* program,
framework::Scope* recv_scope, framework::Scope* recv_scope,
...@@ -49,14 +48,25 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -49,14 +48,25 @@ class ListenAndServOp : public framework::OperatorBase {
void RunAsyncLoop(framework::Executor* executor, void RunAsyncLoop(framework::Executor* executor,
framework::ProgramDesc* program) const; framework::ProgramDesc* program) const;
void SavePort(
const std::string& file_path = "/tmp/paddle.selected_port") const;
void WaitServerReady();
int GetSelectedPort() { return selected_port_; }
void Stop() override; void Stop() override;
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override; const platform::Place& dev_place) const override;
static void ResetPort() { selected_port_ = 0; }
protected: protected:
mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_; mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
// FIXME(wuyi): it's static so that the operator can be cloned.
static std::atomic_int selected_port_;
}; };
} // namespace operators } // namespace operators
......
...@@ -46,8 +46,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -46,8 +46,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
auto* lod = lod_t->data<int>(); auto* lod = lod_t->data<int>();
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
framework::Tensor lod_cpu; framework::Tensor lod_cpu;
framework::TensorCopy(*lod_t, platform::CPUPlace(), framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu);
ctx.device_context(), &lod_cpu);
lod = lod_cpu.data<int>(); lod = lod_cpu.data<int>();
} }
level0 = std::vector<int>(lod, lod + lod_t->numel()); level0 = std::vector<int>(lod, lod + lod_t->numel());
......
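TensorCopySync is used here because lod_cpu is dereferenced on the host immediately after the copy. A standalone CUDA sketch of the distinction (compile with nvcc; pinned host memory is used so the asynchronous copy really is asynchronous):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const int n = 4;
  int h_src[n] = {0, 2, 5, 9};
  int* d_lod = nullptr;
  int* h_lod = nullptr;
  cudaMalloc(&d_lod, n * sizeof(int));
  cudaMallocHost(&h_lod, n * sizeof(int));  // pinned, so the copy can overlap
  cudaMemcpy(d_lod, h_src, n * sizeof(int), cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(h_lod, d_lod, n * sizeof(int), cudaMemcpyDeviceToHost, stream);
  // Without this synchronization the host could read h_lod before the copy
  // has finished; a synchronous copy folds the wait into the call itself.
  cudaStreamSynchronize(stream);
  for (int i = 0; i < n; ++i) std::printf("%d ", h_lod[i]);
  std::printf("\n");
  cudaStreamDestroy(stream);
  cudaFreeHost(h_lod);
  cudaFree(d_lod);
  return 0;
}
```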
...@@ -15,9 +15,9 @@ limitations under the License. */ ...@@ -15,9 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
namespace paddle { namespace paddle {
...@@ -114,6 +114,7 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -114,6 +114,7 @@ class LSTMKernel : public framework::OpKernel<T> {
auto cand_act = math::detail::GetActivationType( auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation")); ctx.Attr<std::string>("candidate_activation"));
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -129,9 +130,8 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -129,9 +130,8 @@ class LSTMKernel : public framework::OpKernel<T> {
int pre_h_start = static_cast<int>(batch_starts[n - 1]); int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size; int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
        blas.MatMul(pre_hidden_t, false, *weight, false, static_cast<T>(1.0),
                    &gate_t, static_cast<T>(1.0));
} else if (hidden_t0) { } else if (hidden_t0) {
// If n == 0 and there is no initialized hidden state, that is to say // If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped. // the H0 is zeros, the calculation W_h * H0 will be skiped.
...@@ -143,9 +143,8 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -143,9 +143,8 @@ class LSTMKernel : public framework::OpKernel<T> {
Tensor ordered_h0; Tensor ordered_h0;
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order, ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
&ordered_h0, true); &ordered_h0, true);
        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
                    &gate_t, static_cast<T>(1.0));
} }
lstm_value.gate_value = gate_t.data<T>(); lstm_value.gate_value = gate_t.data<T>();
...@@ -282,6 +281,7 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -282,6 +281,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -320,29 +320,25 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -320,29 +320,25 @@ class LSTMGradKernel : public framework::OpKernel<T> {
int pre_h_start = static_cast<int>(batch_starts[n - 1]); int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size; int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
                    &pre_hidden_g, static_cast<T>(1.0));
if (weight_g) { if (weight_g) {
/* backward weight */ /* backward weight */
auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
          blas.MatMul(pre_hidden, true, gate_g, false, static_cast<T>(1.0),
                      weight_g, static_cast<T>(1.0));
} }
} else { } else {
if (h0 && weight_g) { if (h0 && weight_g) {
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order, ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
&ordered_h0, true); &ordered_h0, true);
          blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
                      weight_g, static_cast<T>(1.0));
} }
if (h0 && h0_g) { if (h0 && h0_g) {
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace()); ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
          blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
                      &ordered_h0_g, static_cast<T>(0.0));
} }
} }
} }
......
...@@ -14,15 +14,14 @@ limitations under the License. */ ...@@ -14,15 +14,14 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -143,7 +142,7 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -143,7 +142,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto proj_act = math::detail::GetActivationType( auto proj_act = math::detail::GetActivationType(
ctx.Attr<std::string>("proj_activation")); ctx.Attr<std::string>("proj_activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -160,9 +159,8 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -160,9 +159,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
int pre_h_start = static_cast<int>(batch_starts[n - 1]); int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size; int pre_h_end = pre_h_start + cur_batch_size;
auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
        blas.MatMul(pre_proj_t, false, *weight, false, static_cast<T>(1.0),
                    &gate_t, static_cast<T>(1.0));
} else if (hidden_t0) { } else if (hidden_t0) {
// If n == 0 and there is no initialized hidden state, that is to say // If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped. // the H0 is zeros, the calculation W_h * H0 will be skiped.
...@@ -176,16 +174,14 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -176,16 +174,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
ordered_proj0->mutable_data<T>(ctx.GetPlace()); ordered_proj0->mutable_data<T>(ctx.GetPlace());
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order, ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
&ordered_h0, true); &ordered_h0, true);
        blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
                    ordered_proj0, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) { if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0); auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
ActCompute(cell_act, place, proj0_dev, proj0_dev); ActCompute(cell_act, place, proj0_dev, proj0_dev);
} }
        blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
                    &gate_t, static_cast<T>(1.0));
} }
lstmp_value.gate_value = gate_t.data<T>(); lstmp_value.gate_value = gate_t.data<T>();
...@@ -196,9 +192,8 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -196,9 +192,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act); cell_act, cand_act);
lstmp_value.prev_state_value = lstmp_value.state_value; lstmp_value.prev_state_value = lstmp_value.state_value;
      blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
                  &proj_t, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) { if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj_t_dev = EigenMatrix<T>::From(proj_t); auto proj_t_dev = EigenMatrix<T>::From(proj_t);
ActCompute(cell_act, place, proj_t_dev, proj_t_dev); ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
...@@ -361,6 +356,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -361,6 +356,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -375,15 +371,13 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -375,15 +371,13 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
} }
      /* hidden state backward */
Tensor out_g = batch_hidden_g.Slice(bstart, bend); Tensor out_g = batch_hidden_g.Slice(bstart, bend);
      blas.MatMul(proj_g, false, *proj_weight, true, static_cast<T>(1.0),
                  &out_g, static_cast<T>(0.0));
/* projection weight backward*/ /* projection weight backward*/
if (proj_weight_g) { if (proj_weight_g) {
Tensor hidden_t = batch_hidden->Slice(bstart, bend); Tensor hidden_t = batch_hidden->Slice(bstart, bend);
        blas.MatMul(hidden_t, true, proj_g, false, static_cast<T>(1.0),
                    proj_weight_g, static_cast<T>(1.0));
} }
Tensor gate = batch_gate->Slice(bstart, bend); Tensor gate = batch_gate->Slice(bstart, bend);
...@@ -419,24 +413,21 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -419,24 +413,21 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
int pre_h_start = static_cast<int>(batch_starts[n - 1]); int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size; int pre_h_end = pre_h_start + cur_batch_size;
auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
                    &pre_proj_g, static_cast<T>(1.0));
if (weight_g) { if (weight_g) {
/* weight backward*/ /* weight backward*/
auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
          blas.MatMul(pre_proj, true, gate_g, false, static_cast<T>(1.0),
                      weight_g, static_cast<T>(1.0));
} }
} else { } else {
if (h0 && weight_g) { if (h0 && weight_g) {
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order, ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
&ordered_h0, true); &ordered_h0, true);
if (weight_g) { if (weight_g) {
            blas.MatMul(*ordered_proj0, true, gate_g, false,
                        static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
} }
} }
if (h0 && (h0_g || proj_weight_g)) { if (h0 && (h0_g || proj_weight_g)) {
...@@ -444,9 +435,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -444,9 +435,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
Tensor proj0_g; Tensor proj0_g;
proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
proj0_g.mutable_data<T>(ctx.GetPlace()); proj0_g.mutable_data<T>(ctx.GetPlace());
          blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
                      &proj0_g, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) { if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0); auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
auto proj0_g_dev = EigenMatrix<T>::From(proj0_g); auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
...@@ -454,14 +444,12 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -454,14 +444,12 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
proj0_g_dev); proj0_g_dev);
} }
if (h0_g) { if (h0_g) {
            blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
                        &ordered_h0_g, static_cast<T>(0.0));
} }
if (proj_weight_g) { if (proj_weight_g) {
            blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
                        proj_weight_g, static_cast<T>(1.0));
} }
} }
} }
......
...@@ -41,7 +41,8 @@ math_library(depthwise_conv) ...@@ -41,7 +41,8 @@ math_library(depthwise_conv)
math_library(gru_compute DEPS activation_functions math_function) math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col) math_library(im2col)
math_library(lstm_compute DEPS activation_functions) math_library(lstm_compute DEPS activation_functions)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas)
math_library(maxouting) math_library(maxouting)
math_library(pooling) math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function) math_library(selected_rows_functor DEPS selected_rows math_function)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
namespace math {
// Do nothing. Blas is a header only library.
} // namespace math
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h>
#include <mkl_lapacke.h>
#include <mkl_vml_functions.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#include <lapacke.h>
#endif
#ifndef LAPACK_FOUND
extern "C" {
#include <cblas.h> // NOLINT
int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
int* ipiv);
int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
int* ipiv);
int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
const int* ipiv);
int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
const int* ipiv);
}
#endif
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext>
class Blas {
public:
explicit Blas(const DeviceContext& context) : context_(context) {}
template <typename T>
void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T* A, const T* B, T beta, T* C) const;
template <typename T>
void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
template <typename T>
void MatMul(const framework::Tensor& mat_a, bool trans_a,
const framework::Tensor& mat_b, bool trans_b, T alpha,
framework::Tensor* mat_out, T beta) const;
template <typename T>
void MatMul(const framework::Tensor& mat_a, bool trans_a,
const framework::Tensor& mat_b, bool trans_b,
framework::Tensor* mat_out) const {
MatMul(mat_a, trans_a, mat_b, trans_b, static_cast<T>(1.0), mat_out,
static_cast<T>(0.0));
}
template <typename T>
void MatMul(const framework::Tensor& mat_a, const framework::Tensor& mat_b,
framework::Tensor* mat_out) const {
this->template MatMul<T>(mat_a, false, mat_b, false, mat_out);
}
template <typename T>
void AXPY(int n, T alpha, const T* x, T* y) const;
template <typename T>
void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
T* C) const;
template <typename T>
void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T* A, const T* B, T beta, T* C,
int batchCount, int64_t strideA, int64_t strideB) const;
private:
const DeviceContext& context_;
};
template <typename DeviceContext, typename T>
class BlasT : private Blas<DeviceContext> {
public:
using Blas<DeviceContext>::Blas;
template <typename... ARGS>
void GEMM(ARGS... args) const {
Base()->template GEMM<T>(args...);
}
template <typename... ARGS>
void MatMul(ARGS... args) const {
Base()->template MatMul<T>(args...);
}
template <typename... ARGS>
void AXPY(ARGS... args) const {
Base()->template AXPY<T>(args...);
}
template <typename... ARGS>
void GEMV(ARGS... args) const {
Base()->template GEMV<T>(args...);
}
template <typename... ARGS>
void BatchedGEMM(ARGS... args) const {
Base()->template BatchedGEMM<T>(args...);
}
private:
const Blas<DeviceContext>* Base() const {
return static_cast<const Blas<DeviceContext>*>(this);
}
};
template <typename DeviceContext, typename T>
inline BlasT<DeviceContext, T> GetBlas(
const framework::ExecutionContext& exe_ctx) {
return BlasT<DeviceContext, T>(
exe_ctx.template device_context<DeviceContext>());
}
template <typename DeviceContext, typename T>
inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
return BlasT<DeviceContext, T>(dev_ctx);
}
} // namespace math
} // namespace operators
} // namespace paddle
#include "paddle/fluid/operators/math/blas_impl.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/math/blas_impl.cu.h"
#endif
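A note on the header above: BlasT privately inherits Blas and re-exposes each method with the element type already bound, so kernels can write blas.GEMM(...) or blas.MatMul(...) without spelling out <T> at every call site. A standalone analogue of that layering (a toy AXPY stands in for the real BLAS calls):

```cpp
#include <cstdio>

template <typename DeviceContext>
class Blas {
 public:
  explicit Blas(const DeviceContext& ctx) : ctx_(ctx) {}

  // Method template: the element type is a per-call parameter.
  template <typename T>
  void AXPY(int n, T alpha, const T* x, T* y) const {
    for (int i = 0; i < n; ++i) y[i] += alpha * x[i];
  }

 private:
  const DeviceContext& ctx_;
};

// Thin typed wrapper: binds T once and forwards every call.
template <typename DeviceContext, typename T>
class BlasT : private Blas<DeviceContext> {
 public:
  using Blas<DeviceContext>::Blas;

  template <typename... ARGS>
  void AXPY(ARGS... args) const {
    Base()->template AXPY<T>(args...);
  }

 private:
  const Blas<DeviceContext>* Base() const {
    return static_cast<const Blas<DeviceContext>*>(this);
  }
};

struct CPUDeviceContext {};  // placeholder for the real device context

int main() {
  CPUDeviceContext ctx;
  BlasT<CPUDeviceContext, float> blas(ctx);
  float x[3] = {1.f, 2.f, 3.f};
  float y[3] = {0.f, 0.f, 0.f};
  blas.AXPY(3, 2.0f, x, y);  // y = 2 * x, no <float> needed at the call site
  std::printf("%g %g %g\n", y[0], y[1], y[2]);
  return 0;
}
```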
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/dynload/cublas.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct CUBlas;
template <>
struct CUBlas<float> {
template <typename... ARGS>
static void GEMM(ARGS... args) {
PADDLE_ENFORCE(platform::dynload::cublasSgemm(args...));
}
template <typename... ARGS>
static void AXPY(ARGS... args) {
PADDLE_ENFORCE(platform::dynload::cublasSaxpy(args...));
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
PADDLE_ENFORCE(platform::dynload::cublasSgemv(args...));
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...));
#else
PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
};
template <>
struct CUBlas<double> {
template <typename... ARGS>
static void GEMM(ARGS... args) {
PADDLE_ENFORCE(platform::dynload::cublasDgemm(args...));
}
template <typename... ARGS>
static void AXPY(ARGS... args) {
PADDLE_ENFORCE(platform::dynload::cublasDaxpy(args...));
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
PADDLE_ENFORCE(platform::dynload::cublasDgemv(args...));
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...));
#else
PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
};
template <>
struct CUBlas<platform::float16> {
using float16 = platform::float16;
static void GEMM(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float16 *alpha, const float16 *A, int lda,
const float16 *B, int ldb, const float16 *beta, float16 *C,
int ldc) {
PADDLE_ENFORCE(
platform::dynload::cublasHgemm(handle, transa, transb, m, n, k,
reinterpret_cast<const __half *>(alpha),
reinterpret_cast<const __half *>(A), lda,
reinterpret_cast<const __half *>(B), ldb,
reinterpret_cast<const __half *>(beta),
reinterpret_cast<__half *>(C), ldc));
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(args...));
#else
PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
};
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M,
int N, int K, T alpha, const T *A,
const T *B, T beta, T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, N);
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 *A,
const platform::float16 *B, platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
"cublas fp16 gemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
if (context_.GetComputeCapability() >= 70) {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} else {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
context_.cublas_handle(), CUBLAS_DEFAULT_MATH));
}
#endif // CUDA_VERSION >= 9000
// cublasHgemm does true FP16 computation which is slow for non-Volta
  // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo));
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
  // h_alpha/h_beta only exist in the CUDA >= 8000 branch, so pass the
  // float16 arguments directly to the Hgemm wrapper declared above.
  CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
                                  N, M, K, &alpha, B, ldb, A, lda, &beta, C,
                                  N);
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
T *y) const {
CUBlas<T>::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1);
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
T alpha, const T *A, const T *B,
T beta, T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBlas<T>::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1,
&beta, C, 1);
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
int64_t strideA, int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
CUBlas<T>::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
&alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc,
strideC, batchCount);
}
} // namespace math
} // namespace operators
} // namespace paddle
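Every GEMM wrapper above carries the same note: cuBLAS follows Fortran (column-major) order, so a row-major C = A * B is issued as the column-major product C^T = B^T * A^T. That is why the calls pass cuTransB before cuTransA, swap the M/N extents, and hand B before A. The following standalone sketch (plain C++, no cuBLAS; the toy gemm_colmajor helper is invented purely for illustration) shows why the swap yields the row-major result:

#include <cstdio>

// Toy column-major GEMM (invented for illustration):
// C(m x n) = alpha * A(m x k) * B(k x n) + beta * C, all column-major.
void gemm_colmajor(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
  }
}

int main() {
  const int M = 2, N = 3, K = 4;
  float A[M * K], B[K * N], C[M * N] = {};  // row-major buffers
  for (int i = 0; i < M * K; ++i) A[i] = static_cast<float>(i + 1);
  for (int i = 0; i < K * N; ++i) B[i] = static_cast<float>(i + 1);
  // Row-major C = A * B is the column-major product C^T = B^T * A^T, i.e. the
  // same operand swap used above: pass (B, ldb = N) first, (A, lda = K)
  // second, swap M and N, and give the output leading dimension N.
  gemm_colmajor(N, M, K, 1.f, B, N, A, K, 0.f, C, N);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) std::printf("%6.1f", C[i * N + j]);
    std::printf("\n");  // prints 70 80 90 / 158 184 210
  }
  return 0;
}

Interpreting the row-major buffers as their column-major transposes costs nothing, so no data is rearranged; only the call arguments change.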
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct CBlas;
template <>
struct CBlas<float> {
template <typename... ARGS>
static void GEMM(ARGS... args) {
cblas_sgemm(args...);
}
template <typename... ARGS>
static void AXPY(ARGS... args) {
cblas_saxpy(args...);
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
cblas_sgemv(args...);
}
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
cblas_sgemm_batch(args...);
}
#endif
};
template <>
struct CBlas<double> {
template <typename... ARGS>
static void GEMM(ARGS... args) {
cblas_dgemm(args...);
}
template <typename... ARGS>
static void AXPY(ARGS... args) {
cblas_daxpy(args...);
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
cblas_dgemv(args...);
}
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
cblas_dgemm_batch(args...);
}
#endif
};
template <>
struct CBlas<platform::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
}
#endif
};
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M,
int N, int K, T alpha, const T *A,
const T *B, T beta, T *C) const {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
  CBlas<T>::GEMM(CblasRowMajor, transA ? CblasTrans : CblasNoTrans,
                 transB ? CblasTrans : CblasNoTrans, M, N, K, alpha, A, lda, B,
                 ldb, beta, C, ldc);
}
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
const framework::Tensor &mat_b, bool trans_b,
T alpha, framework::Tensor *mat_out,
T beta) const {
auto dim_a = mat_a.dims();
auto dim_b = mat_b.dims();
auto dim_out = mat_out->dims();
  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
                 "The input and output of matmul must be matrices");
  PADDLE_ENFORCE(
      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(),
      "The places of the matrices must be the same");
int M = dim_out[0];
int N = dim_out[1];
int K = !trans_a ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans;
this->GEMM(transA, transB, M, N, K, alpha, mat_a.data<T>(), mat_b.data<T>(),
beta, mat_out->data<T>());
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
T *y) const {
CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta,
T *C) const {
CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
int64_t strideA, int64_t strideB) const {
#ifdef PADDLE_WITH_MKLML
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
auto a_array = std::vector<const T *>(batchCount);
auto b_array = std::vector<const T *>(batchCount);
auto c_array = std::vector<T *>(batchCount);
for (int k = 0; k < batchCount; ++k) {
a_array[k] = &A[k * strideA];
b_array[k] = &B[k * strideB];
c_array[k] = &C[k * M * N];
}
CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
a_array.data(), &lda, b_array.data(), &ldb, &beta,
c_array.data(), &ldc, 1 /* group_count */, &batchCount);
#else
  for (int k = 0; k < batchCount; ++k) {
    const T *Ak = &A[k * strideA];
    const T *Bk = &B[k * strideB];
    T *Ck = &C[k * M * N];
    this->template GEMM<T>(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck);
  }
}
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle
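For context, a hedged usage sketch of the new Blas facade from an operator kernel: GetBlas and MatMul appear throughout the patches below, but the includes and the tensor signatures here are assumptions about the surrounding fluid API rather than part of this commit.

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"

template <typename DeviceContext>
void MatMulExample(const DeviceContext &dev_ctx,
                   const paddle::framework::Tensor &a,  // [M, K]
                   const paddle::framework::Tensor &b,  // [K, N]
                   paddle::framework::Tensor *out) {    // [M, N], same place
  auto blas = paddle::operators::math::GetBlas<DeviceContext, float>(dev_ctx);
  // out = 1.0 * a * b + 0.0 * out; MatMul checks that all three tensors are
  // 2-D and live on the same place before dispatching to GEMM above.
  blas.MatMul(a, false /*trans_a*/, b, false /*trans_b*/, 1.0f, out, 0.0f);
}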
...@@ -69,8 +69,8 @@ void testConcat() { ...@@ -69,8 +69,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
} }
std::vector<paddle::framework::Tensor> input; std::vector<paddle::framework::Tensor> input;
...@@ -86,8 +86,8 @@ void testConcat() { ...@@ -86,8 +86,8 @@ void testConcat() {
int* out_ptr; int* out_ptr;
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
&out_cpu); &out_cpu);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
...@@ -142,8 +142,8 @@ void testConcat() { ...@@ -142,8 +142,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
} }
input.clear(); input.clear();
...@@ -157,8 +157,8 @@ void testConcat() { ...@@ -157,8 +157,8 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
&out_cpu); &out_cpu);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
...@@ -215,8 +215,8 @@ void testConcat() { ...@@ -215,8 +215,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
} }
input.clear(); input.clear();
...@@ -230,8 +230,8 @@ void testConcat() { ...@@ -230,8 +230,8 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
&out_cpu); &out_cpu);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
...@@ -290,8 +290,8 @@ void testConcat() { ...@@ -290,8 +290,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
} }
input.clear(); input.clear();
...@@ -305,8 +305,8 @@ void testConcat() { ...@@ -305,8 +305,8 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
&out_cpu); &out_cpu);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
......
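The concat test hunks above replace the asynchronous TensorCopy overload with TensorCopySync before the results are read back on the host. A minimal sketch of the difference, assuming the usual fluid tensor_util.h header (the function below is illustrative, not part of the patch):

#include "paddle/fluid/framework/tensor_util.h"

void CopyForCheck(const paddle::framework::Tensor &src,
                  const paddle::platform::Place &dst_place,
                  const paddle::platform::DeviceContext &ctx,
                  paddle::framework::Tensor *dst) {
  // Asynchronous overload: enqueues the copy on the context's stream, so the
  // caller must synchronize (e.g. ctx.Wait()) before inspecting dst.
  paddle::framework::TensorCopy(src, dst_place, ctx, dst);
  // Synchronous variant: returns only after the copy has finished, which is
  // what the unit test above relies on before comparing element values.
  paddle::framework::TensorCopySync(src, dst_place, dst);
}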
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -211,6 +211,7 @@ class ContextProjectGradFunctor { ...@@ -211,6 +211,7 @@ class ContextProjectGradFunctor {
int input_row_begin, input_row_end; int input_row_begin, input_row_end;
int sequence_height, sequence_width; int sequence_height, sequence_width;
sequence_width = in.dims()[1]; sequence_width = in.dims()[1];
auto blas = math::GetBlas<DeviceContext, T>(context);
if (input_grad) { if (input_grad) {
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
...@@ -262,8 +263,8 @@ class ContextProjectGradFunctor { ...@@ -262,8 +263,8 @@ class ContextProjectGradFunctor {
Tensor out_t_sub = out_t.Slice(k * context_length, Tensor out_t_sub = out_t.Slice(k * context_length,
k * context_length + padding_size); k * context_length + padding_size);
Tensor w_sub = padding_data->Slice(k, k + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size);
axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1), blas.AXPY(w_sub.numel(), static_cast<T>(1), out_t_sub.data<T>(),
out_t_sub.data<T>(), w_sub.data<T>()); w_sub.data<T>());
} }
} }
if (down_pad > 0) { if (down_pad > 0) {
...@@ -294,8 +295,8 @@ class ContextProjectGradFunctor { ...@@ -294,8 +295,8 @@ class ContextProjectGradFunctor {
(down_pad_begin_row + t) * context_length); (down_pad_begin_row + t) * context_length);
Tensor w_sub = padding_data->Slice( Tensor w_sub = padding_data->Slice(
up_pad + padding_idx, up_pad + padding_idx + padding_size); up_pad + padding_idx, up_pad + padding_idx + padding_size);
axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1), blas.AXPY(w_sub.numel(), static_cast<T>(1), out_t_sub.data<T>(),
out_t_sub.data<T>(), w_sub.data<T>()); w_sub.data<T>());
} }
} }
out_t.Resize({sequence_height, context_length * sequence_width}); out_t.Resize({sequence_height, context_length * sequence_width});
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
...@@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, ...@@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
} }
} }
template <typename T>
__device__ __forceinline__ T sum_single_warp(T val) {
val += platform::__shfl_down_sync(0, val, 16);
val += platform::__shfl_down_sync(0, val, 8);
val += platform::__shfl_down_sync(0, val, 4);
val += platform::__shfl_down_sync(0, val, 2);
val += platform::__shfl_down_sync(0, val, 1);
return val;
}
// CUDA do not support dynamic arrary in template
// https://stackoverflow.com/questions/20497209
template <typename T>
struct SharedMemory {
// Ensure that we won't compile any un-specialized types
__device__ T* GetPointer() { return NULL; }
};
template <>
struct SharedMemory<float> {
__device__ float* GetPointer() {
extern __shared__ float s_float[];
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ double* GetPointer() {
extern __shared__ double s_double[];
return s_double;
}
};
template <typename T> template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) { const int class_num) {
int tid = threadIdx.x; int tid = threadIdx.x;
SharedMemory<T> d_sum_shared; T val = 0;
T* d_sum = d_sum_shared.GetPointer();
d_sum[tid] = 0;
int cur_idx = tid; int idx = blockIdx.x * class_num + tid;
int next_idx = blockIdx.x * class_num + tid; int end = blockIdx.x * class_num + class_num;
while (cur_idx < class_num) { for (; idx < end; idx += blockDim.x) {
d_sum[tid] += val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
next_idx += blockDim.x;
cur_idx += blockDim.x;
} }
__syncthreads();
for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { val = paddle::platform::reduceSum(val, tid, blockDim.x);
if (tid < stride) d_sum[tid] += d_sum[tid + stride]; if (threadIdx.x == 0) {
__syncthreads(); Y[blockIdx.x] = -val;
} }
T val = d_sum[tid];
val = sum_single_warp<T>(val);
if (tid == 0) Y[blockIdx.x] = -val;
} }
} // namespace } // namespace
...@@ -113,9 +70,7 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> { ...@@ -113,9 +70,7 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
? 512 ? 512
: pow(2, static_cast<int>(std::log2(class_num))); : pow(2, static_cast<int>(std::log2(class_num)));
SoftCrossEntropyKernel<T><<< SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>(
batch_size, block, block * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
loss_data, prob_data, label_data, class_num); loss_data, prob_data, label_data, class_num);
} else { } else {
const int64_t* label_data = labels->data<int64_t>(); const int64_t* label_data = labels->data<int64_t>();
......
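The cross-entropy rewrite above drops the hand-rolled shared-memory tree reduction (and the SharedMemory<T> workaround for dynamic shared arrays in templates) in favour of paddle::platform::reduceSum, which sums within a warp using shuffle instructions. Below is a host-side C++ simulation of that shuffle reduction over a 32-lane warp with offsets 16, 8, 4, 2, 1; the shfl_down helper is a stand-in for the hardware intrinsic, not a real API.

#include <array>
#include <cstdio>

// Stand-in for the __shfl_down_sync intrinsic: lane i reads the value held by
// lane i + offset; an out-of-range read returns the caller's own value, which
// matches the hardware behaviour.
float shfl_down(const std::array<float, 32> &lanes, int lane, int offset) {
  int src = lane + offset;
  return src < 32 ? lanes[src] : lanes[lane];
}

int main() {
  std::array<float, 32> val;
  for (int i = 0; i < 32; ++i) val[i] = static_cast<float>(i + 1);  // sum 528
  // Tree reduction: after each step, lane 0 holds the sum of twice as many
  // original lanes as before (2, 4, 8, 16, and finally all 32).
  for (int offset = 16; offset > 0; offset >>= 1) {
    std::array<float, 32> next = val;
    for (int lane = 0; lane < 32; ++lane)
      next[lane] = val[lane] + shfl_down(val, lane, offset);
    val = next;
  }
  std::printf("lane 0 holds %.1f\n", val[0]);  // prints 528.0
  return 0;
}

Only lane 0 is guaranteed to hold the complete sum, which is why the rewritten kernel writes Y[blockIdx.x] from threadIdx.x == 0.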
...@@ -43,8 +43,8 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -43,8 +43,8 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
r_prev_out = prev_output_value[i]; r_prev_out = prev_output_value[i];
} }
op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
r_value_reset_output, active_gate); &r_value_reset_output, active_gate);
update_gate[i] = r_value_update_gate; update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate; reset_gate[i] = r_value_reset_gate;
...@@ -71,8 +71,8 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -71,8 +71,8 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
r_prev_out = prev_output_value[i]; r_prev_out = prev_output_value[i];
} }
op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
r_output, active_node); &r_output, active_node);
frame_state[i] = r_value_frame_state; frame_state[i] = r_value_frame_state;
output_value[i] = r_output; output_value[i] = r_output;
...@@ -99,8 +99,8 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -99,8 +99,8 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
} }
op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
r_value_reset_output, active_gate); &r_value_reset_output, active_gate);
update_gate[i] = r_value_update_gate; update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate; reset_gate[i] = r_value_reset_gate;
...@@ -129,8 +129,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -129,8 +129,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
} }
op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
r_output, active_node); &r_output, active_node);
frame_state[i] = r_value_frame_state; frame_state[i] = r_value_frame_state;
(reinterpret_cast<__m256 *>(output_value))[i] = r_output; (reinterpret_cast<__m256 *>(output_value))[i] = r_output;
...@@ -213,9 +213,9 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, ...@@ -213,9 +213,9 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
r_prev_out_grad = prev_out_grad[i]; r_prev_out_grad = prev_out_grad[i];
} }
op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, op_state_grad(&r_update_gate_value, &r_update_gate_grad,
r_frame_state_grad, r_prev_out_value, r_prev_out_grad, &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
r_out_grad, active_node); &r_prev_out_grad, &r_out_grad, active_node);
update_gate_grad[i] = r_update_gate_grad; update_gate_grad[i] = r_update_gate_grad;
frame_state_grad[i] = r_frame_state_grad; frame_state_grad[i] = r_frame_state_grad;
...@@ -258,9 +258,9 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -258,9 +258,9 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
r_prev_out_grad = prev_out_grad[i]; r_prev_out_grad = prev_out_grad[i];
} }
op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
r_reset_output_grad, active_gate); &r_prev_out_grad, &r_reset_output_grad, active_gate);
update_gate_grad[i] = r_update_gate_grad; update_gate_grad[i] = r_update_gate_grad;
reset_gate_grad[i] = r_reset_gate_grad; reset_gate_grad[i] = r_reset_gate_grad;
...@@ -302,9 +302,9 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, ...@@ -302,9 +302,9 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
} }
op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, op_state_grad(&r_update_gate_value, &r_update_gate_grad,
r_frame_state_grad, r_prev_out_value, r_prev_out_grad, &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
r_out_grad, active_node); &r_prev_out_grad, &r_out_grad, active_node);
update_gate_grad[i] = r_update_gate_grad; update_gate_grad[i] = r_update_gate_grad;
frame_state_grad[i] = r_frame_state_grad; frame_state_grad[i] = r_frame_state_grad;
...@@ -350,9 +350,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -350,9 +350,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
} }
op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
r_reset_output_grad, active_gate); &r_prev_out_grad, &r_reset_output_grad, active_gate);
update_gate_grad[i] = r_update_gate_grad; update_gate_grad[i] = r_update_gate_grad;
reset_gate_grad[i] = r_reset_gate_grad; reset_gate_grad[i] = r_reset_gate_grad;
......
...@@ -55,8 +55,8 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, ...@@ -55,8 +55,8 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
r_prev_out = prev_output_value[frame_idx]; r_prev_out = prev_output_value[frame_idx];
} }
op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
r_value_reset_output, active_gate); &r_value_reset_output, active_gate);
gate_value[frame_idx + frame_size * 0] = r_value_update_gate; gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
...@@ -93,8 +93,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, ...@@ -93,8 +93,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
r_prev_out = prev_output_value[frame_idx]; r_prev_out = prev_output_value[frame_idx];
} }
op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
r_output, active_node); &r_output, active_node);
gate_value[frame_idx + frame_size * 2] = r_value_frame_state; gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
output_value[frame_idx] = r_output; output_value[frame_idx] = r_output;
...@@ -137,9 +137,9 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, ...@@ -137,9 +137,9 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
r_prev_out_grad = prev_out_grad[frame_idx]; r_prev_out_grad = prev_out_grad[frame_idx];
} }
op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value,
r_frame_state_grad, r_prev_out_value, r_prev_out_grad, &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad,
r_out_grad, active_node); &r_out_grad, active_node);
gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
...@@ -185,9 +185,9 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -185,9 +185,9 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
r_reset_output_grad = reset_output_grad[frame_idx]; r_reset_output_grad = reset_output_grad[frame_idx];
} }
op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, op_reset_grad(&r_update_gate_value, &r_update_gate_grad, &r_reset_gate_value,
r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, &r_reset_gate_grad, &r_prev_out_value, &r_prev_out_grad,
r_reset_output_grad, active_gate); &r_reset_output_grad, active_gate);
gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
......
...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include <type_traits>
// TODO(guosheng): refine code style in gru_kernel // TODO(guosheng): refine code style in gru_kernel
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -28,25 +28,25 @@ namespace forward { ...@@ -28,25 +28,25 @@ namespace forward {
template <typename T> template <typename T>
class gru_resetOutput { class gru_resetOutput {
public: public:
HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate,
T &prev_out, T &value_reset_output, T *prev_out, T *value_reset_output,
ActivationType act_gate) { ActivationType act_gate) {
value_update_gate = activation(value_update_gate, act_gate); *value_update_gate = activation(*value_update_gate, act_gate);
value_reset_gate = activation(value_reset_gate, act_gate); *value_reset_gate = activation(*value_reset_gate, act_gate);
value_reset_output = prev_out * value_reset_gate; *value_reset_output = (*prev_out) * (*value_reset_gate);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &value_update_gate, HOSTDEVICE void operator()(__m256 *value_update_gate,
__m256 &value_reset_gate, __m256 &prev_out, __m256 *value_reset_gate, __m256 *prev_out,
__m256 &value_reset_output, __m256 *value_reset_output,
ActivationType act_gate) { ActivationType act_gate) {
value_update_gate = activation(value_update_gate, act_gate); *value_update_gate = activation(*value_update_gate, act_gate);
value_reset_gate = activation(value_reset_gate, act_gate); *value_reset_gate = activation(*value_reset_gate, act_gate);
value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); *value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate);
} }
#endif #endif
#endif #endif
...@@ -55,25 +55,25 @@ class gru_resetOutput { ...@@ -55,25 +55,25 @@ class gru_resetOutput {
template <typename T> template <typename T>
class gru_finalOutput { class gru_finalOutput {
public: public:
HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
T &prev_out, T &value_output, T *prev_out, T *value_output,
ActivationType act_input) { ActivationType act_input) {
value_frame_state = activation(value_frame_state, act_input); *value_frame_state = activation(*value_frame_state, act_input);
value_output = prev_out - (value_update_gate * prev_out) + *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
(value_update_gate * value_frame_state); ((*value_update_gate) * (*value_frame_state));
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &value_update_gate, HOSTDEVICE void operator()(__m256 *value_update_gate,
__m256 &value_frame_state, __m256 &prev_out, __m256 *value_frame_state, __m256 *prev_out,
__m256 &value_output, ActivationType act_input) { __m256 *value_output, ActivationType act_input) {
value_frame_state = activation(value_frame_state, act_input); *value_frame_state = activation(*value_frame_state, act_input);
value_output = _mm256_add_ps( *value_output = _mm256_add_ps(
_mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)),
_mm256_mul_ps(value_update_gate, value_frame_state)); _mm256_mul_ps(*value_update_gate, *value_frame_state));
} }
#endif #endif
#endif #endif
...@@ -85,37 +85,38 @@ namespace backward { ...@@ -85,37 +85,38 @@ namespace backward {
template <typename T> template <typename T>
class gru_stateGrad { class gru_stateGrad {
public: public:
HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
T &value_frame_state, T &grad_frame_state, T *value_frame_state, T *grad_frame_state,
T &value_prev_out, T &grad_prev_out, T *value_prev_out, T *grad_prev_out,
T &grad_output, ActivationType act_input) { T *grad_output, ActivationType act_input) {
grad_update_gate = (grad_output * value_frame_state); *grad_update_gate = (*grad_output * (*value_frame_state));
grad_update_gate -= (grad_output * value_prev_out); *grad_update_gate -= (*grad_output * (*value_prev_out));
grad_prev_out -= (grad_output * value_update_gate); *grad_prev_out -= (*grad_output * (*value_update_gate));
grad_prev_out += grad_output; *grad_prev_out += *grad_output;
grad_frame_state = activation(grad_output * value_update_gate, *grad_frame_state = activation(*grad_output * (*value_update_gate),
value_frame_state, act_input); *value_frame_state, act_input);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &value_update_gate, HOSTDEVICE void operator()(__m256 *value_update_gate,
__m256 &grad_update_gate, __m256 *grad_update_gate,
__m256 &value_frame_state, __m256 *value_frame_state,
__m256 &grad_frame_state, __m256 &value_prev_out, __m256 *grad_frame_state, __m256 *value_prev_out,
__m256 &grad_prev_out, __m256 &grad_output, __m256 *grad_prev_out, __m256 *grad_output,
ActivationType act_input) { ActivationType act_input) {
grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); *grad_update_gate = _mm256_mul_ps(*grad_output, *value_frame_state);
grad_update_gate = _mm256_sub_ps( *grad_update_gate = _mm256_sub_ps(
grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); *grad_update_gate, _mm256_mul_ps(*grad_output, *value_prev_out));
grad_prev_out = _mm256_add_ps( *grad_prev_out = _mm256_add_ps(
_mm256_sub_ps(grad_prev_out, _mm256_sub_ps(*grad_prev_out,
_mm256_mul_ps(grad_output, value_update_gate)), _mm256_mul_ps(*grad_output, *value_update_gate)),
grad_output); *grad_output);
grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), *grad_frame_state =
value_frame_state, act_input); activation(_mm256_mul_ps(*grad_output, *value_update_gate),
*value_frame_state, act_input);
} }
#endif #endif
#endif #endif
...@@ -124,32 +125,34 @@ class gru_stateGrad { ...@@ -124,32 +125,34 @@ class gru_stateGrad {
template <typename T> template <typename T>
class gru_resetGrad { class gru_resetGrad {
public: public:
HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
T &value_reset_gate, T &grad_reset_gate, T *value_reset_gate, T *grad_reset_gate,
T &value_prev_out, T &grad_prev_out, T *value_prev_out, T *grad_prev_out,
T &grad_reset_output, ActivationType act_gate) { T *grad_reset_output, ActivationType act_gate) {
grad_reset_gate = (grad_reset_output * value_prev_out); *grad_reset_gate = (*grad_reset_output * (*value_prev_out));
grad_prev_out += (grad_reset_output * value_reset_gate); *grad_prev_out += (*grad_reset_output * (*value_reset_gate));
grad_update_gate = *grad_update_gate =
activation(grad_update_gate, value_update_gate, act_gate); activation(*grad_update_gate, *value_update_gate, act_gate);
grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); *grad_reset_gate =
activation(*grad_reset_gate, *value_reset_gate, act_gate);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &value_update_gate, HOSTDEVICE void operator()(__m256 *value_update_gate,
__m256 &grad_update_gate, __m256 &value_reset_gate, __m256 *grad_update_gate, __m256 *value_reset_gate,
__m256 &grad_reset_gate, __m256 &value_prev_out, __m256 *grad_reset_gate, __m256 *value_prev_out,
__m256 &grad_prev_out, __m256 &grad_reset_output, __m256 *grad_prev_out, __m256 *grad_reset_output,
ActivationType act_gate) { ActivationType act_gate) {
grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out);
grad_prev_out = _mm256_add_ps( *grad_prev_out = _mm256_add_ps(
grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); *grad_prev_out, _mm256_mul_ps(*grad_reset_output, *value_reset_gate));
grad_update_gate = *grad_update_gate =
activation(grad_update_gate, value_update_gate, act_gate); activation(*grad_update_gate, *value_update_gate, act_gate);
grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); *grad_reset_gate =
activation(*grad_reset_gate, *value_reset_gate, act_gate);
} }
#endif #endif
#endif #endif
......
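All of the GRU functors above (and the LSTM functors below) switch their in/out parameters from non-const references to pointers, so every mutation reads *x = ... inside the functor and every call site passes &x. A small standalone illustration of the two styles (the functor here is made up; it is not the GRU math):

#include <cstdio>

// Made-up example: the same functor written in the two styles the patch
// converts between.
struct AxpyByRef {
  // Before: output parameter as a non-const reference.
  void operator()(float a, float x, float &y) const { y += a * x; }
};

struct AxpyByPtr {
  // After: output parameter as a pointer; mutation is explicit at call sites.
  void operator()(float a, float x, float *y) const { *y += a * x; }
};

int main() {
  float y1 = 1.0f, y2 = 1.0f;
  AxpyByRef{}(2.0f, 3.0f, y1);   // nothing here signals that y1 is modified
  AxpyByPtr{}(2.0f, 3.0f, &y2);  // the &y2 makes the in/out parameter visible
  std::printf("y1 = %.1f, y2 = %.1f\n", y1, y2);  // both print 7.0
  return 0;
}

Passing pointers makes the output parameters visible at each call site, which matches the Google C++ style rule against non-const reference parameters.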
...@@ -59,9 +59,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -59,9 +59,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
r_prev_state = value.prev_state_value[i]; r_prev_state = value.prev_state_value[i];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_gate, active_state); active_node, active_gate, active_state);
value_in[i] = r_value_in; value_in[i] = r_value_in;
value_ig[i] = r_value_ig; value_ig[i] = r_value_ig;
...@@ -125,11 +125,11 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -125,11 +125,11 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
r_prev_state = value.prev_state_value[i]; r_prev_state = value.prev_state_value[i];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in,
r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_state); active_node, active_gate, active_state);
grad_in[i] = r_grad_in; grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig; grad_ig[i] = r_grad_ig;
...@@ -186,9 +186,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -186,9 +186,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_gate, active_state); active_node, active_gate, active_state);
value_in[i] = r_value_in; value_in[i] = r_value_in;
value_ig[i] = r_value_ig; value_ig[i] = r_value_ig;
...@@ -258,11 +258,11 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -258,11 +258,11 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in,
r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_state); active_node, active_gate, active_state);
grad_in[i] = r_grad_in; grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig; grad_ig[i] = r_grad_ig;
......
...@@ -70,9 +70,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size, ...@@ -70,9 +70,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
r_prev_state = value.prev_state_value[frame_idx]; r_prev_state = value.prev_state_value[frame_idx];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_state); active_node, active_gate, active_state);
value.gate_value[frame_idx] = r_value_in; value.gate_value[frame_idx] = r_value_in;
value.gate_value[frame_idx + frame_size] = r_value_ig; value.gate_value[frame_idx + frame_size] = r_value_ig;
...@@ -145,11 +145,11 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value, ...@@ -145,11 +145,11 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
r_prev_state = value.prev_state_value[frame_idx]; r_prev_state = value.prev_state_value[frame_idx];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
active_state); active_gate, active_state);
grad.gate_grad[frame_idx] = r_grad_in; grad.gate_grad[frame_idx] = r_grad_in;
grad.gate_grad[frame_idx + frame_size] = r_grad_ig; grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
......
...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include <type_traits>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -27,19 +27,19 @@ namespace forward { ...@@ -27,19 +27,19 @@ namespace forward {
template <class T> template <class T>
class lstm { class lstm {
public: public:
HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
T &prev_state, T &state, T &state_atv, T &output, T *prev_state, T *state, T *state_atv, T *output,
T &checkI, T &checkF, T &checkO, T *checkI, T *checkF, T *checkO,
ActivationType active_node, ActivationType active_node,
ActivationType active_gate, ActivationType active_gate,
ActivationType active_state) { ActivationType active_state) {
value_in = activation(value_in, active_node); *value_in = activation(*value_in, active_node);
value_ig = activation(value_ig + prev_state * checkI, active_gate); *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
value_fg = activation(value_fg + prev_state * checkF, active_gate); *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
state = value_in * value_ig + prev_state * value_fg; *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
value_og = activation(value_og + state * checkO, active_gate); *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
state_atv = activation(state, active_state); *state_atv = activation(*state, active_state);
output = value_og * state_atv; *output = (*value_og) * (*state_atv);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
...@@ -48,27 +48,27 @@ class lstm { ...@@ -48,27 +48,27 @@ class lstm {
// Only float support AVX optimization // Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value; static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, HOSTDEVICE void operator()(__m256 *value_in, __m256 *value_ig,
__m256 &value_fg, __m256 &value_og, __m256 *value_fg, __m256 *value_og,
__m256 &prev_state, __m256 &state, __m256 *prev_state, __m256 *state,
__m256 &state_atv, __m256 &output, __m256 &checkI, __m256 *state_atv, __m256 *output, __m256 *checkI,
__m256 &checkF, __m256 &checkO, __m256 *checkF, __m256 *checkO,
ActivationType active_node, ActivationType active_node,
ActivationType active_gate, ActivationType active_gate,
ActivationType active_state) { ActivationType active_state) {
value_in = activation(value_in, active_node); *value_in = activation(*value_in, active_node);
value_ig = *value_ig = activation(
activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), _mm256_add_ps(*value_ig, _mm256_mul_ps(*prev_state, *checkI)),
active_gate); active_gate);
value_fg = *value_fg = activation(
activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), _mm256_add_ps(*value_fg, _mm256_mul_ps(*prev_state, *checkF)),
active_gate); active_gate);
state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
_mm256_mul_ps(prev_state, value_fg)); _mm256_mul_ps(*prev_state, *value_fg));
value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), *value_og = activation(
active_gate); _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
state_atv = activation(state, active_state); *state_atv = activation(*state, active_state);
output = _mm256_mul_ps(value_og, state_atv); *output = _mm256_mul_ps(*value_og, *state_atv);
} }
#endif #endif
#endif #endif
...@@ -81,26 +81,29 @@ namespace backward { ...@@ -81,26 +81,29 @@ namespace backward {
template <class T> template <class T>
class lstm { class lstm {
public: public:
HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, T *grad_in, T *grad_ig, T *grad_fg, T *grad_og,
T &prev_state, T &prev_state_grad, T &state, T *prev_state, T *prev_state_grad, T *state,
T &state_grad, T &state_atv, T &output_grad, T *state_grad, T *state_atv, T *output_grad,
T &checkI, T &checkF, T &checkO, T &checkIGrad, T *checkI, T *checkF, T *checkO, T *checkIGrad,
T &checkFGrad, T &checkOGrad, T *checkFGrad, T *checkOGrad,
ActivationType active_node, ActivationType active_node,
ActivationType active_gate, ActivationType active_gate,
ActivationType active_state) { ActivationType active_state) {
grad_og = activation(output_grad * state_atv, value_og, active_gate); *grad_og =
state_grad += activation(output_grad * value_og, state_atv, active_state) + activation((*output_grad) * (*state_atv), *value_og, active_gate);
grad_og * checkO; *state_grad +=
grad_in = activation(state_grad * value_ig, value_in, active_node); activation((*output_grad) * (*value_og), *state_atv, active_state) +
grad_ig = activation(state_grad * value_in, value_ig, active_gate); (*grad_og) * (*checkO);
grad_fg = activation(state_grad * prev_state, value_fg, active_gate); *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
prev_state_grad = *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
grad_ig * checkI + grad_fg * checkF + state_grad * value_fg; *grad_fg =
checkIGrad = grad_ig * prev_state; activation((*state_grad) * (*prev_state), *value_fg, active_gate);
checkFGrad = grad_fg * prev_state; *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) +
checkOGrad = grad_og * state; (*state_grad) * (*value_fg);
*checkIGrad = (*grad_ig) * (*prev_state);
*checkFGrad = (*grad_fg) * (*prev_state);
*checkOGrad = (*grad_og) * (*state);
} }
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
...@@ -109,32 +112,33 @@ class lstm { ...@@ -109,32 +112,33 @@ class lstm {
// Only float support AVX optimization // Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value; static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()( HOSTDEVICE void operator()(
__m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, __m256 *value_in, __m256 *value_ig, __m256 *value_fg, __m256 *value_og,
__m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, __m256 *grad_in, __m256 *grad_ig, __m256 *grad_fg, __m256 *grad_og,
__m256 &prev_state, __m256 &prev_state_grad, __m256 &state, __m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
__m256 &state_grad, __m256 &state_atv, __m256 &output_grad, __m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
__m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
__m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) { ActivationType active_gate, ActivationType active_state) {
grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
active_gate); active_gate);
state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), *state_grad =
state_atv, active_state), _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
state_grad); *state_atv, active_state),
state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); *state_grad);
grad_in = *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
grad_ig = active_node);
activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, active_gate);
active_gate); *grad_fg = activation(_mm256_mul_ps(*state_grad, *prev_state), *value_fg,
prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), active_gate);
_mm256_mul_ps(grad_fg, checkF)); *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI),
prev_state_grad = _mm256_mul_ps(*grad_fg, *checkF));
_mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); *prev_state_grad =
checkIGrad = _mm256_mul_ps(grad_ig, prev_state); _mm256_add_ps(_mm256_mul_ps(*state_grad, *value_fg), *prev_state_grad);
checkFGrad = _mm256_mul_ps(grad_fg, prev_state); *checkIGrad = _mm256_mul_ps(*grad_ig, *prev_state);
checkOGrad = _mm256_mul_ps(grad_og, state); *checkFGrad = _mm256_mul_ps(*grad_fg, *prev_state);
*checkOGrad = _mm256_mul_ps(*grad_og, *state);
} }
#endif #endif
#endif #endif
......
...@@ -10,9 +10,9 @@ See the License for the specific language governing permissions and ...@@ -10,9 +10,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -25,21 +25,21 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> { ...@@ -25,21 +25,21 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
const detail::ActivationType active_node, const detail::ActivationType active_node,
const detail::ActivationType active_gate) { const detail::ActivationType active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
if (value.prev_out_value) { if (value.prev_out_value) {
math::gemm<platform::CPUDeviceContext, T>( blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight,
value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, frame_size * 2, 1, value.gate_value, frame_size * 3);
1, value.gate_value, frame_size * 3);
} }
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value, detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frame_size, batch_size, active_gate); frame_size, batch_size, active_gate);
if (value.prev_out_value) { if (value.prev_out_value) {
math::gemm<platform::CPUDeviceContext, T>( blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight,
value.reset_output_value, frame_size, value.state_weight, frame_size, frame_size, 1, value.gate_value + frame_size * 2,
1, value.gate_value + frame_size * 2, frame_size * 3); frame_size * 3);
} }
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value, detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
...@@ -58,36 +58,32 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> { ...@@ -58,36 +58,32 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
#ifndef __NVCC__ #ifndef __NVCC__
detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value, detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
grad, frame_size, batch_size, active_node); grad, frame_size, batch_size, active_node);
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
if (value.prev_out_value && grad.prev_out_grad) { if (value.prev_out_value && grad.prev_out_grad) {
math::gemm<platform::CPUDeviceContext, T>( blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3,
grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, value.state_weight, frame_size, 0, grad.reset_output_grad,
frame_size, 0, grad.reset_output_grad, frame_size); frame_size);
if (grad.state_weight_grad) { if (grad.state_weight_grad) {
math::gemm<platform::CPUDeviceContext, T>( blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size,
value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1,
grad.gate_grad + frame_size * 2, frame_size * 3, 1, grad.state_weight_grad, frame_size);
grad.state_weight_grad, frame_size);
} }
} }
detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value, detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
grad, frame_size, batch_size, active_gate); grad, frame_size, batch_size, active_gate);
if (grad.prev_out_grad && value.prev_out_value) { if (grad.prev_out_grad && value.prev_out_value) {
math::gemm<platform::CPUDeviceContext, T>( blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight,
grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, frame_size * 2, 1, grad.prev_out_grad, frame_size);
grad.prev_out_grad, frame_size);
if (grad.gate_weight_grad) { if (grad.gate_weight_grad) {
math::gemm<platform::CPUDeviceContext, T>( blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad,
value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
grad.gate_weight_grad, frame_size * 2);
} }
} }
#endif #endif
......
...@@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,12 +37,11 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
      threads = dim3(32, 32);
      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
    }
+   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
    if (value.prev_out_value) {
-     math::gemm<platform::CUDADeviceContext, T>(
-         context, false, false, batch_size, frame_size * 2, frame_size, 1,
-         value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-         1, value.gate_value, frame_size * 3);
+     blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
+               value.prev_out_value, frame_size, value.gate_weight,
+               frame_size * 2, 1, value.gate_value, frame_size * 3);
    }
    if (batch_size == 1) {
...@@ -61,10 +61,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
    }
    if (value.prev_out_value) {
-     math::gemm<platform::CUDADeviceContext, T>(
-         context, false, false, batch_size, frame_size, frame_size, 1,
-         value.reset_output_value, frame_size, value.state_weight, frame_size,
-         1, value.gate_value + frame_size * 2, frame_size * 3);
+     blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+               value.reset_output_value, frame_size, value.state_weight,
+               frame_size, 1, value.gate_value + frame_size * 2,
+               frame_size * 3);
    }
    if (batch_size == 1) {
...@@ -121,18 +121,19 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
        grad.output_grad, frame_size, batch_size, active_node);
    }
+   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
-     math::gemm<platform::CUDADeviceContext, T>(
-         context, false, true, batch_size, frame_size, frame_size, 1,
-         grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-         frame_size, 0, grad.reset_output_grad, frame_size);
+     blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+               grad.gate_grad + frame_size * 2, frame_size * 3,
+               value.state_weight, frame_size, 0, grad.reset_output_grad,
+               frame_size);
      if (grad.state_weight_grad) {
-       math::gemm<platform::CUDADeviceContext, T>(
-           context, true, false, frame_size, frame_size, batch_size, 1,
-           value.reset_output_value, frame_size,
-           grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-           grad.state_weight_grad, frame_size);
+       blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                 value.reset_output_value, frame_size,
+                 grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+                 grad.state_weight_grad, frame_size);
      }
    }
...@@ -153,16 +154,14 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
    }
    if (grad.prev_out_grad && value.prev_out_value) {
-     math::gemm<platform::CUDADeviceContext, T>(
-         context, false, true, batch_size, frame_size, frame_size * 2, 1,
-         grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
-         grad.prev_out_grad, frame_size);
+     blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+               grad.gate_grad, frame_size * 3, value.gate_weight,
+               frame_size * 2, 1, grad.prev_out_grad, frame_size);
      if (grad.gate_weight_grad) {
-       math::gemm<platform::CUDADeviceContext, T>(
-           context, true, false, frame_size, frame_size * 2, batch_size, 1,
-           value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
-           grad.gate_weight_grad, frame_size * 2);
+       blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                 value.prev_out_value, frame_size, grad.gate_grad,
+                 frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
      }
    }
  }
......
...@@ -24,266 +24,6 @@ namespace math { ...@@ -24,266 +24,6 @@ namespace math {
using float16 = paddle::platform::float16; using float16 = paddle::platform::float16;
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <>
void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
void gemm<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <>
void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K, const float alpha,
const float* A, const int lda, const float* B, const int ldb,
const float beta, float* C, const int ldc) {
cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
template <>
void gemm<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const double alpha, const double* A, const int lda, const double* B,
const int ldb, const double beta, double* C, const int ldc) {
cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
template <>
void matmul<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
PADDLE_THROW("float16 matmul not supported on CPU");
}
template <>
void matmul<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
platform::is_cpu_place(matrix_b.place()) &&
platform::is_cpu_place(matrix_out->place()),
"Matrix must all be in CPUPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUDeviceContext, float>(
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
platform::is_cpu_place(matrix_b.place()) &&
platform::is_cpu_place(matrix_out->place()),
"Matrix must all be in CPUPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUDeviceContext, double>(
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
PADDLE_THROW("float16 batched_gemm not supported on CPU");
}
#ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template <>
void batched_gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
auto a_array = std::vector<const float*>(batchCount);
auto b_array = std::vector<const float*>(batchCount);
auto c_array = std::vector<float*>(batchCount);
for (int k = 0; k < batchCount; ++k) {
a_array[k] = &A[k * strideA];
b_array[k] = &B[k * strideB];
c_array[k] = &C[k * M * N];
}
cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
a_array.data(), &lda, b_array.data(), &ldb, &beta,
c_array.data(), &ldc, 1 /* group_count */, &batchCount);
}
template <>
void batched_gemm<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
auto a_array = std::vector<const double*>(batchCount);
auto b_array = std::vector<const double*>(batchCount);
auto c_array = std::vector<double*>(batchCount);
for (int k = 0; k < batchCount; ++k) {
a_array[k] = &A[k * strideA];
b_array[k] = &B[k * strideB];
c_array[k] = &C[k * M * N];
}
cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
a_array.data(), &lda, b_array.data(), &ldb, &beta,
c_array.data(), &ldc, 1 /* group_count */, &batchCount);
}
#else
// The below is a naive but correct serial implementation that just loops
// over the batch dimension. This is a fallback for when the batched gemm
// functions of Intel MKL are not available. In the future, this computation
// should be parallelized.
template <>
void batched_gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
for (int k = 0; k < batchCount; ++k) {
const float* Ak = &A[k * strideA];
const float* Bk = &B[k * strideB];
float* Ck = &C[k * M * N];
gemm<platform::CPUDeviceContext, float>(context, transA, transB, M, N, K,
alpha, Ak, Bk, beta, Ck);
}
}
template <>
void batched_gemm<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
for (int k = 0; k < batchCount; ++k) {
const double* Ak = &A[k * strideA];
const double* Bk = &B[k * strideB];
double* Ck = &C[k * M * N];
gemm<platform::CPUDeviceContext, double>(context, transA, transB, M, N, K,
alpha, Ak, Bk, beta, Ck);
}
}
#endif
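Both the MKL path and the serial fallback above interpret strideA/strideB as the element offset between consecutive matrices in the batch, while the output stride is fixed at M * N. A small usage sketch (hypothetical caller code, not taken from this diff) for contiguous row-major batched inputs X [batch, M, K], Y [batch, K, N], Z [batch, M, N]:

// Hypothetical caller; context, X, Y, Z are assumed to exist elsewhere.
const int batch = 8, M = 16, K = 32, N = 64;
const int64_t strideA = static_cast<int64_t>(M) * K;  // offset between A matrices
const int64_t strideB = static_cast<int64_t>(K) * N;  // offset between B matrices
// The C stride is implicitly M * N in the implementations above.
batched_gemm<platform::CPUDeviceContext, float>(
    context, CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, X, Y, 0.0f, Z, batch,
    strideA, strideB);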
template <>
void gemv<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const bool trans_a, const int M,
const int N, const float alpha, const float* A, const float* B,
const float beta, float* C) {
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}
template <>
void gemv<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context, const bool trans_a, const int M,
const int N, const double alpha, const double* A, const double* B,
const double beta, double* C) {
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}
template <>
void axpy<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const int n, const float alpha,
const float* x, float* y) {
cblas_saxpy(n, alpha, x, 1, y, 1);
}
template <>
void axpy<platform::CPUDeviceContext, double>(
const platform::CPUDeviceContext& context, const int n, const double alpha,
const double* x, double* y) {
cblas_daxpy(n, alpha, x, 1, y, 1);
}
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
......
...@@ -15,6 +15,7 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include <vector>
#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
...@@ -25,374 +26,6 @@ namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas fp16 gemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
if (context.GetComputeCapability() >= 70) {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_TENSOR_OP_MATH));
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} else {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_DEFAULT_MATH));
}
#endif // CUDA_VERSION >= 9000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo));
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
#endif // CUDA_VERSION >= 8000
}
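The reason every cuBLAS call in this file passes (cuTransB, cuTransA, N, M, K, ..., B, ..., A, ..., C, N) is the row-major/column-major mismatch noted in the comments: a row-major matrix reinterpreted in column-major order is its transpose, so

$$C = \mathrm{op}(A)\,\mathrm{op}(B) \iff C^{\top} = \mathrm{op}(B)^{\top}\,\mathrm{op}(A)^{\top},$$

and cuBLAS is asked for the $N \times M$ column-major matrix $C^{\top}$, which occupies exactly the same memory as the desired $M \times N$ row-major $C$.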
template <>
void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasSgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, N));
}
template <>
void gemm<platform::CUDADeviceContext, double>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasDgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, N));
}
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, ldc));
}
template <>
void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K, const float alpha,
const float* A, const int lda, const float* B, const int ldb,
const float beta, float* C, const int ldc) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasSgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, ldc));
}
template <>
void gemm<platform::CUDADeviceContext, double>(
const platform::CUDADeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const double alpha, const double* A, const int lda, const double* B,
const int ldb, const double beta, double* C, const int ldc) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasDgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, ldc));
}
template <>
void matmul<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CUDADeviceContext, float16>(
context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
matrix_b.data<float16>(), beta, matrix_out->data<float16>());
}
template <>
void matmul<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CUDADeviceContext, float>(
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::CUDADeviceContext, double>(
const platform::CUDADeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CUDADeviceContext, double>(
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
template <>
void batched_gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
void batched_gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
void batched_gemm<platform::CUDADeviceContext, double>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C, const int batchCount, const int64_t strideA,
const int64_t strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
void gemv<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const bool trans_a, const int M,
const int N, const float alpha, const float* A, const float* B,
const float beta, float* C) {
cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(),
cuTransA, N, M, &alpha, A, N, B,
1, &beta, C, 1));
}
template <>
void gemv<platform::CUDADeviceContext, double>(
const platform::CUDADeviceContext& context, const bool trans_a, const int M,
const int N, const double alpha, const double* A, const double* B,
const double beta, double* C) {
cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(),
cuTransA, N, M, &alpha, A, N, B,
1, &beta, C, 1));
}
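The same trick explains why trans_a == false maps to CUBLAS_OP_T in the two GEMV wrappers above: cuBLAS sees the row-major $M \times N$ buffer as $\tilde A = A^{\top} \in \mathbb{R}^{N \times M}$, so cublasSgemv(handle, CUBLAS_OP_T, N, M, ...) computes $y = \tilde A^{\top} x = A x$, while the transposed case ($y = A^{\top} x$) uses CUBLAS_OP_N.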
template <>
void axpy<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const int n, const float alpha,
const float* x, float* y) {
PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n,
&alpha, x, 1, y, 1));
}
template <>
void axpy<platform::CUDADeviceContext, double>(
const platform::CUDADeviceContext& context, const int n, const double alpha,
const double* x, double* y) {
PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n,
&alpha, x, 1, y, 1));
}
template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
template struct SetConstant<platform::CUDADeviceContext, float>;
template struct SetConstant<platform::CUDADeviceContext, double>;
...@@ -484,10 +117,9 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
  SetConstant<platform::CUDADeviceContext, double> set;
  set(context, &one, static_cast<double>(1.0));
- gemv<platform::CUDADeviceContext, double>(
-     context, true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]),
-     1.0, input.data<double>(), one.data<double>(), 0.0,
-     vector->data<double>());
+ GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
+     true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]), 1.0,
+     input.data<double>(), one.data<double>(), 0.0, vector->data<double>());
}

template struct RowwiseSum<platform::CUDADeviceContext, float>;
...@@ -506,10 +138,9 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
  one.mutable_data<double>({size}, context.GetPlace());
  SetConstant<platform::CUDADeviceContext, double> set;
  set(context, &one, static_cast<double>(1.0));
- gemv<platform::CUDADeviceContext, double>(
-     context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
-     1.0, one.data<double>(), input.data<double>(), 0.0,
-     vector->data<double>());
+ GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
+     true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]), 1.0,
+     one.data<double>(), input.data<double>(), 0.0, vector->data<double>());
}

template struct RowwiseMean<platform::CUDADeviceContext, float>;
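Both reductions above are expressed as a GEMV against a vector of ones: with $X \in \mathbb{R}^{m \times n}$ and $\mathbf{1}_m$ the all-ones vector,

$$\mathrm{ColwiseSum}(X) = X^{\top}\mathbf{1}_m \in \mathbb{R}^{n},$$

which is why ColwiseSum passes the input as the matrix with trans = true and `one` as the vector (the flag is relative to the row-major view, as discussed earlier); the RowwiseSum specialization issues an analogous call with the roles of `one` and `input` exchanged.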
......
...@@ -42,6 +42,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
#include <vector>
#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
...@@ -50,47 +51,6 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
namespace paddle {
namespace operators {
namespace math {
// Support continuous memory now
// If transA = N, and transB = N
// Then matrixA: M * K, matrixB: K * N, matrixC : M * N
// For more detailed info, please refer to
// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
template <typename DeviceContext, typename T>
void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const T alpha, const T* A, const T* B, const T beta, T* C);
// gemm wrapper with stride args for matrix uncontinuous in memory
template <typename DeviceContext, typename T>
void gemm(const DeviceContext& context, const bool transA, const bool transB,
const int M, const int N, const int K, const T alpha, const T* A,
const int lda, const T* B, const int ldb, const T beta, T* C,
const int ldc);
// matrix multiply with continuous memory
template <typename DeviceContext, typename T>
void matmul(const DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b,
T alpha, framework::Tensor* matrix_out, T beta);
// Batched gemm
template <typename DeviceContext, typename T>
void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N,
const int K, const T alpha, const T* A, const T* B,
const T beta, T* C, const int batchCount,
const int64_t strideA, const int64_t strideB);
template <typename DeviceContext, typename T>
void gemv(const DeviceContext& context, const bool trans_a, const int M,
const int N, const T alpha, const T* A, const T* B, const T beta,
T* C);
template <typename DeviceContext, typename T>
void axpy(const DeviceContext& context, const int n, const T alpha, const T* x,
T* y);
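The free-function declarations removed above are replaced by the Blas object used at the call sites elsewhere in this diff. The real header (paddle/fluid/operators/math/blas.h) is not shown here; the following declaration-only sketch is reconstructed purely from those call sites and may differ from the actual interface:

// Sketch reconstructed from call sites in this diff; not the real blas.h.
namespace paddle {
namespace operators {
namespace math {

template <typename DeviceContext, typename T>
class BlasT {
 public:
  explicit BlasT(const DeviceContext& ctx) : ctx_(ctx) {}

  // CBLAS_TRANSPOSE flavour, leading dimensions derived from M/N/K.
  void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
            int K, T alpha, const T* A, const T* B, T beta, T* C) const;
  // bool-transpose flavour with explicit leading dimensions.
  void GEMM(bool transA, bool transB, int M, int N, int K, T alpha,
            const T* A, int lda, const T* B, int ldb, T beta, T* C,
            int ldc) const;
  void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B,
            T beta, T* C) const;
  void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M,
                   int N, int K, T alpha, const T* A, const T* B, T beta,
                   T* C, int batchCount, int64_t strideA,
                   int64_t strideB) const;
  void MatMul(const framework::Tensor& a, bool trans_a,
              const framework::Tensor& b, bool trans_b, T alpha,
              framework::Tensor* out, T beta) const;

 private:
  const DeviceContext& ctx_;
};

template <typename DeviceContext, typename T>
BlasT<DeviceContext, T> GetBlas(const DeviceContext& ctx) {
  return BlasT<DeviceContext, T>(ctx);
}

}  // namespace math
}  // namespace operators
}  // namespace paddle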
template <typename DeviceContext, typename T, int Rank> template <typename DeviceContext, typename T, int Rank>
struct Transpose { struct Transpose {
void operator()(const DeviceContext& context, const framework::Tensor& in, void operator()(const DeviceContext& context, const framework::Tensor& in,
......
...@@ -13,6 +13,14 @@ ...@@ -13,6 +13,14 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/operators/math/blas.h"
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas(const paddle::platform::CPUDeviceContext& context) {
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(context);
}
TEST(math_function, gemm_notrans_cblas) {
  paddle::framework::Tensor input1;
...@@ -34,9 +42,8 @@ TEST(math_function, gemm_notrans_cblas) {
  memcpy(input3_ptr, arr3, 8 * sizeof(float));

  paddle::platform::CPUDeviceContext context(*cpu_place);
- paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
-     context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
-     input3_ptr + 1, 4);
+ GetBlas<float>(context).GEMM(false, false, m, n, k, 1, input1_ptr, 3,
+                              input2_ptr + 1, 4, 1, input3_ptr + 1, 4);

  EXPECT_EQ(input3_ptr[0], 0);
  EXPECT_EQ(input3_ptr[1], 24);
...@@ -68,9 +75,8 @@ TEST(math_function, gemm_trans_clbas) {
  memcpy(input3_ptr, arr3, 8 * sizeof(float));

  paddle::platform::CPUDeviceContext context(*cpu_place);
- paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
-     context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
-     input3_ptr + 1, 4);
+ GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
+                              input2_ptr + 3, 3, 1, input3_ptr + 1, 4);

  EXPECT_EQ(input3_ptr[0], 0);
  EXPECT_EQ(input3_ptr[1], 24);
...@@ -124,9 +130,8 @@ void GemvTest(int m, int n, bool trans) {
  }

  paddle::platform::CPUDeviceContext context(*cpu_place);
- paddle::operators::math::gemv<paddle::platform::CPUDeviceContext, T>(
-     context, trans, static_cast<int>(m), static_cast<int>(n), 1., data_a,
-     data_b, 0., data_c);
+ GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
+                          data_a, data_b, 0., data_c);

  if (!trans) {
    for (int i = 0; i < m; ++i) {
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) { const std::vector<float>& data) {
...@@ -22,6 +24,13 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, ...@@ -22,6 +24,13 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
} }
} }
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CUDADeviceContext, T>
GetBlas(const paddle::platform::CUDADeviceContext& context) {
return paddle::operators::math::GetBlas<paddle::platform::CUDADeviceContext,
T>(context);
}
TEST(math_function, notrans_mul_trans_fp32) {
  paddle::framework::Tensor input1;
  paddle::framework::Tensor input1_gpu;
...@@ -41,9 +50,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);

  out_gpu.mutable_data<float>({2, 2}, gpu_place);
- paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
-     context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
+ GetBlas<float>(context).MatMul(input1_gpu, false, input2_gpu, true, 1,
+                                &out_gpu, 0);

  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
...@@ -80,10 +88,9 @@ TEST(math_function, notrans_mul_trans_fp16) {
  out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);

- paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
-                                 paddle::platform::float16>(
-     context, input1_gpu, false, input2_gpu, true,
-     paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
+ GetBlas<paddle::platform::float16>(context).MatMul(
+     input1_gpu, false, input2_gpu, true, paddle::platform::float16(1),
+     &out_gpu, paddle::platform::float16(0));

  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
...@@ -115,8 +122,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
  out_gpu.mutable_data<float>({3, 3}, gpu_place);

- paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
-     context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
+ GetBlas<float>(context).MatMul(input1_gpu, true, input2_gpu, false, 1,
+                                &out_gpu, 0);

  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
...@@ -158,10 +165,9 @@ TEST(math_function, trans_mul_notrans_fp16) {
  out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);

- paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
-                                 paddle::platform::float16>(
-     context, input1_gpu, true, input2_gpu, false,
-     paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
+ GetBlas<paddle::platform::float16>(context).MatMul(
+     input1_gpu, true, input2_gpu, false, paddle::platform::float16(1),
+     &out_gpu, paddle::platform::float16(0));

  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
...@@ -210,8 +216,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(gpu_place);

- paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
-     context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+ GetBlas<float>(context).GEMM(false, false, m, n, k, 1, a, 3, b + 1, 4, 1,
+                              c + 1, 4);

  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
...@@ -271,10 +277,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
  paddle::platform::float16* c =
      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);

- paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
-                               paddle::platform::float16>(
-     context, false, false, m, n, k, paddle::platform::float16(1), a, 3, b + 1,
-     4, paddle::platform::float16(1), c + 1, 4);
+ GetBlas<paddle::platform::float16>(context).GEMM(
+     false, false, m, n, k, static_cast<paddle::platform::float16>(1), a, 3,
+     b + 1, 4, static_cast<paddle::platform::float16>(1), c + 1, 4);

  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
...@@ -327,8 +332,8 @@ TEST(math_function, gemm_trans_cublas_fp32) {
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(gpu_place);

- paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
-     context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+ GetBlas<float>(context).GEMM(false, true, m, n, k, 1, a, 3, b + 3, 3, 1,
+                              c + 1, 4);

  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
...@@ -382,10 +387,9 @@ TEST(math_function, gemm_trans_cublas_fp16) {
  paddle::platform::float16* c =
      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);

- paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
-                               paddle::platform::float16>(
-     context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3,
-     3, paddle::platform::float16(1), c + 1, 4);
+ GetBlas<paddle::platform::float16>(context).GEMM(
+     false, true, m, n, k, static_cast<paddle::platform::float16>(1), a, 3,
+     b + 3, 3, static_cast<paddle::platform::float16>(1), c + 1, 4);

  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
...@@ -431,9 +435,8 @@ void GemvTest(int m, int n, bool trans) {
  paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
  paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);

- paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
-     context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
-     g_data_b, 0., g_data_c);
+ GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
+                          g_data_a, g_data_b, 0., g_data_c);

  paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
......
...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
...@@ -129,15 +129,17 @@ class MatMulFunctor {
    CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
    CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+   auto blas = GetBlas<DeviceContext, T>(context);
    if (!batchCount) {
      // regular matrix multiplication
-     gemm<DeviceContext, T>(context, transA, transB, M, N, kA, alpha,
-                            a.data<T>(), b.data<T>(), beta, out->data<T>());
+     blas.GEMM(transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(), beta,
+               out->data<T>());
    } else {
      // batched matrix multiplication
-     batched_gemm<DeviceContext, T>(
-         context, transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(),
-         beta, out->data<T>(), batchCount, strideA, strideB);
+     blas.BatchedGEMM(transA, transB, M, N, kA, alpha, a.data<T>(),
+                      b.data<T>(), beta, out->data<T>(), batchCount, strideA,
+                      strideB);
    }
  }
};
......
...@@ -41,7 +41,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  if (paddle::platform::is_cpu_place(*place)) {
    seq = cpu_seq;
  } else {
-   TensorCopy(cpu_seq, *place, *context, &seq);
+   TensorCopySync(cpu_seq, *place, &seq);
    seq.set_lod(lod);
  }
...@@ -64,7 +64,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  if (paddle::platform::is_cpu_place(*place)) {
    cpu_seq_back = seq_back;
  } else {
-   TensorCopy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
+   TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back);
    cpu_seq_back.set_lod(lod);
  }
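The switch from TensorCopy to TensorCopySync here (and in the multiplex kernels below) presumably trades the stream-ordered, asynchronous copy for one that blocks until the transfer has finished, which is what a test or host code that immediately reads the destination needs; the device-context argument is dropped because no stream is involved. A sketch of the usage difference (semantics inferred from the names, not from the implementation):

// Asynchronous: enqueued on the context's stream; the host must synchronize
// before it may safely read `dst`.
TensorCopy(src, dst_place, dev_ctx, &dst);

// Synchronous: returns only after the copy is complete, so `dst` is
// immediately usable on the host.
TensorCopySync(src, dst_place, &dst);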
......
...@@ -17,6 +17,8 @@ limitations under the License. */ ...@@ -17,6 +17,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class MomentumOp : public framework::OperatorWithKernel { class MomentumOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -50,6 +52,12 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -50,6 +52,12 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("VelocityOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -204,6 +204,8 @@ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
REGISTER_OP_CPU_KERNEL(
-   mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+   mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
+   ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
-   mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+   mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
+   ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -18,6 +18,8 @@ limitations under the License. */
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                       ops::MulKernel<plat::CUDADeviceContext, double>,
                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(mul_grad,
-                       ops::MulGradKernel<plat::CUDADeviceContext, float>);
+                       ops::MulGradKernel<plat::CUDADeviceContext, float>,
+                       ops::MulGradKernel<plat::CUDADeviceContext, double>);
...@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
-#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
...@@ -46,9 +46,10 @@ class MulKernel : public framework::OpKernel<T> {
    if (z_dim.size() != 2) {
      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
    }
-   math::matmul<DeviceContext, T>(
-       context.template device_context<DeviceContext>(), x_matrix, false,
-       y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
+   auto blas = math::GetBlas<DeviceContext, T>(context);
+   blas.MatMul(x_matrix, y_matrix, z);
    if (z_dim.size() != 2) {
      z->Resize(z_dim);
    }
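The two-operand blas.MatMul(x_matrix, y_matrix, z) call presumably stands in for the old explicit form with no transposes, alpha = 1 and beta = 0, i.e. plain Z = X*Y; the gradient kernel below keeps the longer overload because it needs the transpose flags. Assumed equivalence (a sketch, not the actual overload resolution):

// blas.MatMul(x_matrix, y_matrix, z);
// ~ blas.MatMul(x_matrix, /*trans_a=*/false, y_matrix, /*trans_b=*/false,
//               static_cast<T>(1), z, static_cast<T>(0));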
...@@ -79,6 +80,7 @@ class MulGradKernel : public framework::OpKernel<T> {
    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+   auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      Tensor dx_matrix = dx->dims().size() > 2
...@@ -86,8 +88,7 @@ class MulGradKernel : public framework::OpKernel<T> {
                             : *dx;
      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-     math::matmul<DeviceContext, T>(dev_ctx, dout_mat, false, y_matrix, true,
-                                    1, &dx_matrix, 0);
+     blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
...@@ -95,8 +96,7 @@ class MulGradKernel : public framework::OpKernel<T> {
                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
                             : *dy;
      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-     math::matmul<DeviceContext, T>(dev_ctx, x_matrix, true, dout_mat, false,
-                                    1, &dy_matrix, 0);
+     blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
    }
  }
};
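The comments in MulGradKernel summarize the standard matrix-product gradient: for $Z = XY$ with upstream gradient $\partial L/\partial Z$,

$$\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Z}\,Y^{\top}, \qquad \frac{\partial L}{\partial Y} = X^{\top}\,\frac{\partial L}{\partial Z},$$

which is exactly what the blas.MatMul(dout_mat, false, y_matrix, true, ...) and blas.MatMul(x_matrix, true, dout_mat, false, ...) calls compute.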
......
...@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
    auto cols = ins[0]->numel() / rows;
    // copy index to cpu
    Tensor index_t_cpu;
-   TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+   TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
    auto* index = index_t_cpu.data<int32_t>();
    auto stream = ctx.cuda_device_context().stream();
    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
...@@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
    auto cols = ins[0]->numel() / rows;
    // copy index to cpu
    Tensor index_t_cpu;
-   TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+   TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
    auto* index = index_t_cpu.data<int32_t>();
    auto stream = ctx.cuda_device_context().stream();
......
...@@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNOpKernel<float>,
-                  ops::PoolCUDNNOpKernel<double>);
+                  ops::PoolCUDNNOpKernel<double>,
+                  ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNGradOpKernel<float>,
                   ops::PoolCUDNNGradOpKernel<double>);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class ProximalAdagradOp : public framework::OperatorWithKernel { class ProximalAdagradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -55,6 +56,12 @@ class ProximalAdagradOp : public framework::OperatorWithKernel { ...@@ -55,6 +56,12 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class ProximalGDOp : public framework::OperatorWithKernel { class ProximalGDOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -43,6 +44,12 @@ class ProximalGDOp : public framework::OperatorWithKernel { ...@@ -43,6 +44,12 @@ class ProximalGDOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/operators/row_conv_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_device_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -189,6 +189,10 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, ...@@ -189,6 +189,10 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
} }
__syncthreads(); __syncthreads();
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int i = 0; i < num_sequence; i++) { for (int i = 0; i < num_sequence; i++) {
int start = static_cast<int>(batch_indices[i]); int start = static_cast<int>(batch_indices[i]);
int end = static_cast<int>(batch_indices[i + 1]); int end = static_cast<int>(batch_indices[i + 1]);
...@@ -220,7 +224,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, ...@@ -220,7 +224,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
for (int offset = 16; offset > 0; for (int offset = 16; offset > 0;
offset = offset / 2) { // blockDim.x is 32. offset = offset / 2) { // blockDim.x is 32.
val += platform::__shfl_down_sync(0, val, offset); val += platform::CudaShuffleDownSync(mask, val, offset);
} }
__syncthreads(); __syncthreads();
...@@ -251,6 +255,10 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, ...@@ -251,6 +255,10 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
T *sh_in = mem; T *sh_in = mem;
T *sh_dout = &mem[block_x * block_y]; T *sh_dout = &mem[block_x * block_y];
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int i = 0; i < num_sequence; i++) { for (int i = 0; i < num_sequence; i++) {
int start = static_cast<int>(batch_indices[i]); int start = static_cast<int>(batch_indices[i]);
int end = static_cast<int>(batch_indices[i + 1]); int end = static_cast<int>(batch_indices[i + 1]);
...@@ -276,7 +284,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, ...@@ -276,7 +284,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
for (int offset = 16; offset > 0; for (int offset = 16; offset > 0;
offset = offset / 2) { // blockDim.x is 32. offset = offset / 2) { // blockDim.x is 32.
val += platform::__shfl_down_sync(0, val, offset); val += platform::CudaShuffleDownSync(mask, val, offset);
} }
__syncthreads(); __syncthreads();
......
...@@ -106,6 +106,8 @@ class SaveOp : public framework::OperatorBase { ...@@ -106,6 +106,8 @@ class SaveOp : public framework::OperatorBase {
auto out_kernel_type = framework::OpKernelType(out_dtype, place); auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out; framework::LoDTensor out;
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::SerializeToStream(fout, out, dev_ctx); framework::SerializeToStream(fout, out, dev_ctx);
} else { } else {
framework::SerializeToStream(fout, tensor, dev_ctx); framework::SerializeToStream(fout, tensor, dev_ctx);
......
...@@ -35,7 +35,6 @@ class ScaleOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,6 @@ class ScaleOp : public framework::OperatorWithKernel {
} }
}; };
template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
...@@ -47,9 +46,9 @@ Scale operator ...@@ -47,9 +46,9 @@ Scale operator
$$Out = scale*X$$ $$Out = scale*X$$
)DOC"); )DOC");
AddAttr<AttrType>("scale", AddAttr<float>("scale",
"(float, default 1.0)" "(float, default 1.0)"
"The scaling factor of the scale operator.") "The scaling factor of the scale operator.")
.SetDefault(1.0); .SetDefault(1.0);
} }
}; };
...@@ -73,8 +72,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { ...@@ -73,8 +72,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker);
ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>, scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>, ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
......
...@@ -116,6 +116,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, ...@@ -116,6 +116,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) { void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
f::Scope scope; f::Scope scope;
p::CPUPlace place; p::CPUPlace place;
VLOG(4) << "before init tensor";
if (is_sparse) { if (is_sparse) {
InitSelectedRowsInScope(place, &scope); InitSelectedRowsInScope(place, &scope);
} else { } else {
...@@ -137,6 +138,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) { ...@@ -137,6 +138,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"PrefetchBlock", prefetch_block});
attrs.insert({"grad_to_block_id", std::vector<std::string>({""})}); attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
attrs.insert({"sync_mode", true}); attrs.insert({"sync_mode", true});
VLOG(4) << "before init op";
listen_and_serv_op = listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
*initialized = true; *initialized = true;
...@@ -149,7 +151,9 @@ TEST(SendRecvOp, CPUDense) { ...@@ -149,7 +151,9 @@ TEST(SendRecvOp, CPUDense) {
std::thread server_thread(StartServerNet, false, &initialized); std::thread server_thread(StartServerNet, false, &initialized);
while (!initialized) { while (!initialized) {
} }
sleep(5); // wait server to start static_cast<paddle::operators::ListenAndServOp *>(listen_and_serv_op.get())
->WaitServerReady();
// local net // local net
f::Scope scope; f::Scope scope;
p::CPUPlace place; p::CPUPlace place;
...@@ -185,6 +189,7 @@ TEST(SendRecvOp, CPUDense) { ...@@ -185,6 +189,7 @@ TEST(SendRecvOp, CPUDense) {
listen_and_serv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
listen_and_serv_op.reset(nullptr); listen_and_serv_op.reset(nullptr);
paddle::operators::ListenAndServOp::ResetPort();
} }
TEST(SendRecvOp, CPUSparse) { TEST(SendRecvOp, CPUSparse) {
...@@ -193,7 +198,12 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -193,7 +198,12 @@ TEST(SendRecvOp, CPUSparse) {
std::thread server_thread(StartServerNet, true, &initialized); std::thread server_thread(StartServerNet, true, &initialized);
while (!initialized) { while (!initialized) {
} }
sleep(5); // wait server to start auto *listen_and_serv_op_ptr =
static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get());
ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
listen_and_serv_op_ptr->WaitServerReady();
// local net // local net
f::Scope scope; f::Scope scope;
p::CPUPlace place; p::CPUPlace place;
...@@ -201,10 +211,6 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -201,10 +211,6 @@ TEST(SendRecvOp, CPUSparse) {
InitSelectedRowsInScope(place, &scope); InitSelectedRowsInScope(place, &scope);
scope.Var("RPC_CLIENT_VAR"); scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs; f::AttributeMap attrs;
auto *listen_and_serv_op_ptr =
static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get());
ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
selected_port = listen_and_serv_op_ptr->GetSelectedPort(); selected_port = listen_and_serv_op_ptr->GetSelectedPort();
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})}); attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
...@@ -236,4 +242,5 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -236,4 +242,5 @@ TEST(SendRecvOp, CPUSparse) {
listen_and_serv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
listen_and_serv_op.reset(); listen_and_serv_op.reset();
paddle::operators::ListenAndServOp::ResetPort();
} }
...@@ -58,17 +58,15 @@ class SequenceConvKernel : public framework::OpKernel<T> { ...@@ -58,17 +58,15 @@ class SequenceConvKernel : public framework::OpKernel<T> {
// Because if padding_trainable is false, padding data should be zeros. // Because if padding_trainable is false, padding data should be zeros.
math::SetConstant<DeviceContext, T> set_zero; math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
set_zero(dev_ctx, &col, static_cast<T>(0)); set_zero(dev_ctx, &col, static_cast<T>(0));
math::ContextProjectFunctor<DeviceContext, T> seq_project_functor; math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
context_start, context_length, context_stride, up_pad, context_start, context_length, context_stride, up_pad,
down_pad, &col); down_pad, &col);
math::matmul<DeviceContext, T>(dev_ctx, col, false, filter, false, blas.MatMul(col, filter, out);
static_cast<T>(1.0), out,
static_cast<T>(0.0));
} }
}; };
...@@ -99,6 +97,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> { ...@@ -99,6 +97,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero; math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// use col_shape in the im2col calculation // use col_shape in the im2col calculation
framework::DDim col_shape = {in->dims()[0], framework::DDim col_shape = {in->dims()[0],
sequence_width * context_length}; sequence_width * context_length};
...@@ -108,8 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> { ...@@ -108,8 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
col.mutable_data<T>(col_shape, context.GetPlace()); col.mutable_data<T>(col_shape, context.GetPlace());
// Because if padding_trainable is false, padding data should be zeros. // Because if padding_trainable is false, padding data should be zeros.
set_zero(dev_ctx, &col, static_cast<T>(0)); set_zero(dev_ctx, &col, static_cast<T>(0));
math::matmul<DeviceContext, T>(dev_ctx, *out_g, false, *filter, true, blas.MatMul(*out_g, false, *filter, true, &col);
T(1.0), &col, T(1.0));
} }
math::ContextProjectFunctor<DeviceContext, T> seq_project_functor; math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
math::ContextProjectGradFunctor<DeviceContext, T> seq_project_grad_functor; math::ContextProjectGradFunctor<DeviceContext, T> seq_project_grad_functor;
...@@ -150,8 +148,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> { ...@@ -150,8 +148,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
context_start, context_length, context_stride, up_pad, context_start, context_length, context_stride, up_pad,
down_pad, &col); down_pad, &col);
math::matmul<DeviceContext, T>(dev_ctx, col, true, out_grad, false, blas.MatMul(col, true, out_grad, false, &filter_grad);
T(1.0), &filter_grad, T(1.0));
} }
} }
}; };
......
...@@ -66,13 +66,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -66,13 +66,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace()); offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(), framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
&offset_cpu);
offset_data = offset_cpu.data<int64_t>(); offset_data = offset_cpu.data<int64_t>();
length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace()); length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(), framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
&length_cpu);
length_data = length_cpu.data<int64_t>(); length_data = length_cpu.data<int64_t>();
} }
...@@ -127,13 +125,11 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> { ...@@ -127,13 +125,11 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace()); offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(), framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
&offset_cpu);
offset_data = offset_cpu.data<int64_t>(); offset_data = offset_cpu.data<int64_t>();
length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace()); length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(), framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
&length_cpu);
length_data = length_cpu.data<int64_t>(); length_data = length_cpu.data<int64_t>();
} }
......
...@@ -164,7 +164,9 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ...@@ -164,7 +164,9 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>); softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
softmax_grad, softmax_grad,
ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>); ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -19,6 +19,8 @@ namespace ops = paddle::operators; ...@@ -19,6 +19,8 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>, softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxKernel<plat::CUDADeviceContext, double>,
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>); ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(softmax_grad, REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>); softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
...@@ -75,4 +75,5 @@ namespace ops = paddle::operators; ...@@ -75,4 +75,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(top_k, REGISTER_OP_CPU_KERNEL(top_k,
ops::TopkKernel<paddle::platform::CPUPlace, float>); ops::TopkKernel<paddle::platform::CPUPlace, float>,
ops::TopkKernel<paddle::platform::CPUPlace, double>);
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -235,8 +236,14 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid, ...@@ -235,8 +236,14 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
sh_topk[tid] = topk[*beam]; sh_topk[tid] = topk[*beam];
} }
} }
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (maxid[0] / 32 == warp) { if (maxid[0] / 32 == warp) {
if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
MaxLength)
break;
} }
} }
} }
...@@ -318,4 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -318,4 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>); REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
namespace paddle {
namespace platform {
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta, int width = 32) {
#if CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, delta, width);
#endif
}
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
#if CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
#endif
}
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
  // NOTE(zcd): The warp size should be taken from the device
  // properties rather than hard-coded as 32. To make reduceSum
  // more efficient, warp-level parallelism is used here under the
  // assumption that the warp size is 32; this may differ between
  // GPUs, but most cards use a warp size of 32.
const int warpSize = 32;
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::CudaShuffleDownSync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
__syncthreads();
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::CudaShuffleDownSync(mask, val, offset);
}
return val;
}
} // namespace platform
} // namespace paddle
...@@ -65,19 +65,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { ...@@ -65,19 +65,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
return __longlong_as_double(old); return __longlong_as_double(old);
} }
#endif #endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "RowConvOp.h" #include "paddle/cuda/include/hl_base.h"
#include "hl_base.h" #include "paddle/function/RowConvOp.h"
namespace paddle { namespace paddle {
...@@ -94,7 +94,7 @@ __global__ void KeRowConv2(real* y, ...@@ -94,7 +94,7 @@ __global__ void KeRowConv2(real* y,
} }
template <> template <>
void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out, void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out, // NOLINT
const GpuMatrix& in, const GpuMatrix& in,
const GpuMatrix& filter, const GpuMatrix& filter,
const GpuIVector& seq) { const GpuIVector& seq) {
...@@ -144,6 +144,10 @@ __global__ void KeRowConvBwWeight(real* dw, ...@@ -144,6 +144,10 @@ __global__ void KeRowConvBwWeight(real* dw,
} }
__syncthreads(); __syncthreads();
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int i = 0; i < numSeq; ++i) { for (int i = 0; i < numSeq; ++i) {
const int start = starts[i]; const int start = starts[i];
const int end = starts[i + 1]; const int end = starts[i + 1];
...@@ -170,11 +174,10 @@ __global__ void KeRowConvBwWeight(real* dw, ...@@ -170,11 +174,10 @@ __global__ void KeRowConvBwWeight(real* dw,
real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t]; real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
__syncthreads(); __syncthreads();
// warp size and blockDim.x is 32. // warp size and blockDim.x is 32.
val += __shfl_down(val, 16);
val += __shfl_down(val, 8); for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, 4); val += __shfl_down_sync(mask, val, offset);
val += __shfl_down(val, 2);
val += __shfl_down(val, 1);
__syncthreads(); __syncthreads();
if (tidx == 0) { if (tidx == 0) {
sh_dw[t][tidy] += val; sh_dw[t][tidy] += val;
...@@ -205,6 +208,10 @@ __global__ void KeRowConvBwWeight2(real* dw, ...@@ -205,6 +208,10 @@ __global__ void KeRowConvBwWeight2(real* dw,
__shared__ real sh_x[BLOCK_H][BLOCK_W]; __shared__ real sh_x[BLOCK_H][BLOCK_W];
__shared__ real sh_dy[BLOCK_H][BLOCK_W]; __shared__ real sh_dy[BLOCK_H][BLOCK_W];
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int i = 0; i < numSeq; ++i) { for (int i = 0; i < numSeq; ++i) {
const int start = starts[i]; const int start = starts[i];
const int end = starts[i + 1]; const int end = starts[i + 1];
...@@ -230,11 +237,9 @@ __global__ void KeRowConvBwWeight2(real* dw, ...@@ -230,11 +237,9 @@ __global__ void KeRowConvBwWeight2(real* dw,
real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
__syncthreads(); __syncthreads();
// warp size and blockDim.x is 32. // warp size and blockDim.x is 32.
val += __shfl_down(val, 16); for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, 8); val += __shfl_down_sync(mask, val, offset);
val += __shfl_down(val, 4);
val += __shfl_down(val, 2);
val += __shfl_down(val, 1);
__syncthreads(); __syncthreads();
if (tidx == 0 && (gidx + tidy) < width) { if (tidx == 0 && (gidx + tidy) < width) {
...@@ -323,8 +328,8 @@ template <> ...@@ -323,8 +328,8 @@ template <>
void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG, void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
const GpuMatrix& in, const GpuMatrix& in,
const GpuMatrix& filter, const GpuMatrix& filter,
GpuMatrix& inG, GpuMatrix& inG, // NOLINT
GpuMatrix& filterG, GpuMatrix& filterG, // NOLINT
const GpuIVector& seq) { const GpuIVector& seq) {
const size_t numSeq = seq.getSize() - 1; const size_t numSeq = seq.getSize() - 1;
const size_t contextLength = filter.getHeight(); const size_t contextLength = filter.getHeight();
......
...@@ -2157,26 +2157,20 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2157,26 +2157,20 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
int wend = wstart + sizeX; int wend = wstart + sizeX;
wstart = wstart < 0 ? 0 : wstart; wstart = wstart < 0 ? 0 : wstart;
wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) {
real tmp = -(real)FLT_MAX; real maxval = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) { int max_index = -1;
for (int w = wstart; w < wend; ++w) { for (int h = hstart; h < hend; ++h) {
tmp = tmp < inputData[h * imgSizeW + w] for (int w = wstart; w < wend; ++w) {
? inputData[h * imgSizeW + w] if (maxval < inputData[h * imgSizeW + w]) {
: tmp; maxval = inputData[h * imgSizeW + w];
} max_index = h * imgSizeW + w;
}
outData[ph * outputW + pw] = tmp;
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
maskData[ph * outputW + pw] = h * imgSizeW + w;
}
} }
} }
} }
outData[ph * outputW + pw] = maxval;
if (maskData != NULL) maskData[ph * outputW + pw] = max_index;
} }
} }
// compute offset // compute offset
......
...@@ -40,6 +40,7 @@ function print_usage() { ...@@ -40,6 +40,7 @@ function print_usage() {
${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}capi${NONE}: generate paddle CAPI package
${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
${BLUE}check_style${NONE}: run code style check ${BLUE}check_style${NONE}: run code style check
${BLUE}cicheck${NONE}: run CI tasks
" "
} }
...@@ -453,6 +454,8 @@ function gen_capi_package() { ...@@ -453,6 +454,8 @@ function gen_capi_package() {
} }
function gen_fluid_inference_lib() { function gen_fluid_inference_lib() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF cat <<EOF
======================================== ========================================
...@@ -503,6 +506,13 @@ function main() { ...@@ -503,6 +506,13 @@ function main() {
check_style) check_style)
check_style check_style
;; ;;
cicheck)
cmake_gen ${PYTHON_ABI:-""}
build
run_test
gen_capi_package
gen_fluid_inference_lib
;;
*) *)
print_usage print_usage
exit 0 exit 0
......
...@@ -56,11 +56,15 @@ EOL ...@@ -56,11 +56,15 @@ EOL
if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then
DOCKER_CMD="docker" DOCKER_CMD="docker"
fi fi
if [ ! -d "${HOME}/.ccache" ]; then
mkdir ${HOME}/.ccache
fi
set -x set -x
${DOCKER_CMD} run -it \ ${DOCKER_CMD} run -it \
--name $CONTAINER_ID \ --name $CONTAINER_ID \
${DOCKER_ENV} \ ${DOCKER_ENV} \
-v $PADDLE_ROOT:/paddle \ -v $PADDLE_ROOT:/paddle \
-v ${HOME}/.ccache:/root/.ccache \
-w /paddle \ -w /paddle \
$IMG \ $IMG \
paddle/scripts/paddle_build.sh $@ paddle/scripts/paddle_build.sh $@
......
...@@ -22,14 +22,14 @@ from executor import * ...@@ -22,14 +22,14 @@ from executor import *
import trainer import trainer
from trainer import Trainer from trainer import Trainer
from trainer import Event from trainer import BeginEpochEvent
from trainer import EndEpochEvent
from trainer import BeginStepEvent
from trainer import EndStepEvent
import inferencer import inferencer
from inferencer import Inferencer from inferencer import Inferencer
import params
from params import Params
import io import io
import evaluator import evaluator
import initializer import initializer
...@@ -40,16 +40,14 @@ import backward ...@@ -40,16 +40,14 @@ import backward
import regularizer import regularizer
import average import average
import metrics import metrics
import transpiler
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
from distribute_transpiler import DistributeTranspiler from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory
from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select) channel_close, Select)
from inference_transpiler import InferenceTranspiler
import clip import clip
from memory_optimization_transpiler import memory_optimize, release_memory
import profiler import profiler
import unique_name import unique_name
import recordio_writer import recordio_writer
...@@ -58,7 +56,7 @@ from parallel_executor import ParallelExecutor ...@@ -58,7 +56,7 @@ from parallel_executor import ParallelExecutor
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
trainer.__all__ + inferencer.__all__ + params.__all__ + [ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
...@@ -76,11 +74,6 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ ...@@ -76,11 +74,6 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
'WeightNormParamAttr', 'WeightNormParamAttr',
'DataFeeder', 'DataFeeder',
'clip', 'clip',
'SimpleDistributeTranspiler',
'DistributeTranspiler',
'InferenceTranspiler',
'memory_optimize',
'release_memory',
'profiler', 'profiler',
'unique_name', 'unique_name',
'recordio_writer', 'recordio_writer',
......
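The transpiler utilities removed from this file are now re-exported from the new fluid.transpiler package, so user code that reaches them through the top-level fluid namespace keeps working. A minimal sketch, assuming the re-exports listed above (the program being optimized is whatever the caller built):

import paddle.fluid as fluid

# DistributeTranspiler, InferenceTranspiler, memory_optimize and
# release_memory now live in fluid.transpiler but stay reachable here.
t = fluid.DistributeTranspiler()
fluid.memory_optimize(fluid.default_main_program())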
...@@ -848,17 +848,6 @@ class Block(object): ...@@ -848,17 +848,6 @@ class Block(object):
self.desc.remove_op(index, index + 1) self.desc.remove_op(index, index + 1)
del self.ops[index] del self.ops[index]
def delete_ops(self, ops):
# remove from cpp
# FIXME(typhoonzero): remove only the first occurrence.
try:
start = list(self.ops).index(ops[0])
end = list(self.ops).index(ops[-1])
except Exception, e:
raise e
self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end): def slice_ops(self, start, end):
return self.ops[start:end] return self.ops[start:end]
......
...@@ -12,19 +12,34 @@ ...@@ -12,19 +12,34 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import core
import framework
import executor
import io
__all__ = ['Inferencer', ] __all__ = ['Inferencer', ]
class Inferencer(object): class Inferencer(object):
def __init__(self, network_func, params, place=None): def __init__(self, network_func, param_path=None, place=None):
# 1. we need to generate a framework.Program by calling # 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in test_word2vec.py # network_func. Reference: fluid.program_guard in test_word2vec.py
# 2. move the default_main_program to self.program. # 2. move the default_main_program to self.program.
# 3. run the default_startup program. # 3. run the default_startup program.
self.params = params
# 4. load params from param_path into scope
self.scope = core.Scope()
self.place = place self.place = place
self.startup_program = framework.Program()
# TODO: generate the startup_program with network_func
exe = executor.Executor(place)
exe.run(self.startup_program, scope=self.scope)
if param_path:
# load params from param_path into scope
io.load_persistables(exe, dirname=param_path)
def infer(self, inputs): def infer(self, inputs):
# run self.program # run self.program
......
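With this change the Inferencer is constructed from a directory of saved persistables (param_path) rather than a Params object. A minimal usage sketch, assuming an inference_network function and an input tensor prepared by the caller (the names below are placeholders):

import paddle.fluid as fluid

place = fluid.CPUPlace()
inferencer = fluid.Inferencer(
    inference_network, param_path="recognize_digits.inference.model",
    place=place)
# 'img' and tensor_img stand in for whatever feed name and LoDTensor
# the caller's network actually defines.
results = inferencer.infer({'img': tensor_img})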
...@@ -400,11 +400,11 @@ class LayerHelper(object): ...@@ -400,11 +400,11 @@ class LayerHelper(object):
if isinstance(act, basestring): if isinstance(act, basestring):
act = {'type': act} act = {'type': act}
if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
act['use_cudnn'] = self.kwargs.get('use_cudnn')
if 'use_mkldnn' in self.kwargs: if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type') act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
tmp = input_var tmp = input_var
        # NOTE(dzhwinter): some activations support in-place computation. # NOTE(dzhwinter): some activations support in-place computation.
if not core.IsInplace(act_type): if not core.IsInplace(act_type):
......
...@@ -50,8 +50,6 @@ def data(name, ...@@ -50,8 +50,6 @@ def data(name,
dtype(int|float): The type of data : float32, float_16, int etc dtype(int|float): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR. type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence. lod_level(int): The LoD Level. 0 means the input data is not a sequence.
main_program(Program): Name of the main program that calls this
startup_program(Program): Name of the startup program
stop_gradient(bool): A boolean that mentions whether gradient should flow. stop_gradient(bool): A boolean that mentions whether gradient should flow.
Returns: Returns:
...@@ -74,13 +72,15 @@ def data(name, ...@@ -74,13 +72,15 @@ def data(name,
if append_batch_size: if append_batch_size:
shape = [-1] + shape # append batch size as -1 shape = [-1] + shape # append batch size as -1
return helper.create_global_variable( data_var = helper.create_global_variable(
name=name, name=name,
shape=shape, shape=shape,
dtype=dtype, dtype=dtype,
type=type, type=type,
stop_gradient=stop_gradient, stop_gradient=stop_gradient,
lod_level=lod_level) lod_level=lod_level)
data_var.is_data = True
return data_var
class BlockGuardServ(BlockGuard): class BlockGuardServ(BlockGuard):
......
...@@ -169,7 +169,9 @@ def monkey_patch_variable(): ...@@ -169,7 +169,9 @@ def monkey_patch_variable():
# a*b == b*a. Do not need to reverse explicitly # a*b == b*a. Do not need to reverse explicitly
("__rmul__", "elementwise_mul", False), ("__rmul__", "elementwise_mul", False),
("__div__", "elementwise_div", False), ("__div__", "elementwise_div", False),
("__truediv__", "elementwise_div", False),
("__rdiv__", "elementwise_div", True), ("__rdiv__", "elementwise_div", True),
("__rtruediv__", "elementwise_div", True),
("__pow__", "elementwise_pow", False), ("__pow__", "elementwise_pow", False),
("__rpow__", "elementwise_pow", True), ("__rpow__", "elementwise_pow", True),
# for logical compare # for logical compare
......
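With __truediv__ and __rtruediv__ registered, true division on a Variable (Python 3, or Python 2 with the division future import) now lowers to elementwise_div just like __div__ does. A small sketch, assuming the standard layers API:

from __future__ import division

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
half = x / 2.0  # __truediv__  -> elementwise_div
inv = 2.0 / x   # __rtruediv__ -> elementwise_div, operands reversed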
...@@ -88,6 +88,7 @@ def fc(input, ...@@ -88,6 +88,7 @@ def fc(input,
num_flatten_dims=1, num_flatten_dims=1,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=False,
use_mkldnn=False, use_mkldnn=False,
act=None, act=None,
is_test=False, is_test=False,
...@@ -1496,6 +1497,7 @@ def batch_norm(input, ...@@ -1496,6 +1497,7 @@ def batch_norm(input,
bias_attr=None, bias_attr=None,
data_layout='NCHW', data_layout='NCHW',
in_place=False, in_place=False,
use_mkldnn=False,
name=None, name=None,
moving_mean_name=None, moving_mean_name=None,
moving_variance_name=None, moving_variance_name=None,
...@@ -1574,9 +1576,12 @@ def batch_norm(input, ...@@ -1574,9 +1576,12 @@ def batch_norm(input,
"SavedMean": saved_mean, "SavedMean": saved_mean,
"SavedVariance": saved_variance "SavedVariance": saved_variance
}, },
attrs={"momentum": momentum, attrs={
"epsilon": epsilon, "momentum": momentum,
"is_test": is_test}) "epsilon": epsilon,
"is_test": is_test,
"use_mkldnn": use_mkldnn
})
return helper.append_activation(batch_norm_out) return helper.append_activation(batch_norm_out)
......
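The new use_cudnn flag on fc and use_mkldnn flag on batch_norm are plain keyword arguments on the Python side (the layer_helper change above forwards use_cudnn to the activation's attributes); whether they have any effect depends on how Paddle was built. A short, hypothetical example:

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
fc1 = fluid.layers.fc(input=img, size=128, act='relu', use_cudnn=True)
bn = fluid.layers.batch_norm(input=fc1, use_mkldnn=True)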
...@@ -28,7 +28,8 @@ from contextlib import contextmanager ...@@ -28,7 +28,8 @@ from contextlib import contextmanager
__all__ = [ __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage' 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage',
'Optimizer'
] ]
......
...@@ -45,7 +45,7 @@ class ParallelExecutor(object): ...@@ -45,7 +45,7 @@ class ParallelExecutor(object):
training. training.
allow_op_delay(bool, default False): Whether to delay and buffer allow_op_delay(bool, default False): Whether to delay and buffer
some operators together for scheduling or not, which may some operators together for scheduling or not, which may
improve performance in some cases, defalut False. improve performance in some cases, default False.
share_vars_from(ParallelExecutor, default None): If provided, share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor. it will share variables from the specified ParallelExecutor.
use_default_grad_scale(bool, default True): If set True, a default use_default_grad_scale(bool, default True): If set True, a default
...@@ -95,7 +95,7 @@ class ParallelExecutor(object): ...@@ -95,7 +95,7 @@ class ParallelExecutor(object):
if use_cuda: if use_cuda:
# Experiments on se-resnext shows that too many threads hurt # Experiments on se-resnext shows that too many threads hurt
# performance. Worth tuning for other models in the future. # performance. Worth tuning for other models in the future.
num_threads = len(self._places) num_threads = len(self._places) * 2
else: else:
num_threads = min( num_threads = min(
len(self._places) * 2, multiprocessing.cpu_count()) len(self._places) * 2, multiprocessing.cpu_count())
......
...@@ -64,15 +64,14 @@ def resnet_cifar10(input, depth=32): ...@@ -64,15 +64,14 @@ def resnet_cifar10(input, depth=32):
res3 = layer_warp(basicblock, res2, 32, 64, n, 2) res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
pool = fluid.layers.pool2d( pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1) input=res3, pool_size=8, pool_type='avg', pool_stride=1)
return pool predict = fluid.layers.fc(input=pool, size=10, act='softmax')
return predict
def inference_network(): def inference_network():
classdim = 10
data_shape = [3, 32, 32] data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net = resnet_cifar10(images, 32) predict = resnet_cifar10(images, 32)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
return predict return predict
......
...@@ -43,15 +43,14 @@ def vgg16_bn_drop(input): ...@@ -43,15 +43,14 @@ def vgg16_bn_drop(input):
bn = fluid.layers.batch_norm(input=fc1, act='relu') bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2 predict = fluid.layers.fc(input=fc2, size=10, act='softmax')
return predict
def inference_network(): def inference_network():
classdim = 10
data_shape = [3, 32, 32] data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net = vgg16_bn_drop(images) predict = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
return predict return predict
......
...@@ -80,12 +80,7 @@ def train(use_cuda, save_dirname, is_local): ...@@ -80,12 +80,7 @@ def train(use_cuda, save_dirname, is_local):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
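The same simplification is repeated in the remaining book tests below: optimize_ops and params_grads are no longer passed to transpile, which now only needs the trainer id plus the cluster description. A sketch with hypothetical endpoint and trainer values:

import os

import paddle.fluid as fluid

pserver_endpoints = "127.0.0.1:6174"  # hypothetical pserver endpoint
trainers = 2                          # hypothetical trainer count
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", "0"))

t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)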
...@@ -189,12 +189,7 @@ def train(net_type, use_cuda, save_dirname, is_local): ...@@ -189,12 +189,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
...@@ -252,26 +247,6 @@ def infer(use_cuda, save_dirname=None): ...@@ -252,26 +247,6 @@ def infer(use_cuda, save_dirname=None):
fetch_targets, exe, fetch_targets, exe,
inference_transpiler_program) inference_transpiler_program)
if use_cuda and fluid.core.is_float16_supported(place):
# Use float16_transpiler to speedup
fp16_transpiler_program = inference_transpiler_program.clone()
t.float16_transpile(fp16_transpiler_program, place)
fp16_results = exe.run(fp16_transpiler_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
assert len(results[0]) == len(fp16_results[0])
for i in range(len(results[0])):
np.testing.assert_almost_equal(
results[0][i], fp16_results[0][i], decimal=2)
print("float16 infer results: ", fp16_results[0])
fluid.io.save_inference_model("float16_" + save_dirname,
feed_target_names, fetch_targets, exe,
fp16_transpiler_program)
def main(net_type, use_cuda, is_local=True): def main(net_type, use_cuda, is_local=True):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
......
...@@ -259,12 +259,7 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -259,12 +259,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -231,12 +231,7 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -231,12 +231,7 @@ def train_main(use_cuda, is_sparse, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -162,12 +162,7 @@ def train(nn_type, ...@@ -162,12 +162,7 @@ def train(nn_type,
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -261,12 +261,7 @@ def train(use_cuda, save_dirname, is_local=True): ...@@ -261,12 +261,7 @@ def train(use_cuda, save_dirname, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -213,12 +213,7 @@ def train(word_dict, ...@@ -213,12 +213,7 @@ def train(word_dict,
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -145,12 +145,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): ...@@ -145,12 +145,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
STACKED_NUM = 3
def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
assert stacked_num % 2 == 1
emb = fluid.layers.embedding(
input=data, size=[input_dim, emb_dim], is_sparse=True)
fc1 = fluid.layers.fc(input=emb, size=hid_dim)
lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = fluid.layers.fc(input=inputs, size=hid_dim)
lstm, cell = fluid.layers.dynamic_lstm(
input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
inputs = [fc, lstm]
fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
prediction = fluid.layers.fc(input=[fc_last, lstm_last],
size=class_dim,
act='softmax')
return prediction
def inference_network(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
STACKED_NUM)
return net
def train_network(word_dict):
prediction = inference_network(word_dict)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 5
word_dict = paddle.dataset.imdb.word_dict()
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=BATCH_SIZE)
test_data = paddle.batch(
paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost, accuracy = trainer.test(reader=test_data)
            print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
event.batch_id + 1, avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
trainer.params.save(save_path)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
partial(train_network, word_dict),
optimizer=fluid.optimizer.Adagrad(learning_rate=0.002),
place=place,
event_handler=event_handler)
trainer.train(train_data, EPOCH_NUM, event_handler=event_handler)
def infer(use_cuda, save_path):
params = fluid.Params(save_path)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
word_dict = paddle.dataset.imdb.word_dict()
inferencer = fluid.Inferencer(
partial(inference_network, word_dict), params, place=place)
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high,
[lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
lod = [0, 4, 10]
tensor_words = create_random_lodtensor(
lod, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words})
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "understand_sentiment_stacked_lstm.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import numpy as np
import math
import sys
from functools import partial
PASS_NUM = 100
EMBED_SIZE = 32
HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 32
def create_random_lodtensor(lod, place, low, high):
# The range of data elements is [low, high]
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
def inference_program(is_sparse):
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
embed_first = fluid.layers.embedding(
input=first_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
input=second_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
input=third_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_forth = fluid.layers.embedding(
input=forth_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = fluid.layers.fc(input=concat_embed,
size=HIDDEN_SIZE,
act='sigmoid')
predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
return predict_word
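# Note (added for clarity): because all four embedding layers above pass
# param_attr='shared_w', they share a single [dict_size, EMBED_SIZE] lookup
# table, so the N-gram model learns one word-embedding matrix rather than
# four separate ones.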
def train_program(is_sparse):
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
predict_word = inference_program(is_sparse)
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def train(use_cuda, is_sparse, save_path):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
def event_handler(event):
print type(event)
if isinstance(event, fluid.EndEpochEvent):
avg_cost = trainer.test(reader=paddle.dataset.imikolov.test(
word_dict, N))
if avg_cost < 5.0:
trainer.save_params(save_path)
return
if math.isnan(avg_cost):
sys.exit("got NaN loss, training failed.")
trainer = fluid.Trainer(
partial(train_program, is_sparse),
fluid.optimizer.SGD(learning_rate=0.001),
place=place)
trainer.train(
reader=train_reader, num_epochs=100, event_handler=event_handler)
def infer(use_cuda, is_sparse, save_path):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
partial(inference_program, is_sparse),
param_path=save_path,
place=place)
lod = [0, 1]
first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
result = inferencer.infer({
'firstw': first_word,
'secondw': second_word,
'thirdw': third_word,
'forthw': fourth_word
})
print(result)
def main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "word2vec.inference.model"
train(use_cuda, is_sparse, save_path)
infer(use_cuda, is_sparse, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
for is_sparse in (False, True):
main(use_cuda=use_cuda, is_sparse=is_sparse)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest
from paddle.fluid.framework import grad_var_name
from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_formats = ["NCHW"]
def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
epsilon, momentum, shape, data_layout):
# run forward
y, saved_mean, saved_variance = _reference_training(
x, scale, bias, epsilon, data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = saved_variance * (1. - momentum) + momentum * variance
# run backward
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout)
return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
def init_kernel_type(self):
self.use_mkldnn = True
def test_check_output(self):
place = core.CPUPlace()
data_format = "NCHW"
self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
if __name__ == '__main__':
unittest.main()
...@@ -158,6 +158,8 @@ def set_output_grad(scope, outputs, place, feed_dict=None): ...@@ -158,6 +158,8 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
class TestBatchNormOpInference(unittest.TestCase): class TestBatchNormOpInference(unittest.TestCase):
def setUp(self): def setUp(self):
self.dtype = np.float32 self.dtype = np.float32
self.use_mkldnn = False
self.init_kernel_type()
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
...@@ -230,6 +232,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -230,6 +232,7 @@ class TestBatchNormOpInference(unittest.TestCase):
# attrs # attrs
is_test=True, is_test=True,
data_layout=data_layout, data_layout=data_layout,
use_mkldnn=self.use_mkldnn,
epsilon=epsilon) epsilon=epsilon)
batch_norm_op.run(scope, place) batch_norm_op.run(scope, place)
...@@ -254,10 +257,15 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -254,10 +257,15 @@ class TestBatchNormOpInference(unittest.TestCase):
[2, 3, 4, 5]) [2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3]) self.check_with_place(place, data_format, self.dtype, [2, 3])
def init_kernel_type(self):
pass
class TestFP16BatchNormOpInference(TestBatchNormOpInference): class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def setUp(self): def setUp(self):
self.dtype = np.float16 self.dtype = np.float16
self.use_mkldnn = False
self.init_kernel_type()
def test_check_output(self): def test_check_output(self):
places = [] places = []
...@@ -274,9 +282,28 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -274,9 +282,28 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
class TestBatchNormOpTraining(unittest.TestCase): class TestBatchNormOpTraining(unittest.TestCase):
def setUp(self):
self.use_mkldnn = False
self.data_formats = ["NCHW", "NHWC"]
self.init_kernel_type()
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
np.allclose(np.array(tensor), np_array, atol=atol) np.allclose(np.array(tensor), np_array, atol=atol)
def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
epsilon, momentum, shape, data_layout):
# run forward
y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon,
data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = var_ref * (1. - momentum) + momentum * variance
saved_variance = 1. / np.sqrt(var_ref + epsilon)
# run backward
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
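# A minimal numpy sketch (added for illustration; the helper name below is
# hypothetical and not part of this test file) of the textbook per-channel
# batch-norm forward pass that _reference_training is assumed to implement
# for the NCHW layout:
import numpy as np

def _naive_batch_norm_nchw(x, scale, bias, epsilon):
    mean = x.mean(axis=(0, 2, 3))                # per-channel mean
    var = x.var(axis=(0, 2, 3))                  # per-channel (biased) variance
    x_hat = (x - mean.reshape(1, -1, 1, 1)) / np.sqrt(
        var.reshape(1, -1, 1, 1) + epsilon)      # normalize
    y = scale.reshape(1, -1, 1, 1) * x_hat + bias.reshape(1, -1, 1, 1)
    return y, mean, var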
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, data_layout, shape): def test_with_place(place, data_layout, shape):
# attr # attr
...@@ -295,16 +322,11 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -295,16 +322,11 @@ class TestBatchNormOpTraining(unittest.TestCase):
mean = np.zeros(scale_shape).astype(np.float32) mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32) variance = np.ones(scale_shape).astype(np.float32)
# run forward
y, saved_mean, var_ref = _reference_training(x, scale, bias,
epsilon, data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = var_ref * (1. - momentum) + momentum * variance
saved_variance = 1. / np.sqrt(var_ref + epsilon)
# run backward
y_grad = np.random.random_sample(shape).astype(np.float32) y_grad = np.random.random_sample(shape).astype(np.float32)
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
x, y_grad, scale, bias, mean, variance, epsilon, momentum,
shape, data_layout)
var_dict = locals() var_dict = locals()
var_dict['y@GRAD'] = y_grad var_dict['y@GRAD'] = y_grad
...@@ -344,7 +366,8 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -344,7 +366,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": False, "is_test": False,
"data_layout": data_layout "data_layout": data_layout,
"use_mkldnn": self.use_mkldnn
}) })
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
...@@ -387,13 +410,17 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -387,13 +410,17 @@ class TestBatchNormOpTraining(unittest.TestCase):
print "op test forward passed: ", str(place), data_layout print "op test forward passed: ", str(place), data_layout
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: for data_format in self.data_formats:
test_with_place(place, data_format, [2, 3, 4, 5]) test_with_place(place, data_format, [2, 3, 4, 5])
def init_kernel_type(self):
pass
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -22,12 +22,12 @@ from paddle.fluid.op import Operator ...@@ -22,12 +22,12 @@ from paddle.fluid.op import Operator
class TestBeamSearchDecodeOp(unittest.TestCase): class TestBeamSearchDecodeOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.scope = core.Scope() self.scope = core.Scope()
self.cpu_place = core.CPUPlace() self.place = core.CPUPlace()
def append_lod_tensor(self, tensor_array, lod, data): def append_lod_tensor(self, tensor_array, lod, data):
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
lod_tensor.set_lod(lod) lod_tensor.set_lod(lod)
lod_tensor.set(data, self.cpu_place) lod_tensor.set(data, self.place)
tensor_array.append(lod_tensor) tensor_array.append(lod_tensor)
def test_get_set(self): def test_get_set(self):
...@@ -71,7 +71,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase): ...@@ -71,7 +71,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
SentenceIds="sentence_ids", SentenceIds="sentence_ids",
SentenceScores="sentence_scores") SentenceScores="sentence_scores")
beam_search_decode_op.run(self.scope, self.cpu_place) beam_search_decode_op.run(self.scope, self.place)
expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
self.assertEqual(sentence_ids.lod(), expected_lod) self.assertEqual(sentence_ids.lod(), expected_lod)
...@@ -84,5 +84,11 @@ class TestBeamSearchDecodeOp(unittest.TestCase): ...@@ -84,5 +84,11 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
np.array_equal(np.array(sentence_scores), expected_data)) np.array_equal(np.array(sentence_scores), expected_data))
class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
def setUp(self):
self.scope = core.Scope()
self.place = core.CUDAPlace(0)
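# Note (added for clarity): only setUp is overridden here, so every test
# method inherited from TestBeamSearchDecodeOp is re-run on a CUDA place
# instead of the CPU place.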
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -70,9 +70,11 @@ def conv3d_forward_naive(input, filter, group, conv_param): ...@@ -70,9 +70,11 @@ def conv3d_forward_naive(input, filter, group, conv_param):
class TestConv3dOp(OpTest): class TestConv3dOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "conv3d"
self.use_cudnn = False self.use_cudnn = False
self.dtype = np.float32
self.init_kernel_type()
self.init_group() self.init_group()
self.init_op_type()
self.init_dilation() self.init_dilation()
self.init_test_case() self.init_test_case()
...@@ -80,20 +82,24 @@ class TestConv3dOp(OpTest): ...@@ -80,20 +82,24 @@ class TestConv3dOp(OpTest):
'stride': self.stride, 'stride': self.stride,
'pad': self.pad, 'pad': self.pad,
'dilations': self.dilations, 'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
} }
input = np.random.random(self.input_size).astype("float32")
filter = np.random.random(self.filter_size).astype("float32") input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype)
output = conv3d_forward_naive(input, filter, self.groups, output = conv3d_forward_naive(input, filter, self.groups,
conv3d_param).astype("float32") conv3d_param).astype(self.dtype)
self.inputs = {'Input': input, 'Filter': filter} self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
self.attrs = { self.attrs = {
'strides': self.stride, 'strides': self.stride,
'paddings': self.pad, 'paddings': self.pad,
'groups': self.groups, 'groups': self.groups,
'dilations': self.dilations 'dilations': self.dilations,
'use_cudnn': self.use_cudnn
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
...@@ -108,6 +114,8 @@ class TestConv3dOp(OpTest): ...@@ -108,6 +114,8 @@ class TestConv3dOp(OpTest):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
if self.testcudnn(): if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -120,6 +128,8 @@ class TestConv3dOp(OpTest): ...@@ -120,6 +128,8 @@ class TestConv3dOp(OpTest):
set(['Input', 'Filter']), 'Output', max_relative_error=0.03) set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16:
return
if self.testcudnn(): if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -135,6 +145,8 @@ class TestConv3dOp(OpTest): ...@@ -135,6 +145,8 @@ class TestConv3dOp(OpTest):
no_grad_set=set(['Filter'])) no_grad_set=set(['Filter']))
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16:
return
if self.testcudnn(): if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -163,8 +175,8 @@ class TestConv3dOp(OpTest): ...@@ -163,8 +175,8 @@ class TestConv3dOp(OpTest):
def init_group(self): def init_group(self):
self.groups = 1 self.groups = 1
def init_op_type(self): def init_kernel_type(self):
self.op_type = "conv3d" pass
class TestCase1(TestConv3dOp): class TestCase1(TestConv3dOp):
...@@ -235,34 +247,90 @@ class TestWithDilation(TestConv3dOp): ...@@ -235,34 +247,90 @@ class TestWithDilation(TestConv3dOp):
self.groups = 3 self.groups = 3
#----------------Conv3dCUDNN----------------
class TestCUDNN(TestConv3dOp): class TestCUDNN(TestConv3dOp):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv3d"
class TestFP16CUDNN(TestConv3dOp):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestWithGroup1CUDNN(TestWithGroup1): class TestWithGroup1CUDNN(TestWithGroup1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv3d"
class TestFP16WithGroup1CUDNN(TestWithGroup1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestWithGroup2CUDNN(TestWithGroup2): class TestWithGroup2CUDNN(TestWithGroup2):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv3d"
class TestFP16WithGroup2CUDNN(TestWithGroup2):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestWith1x1CUDNN(TestWith1x1): class TestWith1x1CUDNN(TestWith1x1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv3d"
class TestFP16With1x1CUDNN(TestWith1x1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv3d"
class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
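# Note (added for clarity): each TestFP16*CUDNN class above only overrides
# init_kernel_type to switch on cuDNN and float16, and overrides
# test_check_output to run solely on a CUDA place when float16 is supported,
# with a loosened tolerance (atol=2e-2). The inherited gradient checks return
# early for float16 (see the test_check_grad* methods above), so only the
# forward pass is verified in half precision.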
# FIXME(typhoonzero): find a way to determine if # FIXME(typhoonzero): find a way to determine if
......
...@@ -34,7 +34,7 @@ class TestSendOp(unittest.TestCase): ...@@ -34,7 +34,7 @@ class TestSendOp(unittest.TestCase):
p.start() p.start()
time.sleep(10) time.sleep(10)
with open("/tmp/paddle.selected_port", "r") as fn: with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn:
selected_port = int(fn.readlines()[0]) selected_port = int(fn.readlines()[0])
self.init_client(place, selected_port) self.init_client(place, selected_port)
......
...@@ -280,7 +280,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -280,7 +280,7 @@ class TestMNIST(TestParallelExecutorBase):
fluid.recordio_writer.convert_reader_to_recordio_file( fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist.recordio', reader, feeder) './mnist.recordio', reader, feeder)
def test_simple_fc(self): def check_simple_fc_convergence(self):
self.check_network_convergence(simple_fc_net) self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True) self.check_network_convergence(simple_fc_net, allow_op_delay=True)
...@@ -290,7 +290,10 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -290,7 +290,10 @@ class TestMNIST(TestParallelExecutorBase):
simple_fc_net, feed_dict={"image": img, simple_fc_net, feed_dict={"image": img,
"label": label}) "label": label})
def test_simple_fc_parallel_accuracy(self): def test_simple_fc(self):
self.check_simple_fc_convergence()
def check_simple_fc_parallel_accuracy(self):
img = numpy.zeros(shape=[32, 784], dtype='float32') img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64') label = numpy.ones(shape=[32, 1], dtype='int64')
single_first_loss, single_last_loss = self.check_network_convergence( single_first_loss, single_last_loss = self.check_network_convergence(
...@@ -311,7 +314,10 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -311,7 +314,10 @@ class TestMNIST(TestParallelExecutorBase):
for p_l in parallel_last_loss: for p_l in parallel_last_loss:
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
def test_batchnorm_fc(self): def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy()
def check_batchnorm_fc_convergence(self):
self.check_network_convergence(fc_with_batchnorm) self.check_network_convergence(fc_with_batchnorm)
img = numpy.zeros(shape=[32, 784], dtype='float32') img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64') label = numpy.ones(shape=[32, 1], dtype='int64')
...@@ -319,6 +325,9 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -319,6 +325,9 @@ class TestMNIST(TestParallelExecutorBase):
fc_with_batchnorm, feed_dict={"image": img, fc_with_batchnorm, feed_dict={"image": img,
"label": label}) "label": label})
def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence()
class TestResnet(TestParallelExecutorBase): class TestResnet(TestParallelExecutorBase):
# @classmethod # @classmethod
...@@ -339,7 +348,7 @@ class TestResnet(TestParallelExecutorBase): ...@@ -339,7 +348,7 @@ class TestResnet(TestParallelExecutorBase):
# fluid.recordio_writer.convert_reader_to_recordio_file( # fluid.recordio_writer.convert_reader_to_recordio_file(
# "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress) # "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
def test_resnet(self): def check_resnet_convergence(self):
import functools import functools
batch_size = 2 batch_size = 2
self.check_network_convergence( self.check_network_convergence(
...@@ -348,6 +357,9 @@ class TestResnet(TestParallelExecutorBase): ...@@ -348,6 +357,9 @@ class TestResnet(TestParallelExecutorBase):
iter=20, iter=20,
batch_size=batch_size) batch_size=batch_size)
def test_resnet(self):
self.check_resnet_convergence()
class ModelHyperParams(object): class ModelHyperParams(object):
# Dictionary size for source and target language. This model directly uses # Dictionary size for source and target language. This model directly uses
...@@ -510,7 +522,7 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -510,7 +522,7 @@ class TestTransformer(TestParallelExecutorBase):
class ParallelExecutorTestingDuringTraining(unittest.TestCase): class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self): def check_network_convergence(self):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
...@@ -550,6 +562,9 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): ...@@ -550,6 +562,9 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
"Train loss: " + str(train_loss) + "\n Test loss:" + "Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss)) str(test_loss))
def test_parallel(self):
self.check_network_convergence()
import paddle.dataset.conll05 as conll05 import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -568,21 +583,26 @@ embedding_name = 'emb' ...@@ -568,21 +583,26 @@ embedding_name = 'emb'
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored): is_sparse, **ignored):
# 8 features # 8 features
predicate_embedding = fluid.layers.embedding( predicate_embedding = fluid.layers.embedding(
input=predicate, input=predicate,
is_sparse=is_sparse,
size=[pred_dict_len, word_dim], size=[pred_dict_len, word_dim],
dtype='float32', dtype='float32',
param_attr='vemb') param_attr='vemb')
mark_embedding = fluid.layers.embedding( mark_embedding = fluid.layers.embedding(
input=mark, size=[mark_dict_len, mark_dim], dtype='float32') input=mark,
is_sparse=is_sparse,
size=[mark_dict_len, mark_dim],
dtype='float32')
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [ emb_layers = [
fluid.layers.embedding( fluid.layers.embedding(
size=[word_dict_len, word_dim], size=[word_dict_len, word_dim],
is_sparse=is_sparse,
input=x, input=x,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input name=embedding_name, trainable=False)) for x in word_input
...@@ -632,7 +652,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -632,7 +652,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
class TestCRFModel(unittest.TestCase): class TestCRFModel(unittest.TestCase):
def test_all(self): def check_network_convergence(self, is_sparse):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
...@@ -652,6 +672,7 @@ class TestCRFModel(unittest.TestCase): ...@@ -652,6 +672,7 @@ class TestCRFModel(unittest.TestCase):
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data( mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1) name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals()) feature_out = db_lstm(**locals())
target = fluid.layers.data( target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1) name='target', shape=[1], dtype='int64', lod_level=1)
...@@ -694,3 +715,9 @@ class TestCRFModel(unittest.TestCase): ...@@ -694,3 +715,9 @@ class TestCRFModel(unittest.TestCase):
print map(numpy.array, print map(numpy.array,
pe.run(feed=feeder.feed(cur_batch), pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))[0] fetch_list=[avg_cost.name]))[0]
def test_update_sparse_parameter(self):
self.check_network_convergence(is_sparse=True)
def test_update_dense_parameter(self):
self.check_network_convergence(is_sparse=False)
...@@ -90,20 +90,22 @@ def avg_pool3D_forward_naive(x, ...@@ -90,20 +90,22 @@ def avg_pool3D_forward_naive(x,
class TestPool3d_Op(OpTest): class TestPool3d_Op(OpTest):
def setUp(self): def setUp(self):
self.op_type = "pool3d"
self.use_cudnn = False self.use_cudnn = False
self.dtype = np.float32
self.init_test_case() self.init_test_case()
self.init_global_pool() self.init_global_pool()
self.init_op_type() self.init_kernel_type()
self.init_pool_type() self.init_pool_type()
self.init_ceil_mode() self.init_ceil_mode()
if self.global_pool: if self.global_pool:
self.paddings = [0 for _ in range(len(self.paddings))] self.paddings = [0 for _ in range(len(self.paddings))]
input = np.random.random(self.shape).astype("float32") input = np.random.random(self.shape).astype(self.dtype)
output = self.pool3D_forward_naive(input, self.ksize, self.strides, output = self.pool3D_forward_naive(input, self.ksize, self.strides,
self.paddings, self.global_pool, self.paddings, self.global_pool,
self.ceil_mode).astype("float32") self.ceil_mode).astype(self.dtype)
self.inputs = {'X': input} self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
self.attrs = { self.attrs = {
'strides': self.strides, 'strides': self.strides,
...@@ -116,7 +118,7 @@ class TestPool3d_Op(OpTest): ...@@ -116,7 +118,7 @@ class TestPool3d_Op(OpTest):
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
} }
self.outputs = {'Out': output.astype('float32')} self.outputs = {'Out': output}
def testcudnn(self): def testcudnn(self):
return core.is_compiled_with_cuda() and self.use_cudnn return core.is_compiled_with_cuda() and self.use_cudnn
...@@ -129,6 +131,8 @@ class TestPool3d_Op(OpTest): ...@@ -129,6 +131,8 @@ class TestPool3d_Op(OpTest):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
if self.testcudnn() and self.pool_type != "max": if self.testcudnn() and self.pool_type != "max":
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -142,8 +146,8 @@ class TestPool3d_Op(OpTest): ...@@ -142,8 +146,8 @@ class TestPool3d_Op(OpTest):
self.strides = [1, 1, 1] self.strides = [1, 1, 1]
self.paddings = [0, 0, 0] self.paddings = [0, 0, 0]
def init_op_type(self): def init_kernel_type(self):
self.op_type = "pool3d" pass
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "avg" self.pool_type = "avg"
...@@ -158,15 +162,11 @@ class TestPool3d_Op(OpTest): ...@@ -158,15 +162,11 @@ class TestPool3d_Op(OpTest):
class TestCase1(TestPool3d_Op): class TestCase1(TestPool3d_Op):
def init_test_case(self): def init_test_case(self):
self.op_type = "pool3d"
self.shape = [2, 3, 7, 7, 7] self.shape = [2, 3, 7, 7, 7]
self.ksize = [3, 3, 3] self.ksize = [3, 3, 3]
self.strides = [1, 1, 1] self.strides = [1, 1, 1]
self.paddings = [0, 0, 0] self.paddings = [0, 0, 0]
def init_op_type(self):
self.op_type = "pool3d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "avg" self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive self.pool3D_forward_naive = avg_pool3D_forward_naive
...@@ -182,9 +182,6 @@ class TestCase2(TestPool3d_Op): ...@@ -182,9 +182,6 @@ class TestCase2(TestPool3d_Op):
self.strides = [1, 1, 1] self.strides = [1, 1, 1]
self.paddings = [1, 1, 1] self.paddings = [1, 1, 1]
def init_op_type(self):
self.op_type = "pool3d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "avg" self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive self.pool3D_forward_naive = avg_pool3D_forward_naive
...@@ -194,27 +191,18 @@ class TestCase2(TestPool3d_Op): ...@@ -194,27 +191,18 @@ class TestCase2(TestPool3d_Op):
class TestCase3(TestPool3d_Op): class TestCase3(TestPool3d_Op):
def init_op_type(self):
self.op_type = "pool3d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive self.pool3D_forward_naive = max_pool3D_forward_naive
class TestCase4(TestCase1): class TestCase4(TestCase1):
def init_op_type(self):
self.op_type = "pool3d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive self.pool3D_forward_naive = max_pool3D_forward_naive
class TestCase5(TestCase2): class TestCase5(TestCase2):
def init_op_type(self):
self.op_type = "pool3d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive self.pool3D_forward_naive = max_pool3D_forward_naive
...@@ -222,39 +210,105 @@ class TestCase5(TestCase2): ...@@ -222,39 +210,105 @@ class TestCase5(TestCase2):
#--------------------test pool3d-------------------- #--------------------test pool3d--------------------
class TestCUDNNCase1(TestPool3d_Op): class TestCUDNNCase1(TestPool3d_Op):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool3d"
class TestFP16CUDNNCase1(TestPool3d_Op):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase2(TestCase1): class TestCUDNNCase2(TestCase1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool3d"
class TestFP16CUDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase3(TestCase2): class TestCUDNNCase3(TestCase2):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool3d"
class TestFP16CUDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase4(TestCase3): class TestCUDNNCase4(TestCase3):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool3d"
class TestFP16CUDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase5(TestCase4): class TestCUDNNCase5(TestCase4):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool3d"
class TestFP16CUDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase6(TestCase5): class TestCUDNNCase6(TestCase5):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool3d"
class TestFP16CUDNNCase6(TestCase5):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCeilModeCase1(TestCUDNNCase1): class TestCeilModeCase1(TestCUDNNCase1):
......
...@@ -12,44 +12,238 @@ ...@@ -12,44 +12,238 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import core
import framework
import executor
import data_feeder
import contextlib
import io
# The module name "optimizer" clashes with the optimizer parameter of Trainer.__init__, so import it as opt_module.
import optimizer as opt_module
import distribute_transpiler
__all__ = [ __all__ = [
'Event',
'Trainer', 'Trainer',
'BeginEpochEvent',
'EndEpochEvent',
'BeginStepEvent',
'EndStepEvent',
] ]
class Event(object): class BeginEpochEvent(object):
BEGIN_EPOCH = 0 def __init__(self, epoch_id):
END_EPOCH = 1 self.epoch = epoch_id
BEGIN_STEP = 2
END_STEP = 3
class EndEpochEvent(object):
def __init__(self, epoch_id):
self.epoch = epoch_id
def __init__(self):
self.step = 0 class BeginStepEvent(object):
self.epoch = 0 def __init__(self, epoch_id, step_id):
self.type = Event.BEGIN_EPOCH self.epoch = epoch_id
self.step = step_id
class EndStepEvent(object):
def __init__(self, epoch_id, step_id):
self.epoch = epoch_id
self.step = step_id
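# A minimal sketch (added for illustration, not part of this module) of an
# event handler written against the event classes above:
def _example_event_handler(event):
    if isinstance(event, BeginEpochEvent):
        print("starting epoch %d" % event.epoch)
    elif isinstance(event, EndStepEvent):
        print("epoch %d, step %d finished" % (event.epoch, event.step))
    elif isinstance(event, EndEpochEvent):
        print("epoch %d finished" % event.epoch)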
class Trainer(object): class Trainer(object):
def __init__(self, network_func, optimizer, params=None, place=None): """
Args:
program_func(callable): A function that builds the network and returns the loss. The loss must be a scalar.
optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
place: The device place of this trainer.
"""
def __init__(self, program_func, optimizer, param_path=None, place=None):
# 1. we need to generate a framework.Program by calling # 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in # program_func. Reference: fluid.program_guard in
# test_word2vec.py # test_word2vec.py
self.scope = core.Scope()
self.startup_program = framework.Program()
self.train_program = framework.Program()
with framework.program_guard(self.train_program, self.startup_program):
loss = program_func()
if not isinstance(optimizer, opt_module.Optimizer):
raise TypeError(
"The optimizer should be an instance of Optimizer")
optimize_ops, params_grads = optimizer.minimize(loss)
self.place = Trainer._check_and_get_place(place)
self.dist_transpile_if_necessary(optimize_ops, params_grads)
# 2. move the default_main_program to self.program and run the # 2. move the default_main_program to self.program and run the
# default_startup program on an empty core.Scope() # default_startup program on an empty core.Scope()
# Run startup program
with self._prog_and_scope_guard():
exe = executor.Executor(place)
exe.run(self.startup_program)
# 3. call self.params.add_vars with the initialized scope, it if param_path:
# will add the new vars of the initialized scope into # load params from param_path into scope
# self.params. io.load_persistables(exe, dirname=param_path)
self.network_func = network_func
self.optimizer = optimizer
self.params = params
self.place = place
# TODO(helin): support distributed training
def train(self, reader, num_epochs, event_handler): def dist_transpile_if_necessary(self, optimize_ops, params_grads):
pass if "PADDLE_TRAINING_ROLE" not in os.environ:
return
# the port of all pservers, needed by both trainer and pserver
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
# comma separated ips of all pservers, needed by trainer and
# pserver
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
# total number of workers/trainers in the job, needed by
# trainer and pserver
trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
with self._prog_and_scope_guard():
t = distribute_transpiler.DistributeTranspiler()
t.transpile(
trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER":
self.train_program = t.get_pserver_program(current_endpoint)
self.startup_program = t.get_startup_program(current_endpoint,
self.train_program)
elif training_role == "TRAINER":
self.train_program = t.get_trainer_program()
else:
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
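# A hedged sketch (added for illustration) of the environment a process would
# export before constructing a Trainer so that dist_transpile_if_necessary
# takes effect; all values below are placeholders, not taken from a real cluster:
#
#   export PADDLE_TRAINING_ROLE=TRAINER      # or PSERVER
#   export PADDLE_PSERVER_PORT=6174
#   export PADDLE_PSERVER_IPS=192.168.1.2,192.168.1.3
#   export PADDLE_TRAINERS=2
#   export PADDLE_TRAINER_ID=0               # needed by trainers only
#   export PADDLE_CURRENT_IP=192.168.1.4     # needed by pservers only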
def train(self,
num_epochs,
event_handler,
reader=None,
parallel=False,
feed_order=None):
"""
Train the model.
Args:
num_epochs: The number of epochs to train. Each epoch processes all data yielded by the reader.
event_handler: The event handler, a callable with signature (event) -> None.
reader: The data reader that provides training samples.
parallel: True to train on multiple CPUs or GPUs.
feed_order: Feeding order of the reader. If None, variables are fed in the
order they are defined in the program.
Returns:
"""
if parallel:
raise NotImplementedError(
"Parallel Executor version of trainer is not implemented")
training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
if training_role == "PSERVER":
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
exe.run()
return
self._train_by_executor(num_epochs, event_handler, reader, feed_order)
def test(self, reader): def test(self, reader):
pass pass
def save_params(self, param_path):
# reference: save_persistables in io.py
exe = executor.Executor(self.place)
io.save_persistables(
exe, dirname=param_path, main_program=self.startup_program)
@staticmethod
def _check_and_get_place(place):
"""
Check the type of place or get the default place
Args:
place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on.
Raises:
TypeError if the given place is of an unsupported type.
Returns:
the original place if it is not None.
if fluid is compiled with CUDA, returns CUDAPlace(0) by default.
Otherwise returns CPUPlace by default.
"""
if place is None:
if core.is_compiled_with_cuda():
return core.CUDAPlace(0)
else:
return core.CPUPlace()
else:
if not isinstance(place, core.CUDAPlace) and not isinstance(
place, core.CPUPlace):
raise TypeError("Place should be either CUDAPlace or CPUPlace")
return place
@contextlib.contextmanager
def _prog_and_scope_guard(self):
with framework.program_guard(
main_program=self.train_program,
startup_program=self.startup_program):
with executor.scope_guard(self.scope):
yield
def _train_by_executor(self, num_epochs, event_handler, reader, feed_order):
"""
Train by Executor and single device.
Args:
num_epochs:
event_handler:
reader:
feed_order:
Returns:
"""
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
if feed_order is None:
feed_var_list = [
var
for var in self.train_program.global_block(
).vars.itervalues()
if hasattr(var, 'is_data') and var.is_data
]
else:
feed_var_list = [
self.train_program.global_block().var(var_name)
for var_name in feed_order
]
feeder = data_feeder.DataFeeder(
feed_list=feed_var_list, place=self.place)
for epoch_id in range(num_epochs):
event_handler(BeginEpochEvent(epoch_id))
for step_id, data in enumerate(reader()):
event_handler(BeginStepEvent(epoch_id, step_id))
exe.run(feed=feeder.feed(data), fetch_list=[])
event_handler(EndStepEvent(epoch_id, step_id))
event_handler(EndEpochEvent(epoch_id))
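# A minimal usage sketch (added for illustration) of the Trainer API defined
# above. `build_net` and `train_reader` are hypothetical placeholders: the
# first is a program_func returning a scalar loss, the second a
# paddle.batch(...) reader yielding ('words', 'label') samples.
def _example_training_loop(build_net, train_reader):
    import paddle.fluid as fluid

    def handler(event):
        if isinstance(event, fluid.EndEpochEvent):
            trainer.save_params("./example_params")

    trainer = fluid.Trainer(build_net,
                            optimizer=fluid.optimizer.SGD(learning_rate=0.01),
                            place=fluid.CPUPlace())
    trainer.train(num_epochs=1,
                  event_handler=handler,
                  reader=train_reader,
                  feed_order=["words", "label"])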
...@@ -11,29 +11,12 @@ ...@@ -11,29 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from distribute_transpiler import DistributeTranspiler
from . import core from inference_transpiler import InferenceTranspiler
from memory_optimization_transpiler import memory_optimize, release_memory
__all__ = ['Params', ] from distribute_transpiler_simple import SimpleDistributeTranspiler
__all__ = [
class Params(object): "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler",
def __init__(self, path=None): "memory_optimize", "release_memory"
self.scope = core.Scope() ]
if path:
self._load(path)
def _load(self, path):
# reference: load_persistables in io.py
pass
def save(self, path):
# reference: save_persistables in io.py
pass
def add_params(self, scope):
# take the keys from the scope,
# if not already exists in self.scope,
# add the key and value into self.scope.
pass
...@@ -17,9 +17,8 @@ from __future__ import print_function ...@@ -17,9 +17,8 @@ from __future__ import print_function
import math import math
import distributed_splitter as splitter import distributed_splitter as splitter
import framework from .. import core
from framework import Program, default_main_program, Variable, Parameter from ..framework import Program, default_main_program, Variable, Parameter
from . import core
LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_TYPE = "lookup_table"
LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
...@@ -135,10 +134,18 @@ def split_dense_variable(var_list, ...@@ -135,10 +134,18 @@ def split_dense_variable(var_list,
return blocks return blocks
def delete_ops(block, ops):
try:
start = list(block.ops).index(ops[0])
end = list(block.ops).index(ops[-1])
[block.remove_op(start) for _ in xrange(end - start + 1)]
except Exception, e:
raise e
block.program.sync_with_cpp()
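# Note (added for clarity): delete_ops assumes the ops passed in form one
# contiguous run inside block.ops; it removes every op from the index of
# ops[0] through the index of ops[-1] and then syncs the Python block with
# the underlying C++ program desc.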
class DistributeTranspiler: class DistributeTranspiler:
def transpile(self, def transpile(self,
optimize_ops,
params_grads,
trainer_id, trainer_id,
program=None, program=None,
pservers="127.0.0.1:6174", pservers="127.0.0.1:6174",
...@@ -169,11 +176,6 @@ class DistributeTranspiler: ...@@ -169,11 +176,6 @@ class DistributeTranspiler:
4. append ops that should run on current server instance. 4. append ops that should run on current server instance.
5. add listen_and_serv op 5. add listen_and_serv op
:param optimize_ops: op list of optimization, should be the
return value of Optimizer.minimize
:type optimize_ops: list
:param params_grads: list of tuple(weight, gradient)
:type params_grads: list
:param trainer_id: one unique id for each trainer in a job. :param trainer_id: one unique id for each trainer in a job.
:type trainer_id: int :type trainer_id: int
:param program: program to transpile, default is default_main_program :param program: program to transpile, default is default_main_program
...@@ -194,7 +196,6 @@ class DistributeTranspiler: ...@@ -194,7 +196,6 @@ class DistributeTranspiler:
program = default_main_program() program = default_main_program()
self.origin_program = program self.origin_program = program
self.trainer_num = trainers self.trainer_num = trainers
self.optimize_ops = optimize_ops
self.sync_mode = sync_mode self.sync_mode = sync_mode
# TODO(typhoonzero): currently trainer_id is fetched from cluster system # TODO(typhoonzero): currently trainer_id is fetched from cluster system
# like Kubernetes, we should port this to use etcd later when developing # like Kubernetes, we should port this to use etcd later when developing
...@@ -202,6 +203,7 @@ class DistributeTranspiler: ...@@ -202,6 +203,7 @@ class DistributeTranspiler:
self.trainer_id = trainer_id self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",") pserver_endpoints = pservers.split(",")
self.pserver_endpoints = pserver_endpoints self.pserver_endpoints = pserver_endpoints
self.optimize_ops, params_grads = self._get_optimize_pass()
# process lookup_table_op # process lookup_table_op
# 1. check all lookup_table_op is distributed # 1. check all lookup_table_op is distributed
...@@ -324,8 +326,7 @@ class DistributeTranspiler: ...@@ -324,8 +326,7 @@ class DistributeTranspiler:
def get_trainer_program(self): def get_trainer_program(self):
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
self.origin_program.global_block().delete_ops(self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
self.origin_program.sync_with_cpp()
# FIXME(typhoonzero): serialize once will fix error occurs when clone. # FIXME(typhoonzero): serialize once will fix error occurs when clone.
self.origin_program.__str__() self.origin_program.__str__()
return self.origin_program return self.origin_program
...@@ -408,11 +409,8 @@ class DistributeTranspiler: ...@@ -408,11 +409,8 @@ class DistributeTranspiler:
# HACK: optimization global ops only used to scale beta1 and beta2 # HACK: optimization global ops only used to scale beta1 and beta2
# replace it with dependency engine. # replace it with dependency engine.
for op in self.optimize_ops: for op in self.optimize_ops:
if op.type == "scale": if self._is_adam_connected_op(op):
for in_name in op.input_arg_names: global_ops.append(op)
if in_name.startswith("beta1_pow_acc") or \
in_name.startswith("beta2_pow_acc"):
global_ops.append(op)
def __append_optimize_op__(op, block, grad_to_block_id): def __append_optimize_op__(op, block, grad_to_block_id):
if self._is_opt_op(op): if self._is_opt_op(op):
...@@ -612,8 +610,7 @@ class DistributeTranspiler: ...@@ -612,8 +610,7 @@ class DistributeTranspiler:
attrs={"axis": 0}) attrs={"axis": 0})
# delete lookup_table_op # delete lookup_table_op
program.global_block().delete_ops([op]) delete_ops(program.global_block(), [op])
program.sync_with_cpp()
# break for loop # break for loop
break break
...@@ -1147,3 +1144,32 @@ class DistributeTranspiler: ...@@ -1147,3 +1144,32 @@ class DistributeTranspiler:
# we only need to append op for once # we only need to append op for once
break break
return lr_ops return lr_ops
def _get_optimize_pass(self):
block = self.origin_program.global_block()
opt_ops = []
params_grads = []
for op in block.ops:
if self._is_opt_op(op):
opt_ops.append(op)
params_grads.append((self.origin_program.global_block().var(
op.input("Param")[0]),
self.origin_program.global_block().var(
op.input("Grad")[0])))
elif self._is_adam_connected_op(op):
opt_ops.append(op)
else:
pass
return opt_ops, params_grads
def _is_adam_connected_op(self, op):
"""
A hack function to determine whether the input operator
is connected to an optimize operator.
"""
if op.type == "scale":
for in_name in op.input_arg_names:
if in_name.startswith("beta1_pow_acc") or \
in_name.startswith("beta2_pow_acc"):
return True
return False
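# A hedged sketch (added for illustration) of driving the reworked transpiler
# above, now that optimize_ops/params_grads are derived internally by
# _get_optimize_pass(); the endpoints, counts, and training_role variable
# below are placeholders:
#
#     t = DistributeTranspiler()
#     t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=2)
#     if training_role == "PSERVER":
#         pserver_prog = t.get_pserver_program("127.0.0.1:6174")
#         startup_prog = t.get_startup_program("127.0.0.1:6174", pserver_prog)
#     else:
#         trainer_prog = t.get_trainer_program()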
...@@ -12,10 +12,8 @@ ...@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import framework from ..framework import Program, default_main_program, Parameter, Variable
from framework import Program, default_main_program, Parameter, Variable from ..layer_helper import LayerHelper
import optimizer
from layer_helper import LayerHelper
def hash_name_to_server(params_grads, pserver_endpoints): def hash_name_to_server(params_grads, pserver_endpoints):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .. import core
from ..framework import Program
from ..executor import global_scope
class InferenceTranspiler:
def transpile(self, program, place, scope=None):
'''
Transpile the program. Only batch normalization fusion is supported for now.
:param program: program to transpile
:type program: Program
:param place: inference place
:type place: Place
:param scope: inference scope
:type scope: Scope or None
'''
if not isinstance(program, Program):
raise TypeError("program should be as Program type")
if not isinstance(place, core.CPUPlace) and not isinstance(
place, core.CUDAPlace):
raise TypeError("place should be as CPUPlace/CUDAPlace type")
if scope is None:
scope = global_scope()
if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope)
def fuse_batch_norm(self, program, place, scope):
'''
Transpile the program by fused batch normalization.
A batch normalization that follows a convolution or fully connected layer
can be folded into that layer. Doing so accelerates the forward pass,
especially in mobile or embedded environments.
For input X:
- Conv process: X = input * W + bias
- Batch norm process: X' = (X - mean) / std
- Scale Process: Y = a * X' + b
After fuse into one operation:
Y = (input * W + bias - mean) / std * a + b
= input * a * W / std + ((bias - mean) / std * a + b)
The operator transformation is:
- before:
- conv->batch_norm->any_other_op (bias == 0)
- conv->elementwise_add->batch_norm->any_other_op (bias != 0)
- after:
- conv->elementwise_add->any_other_op
The transpile stages are:
1. insert elementwise_add op when bias == 0.
2. fuse the batch_norm's parameters to conv and elementwise_add operators.
3. remove batch_norm ops which are not used in any other ops.
4. adjust the input of any_other_op to be the output of elementwise_add operator.
5. remove unused variables.
:param program: program to transpile
:type program: Program
:param place: inference place
:type place: Place
:param scope: inference scope
:type scope: Scope
'''
self.scope = scope
self.place = place
self.block = program.block(0)
self.input_map = {} # store the input names should be adjusted
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
# TODO(luotao1): only conv2d is considered now. fc will be dealt with later.
if current_op.type in ['conv2d']:
# TODO(luotao1): only single-chain networks are considered now.
# For branching networks, block.ops[i + 1] cannot be used as
# the judgment condition.
next_op = self.block.ops[i + 1]
# conv2d without bias
if (next_op.type == 'batch_norm'):
# insert bias op
bias_op = self._insert_bias_op(i + 1, current_op, next_op)
# fuse batch_norm
self._fuse_param(current_op, next_op, bias_op, 0)
# remove batch_norm_op
self.block.remove_op(i + 2)
i = i + 1
# conv2d with bias, the next_op.type is elementwise_add
elif (next_op.type == 'elementwise_add'):
next_next_op = self.block.ops[i + 2]
if (next_next_op.type == 'batch_norm'):
# fuse batch_norm
self._fuse_param(current_op, next_next_op, next_op, 1)
# remove batch_norm_op
self.block.remove_op(i + 2)
i = i + 1
i = i + 1
self._adjust_input()
self._remove_unused_var()
# TODO(luotao): use the clone() method to force a flush of program.desc,
# since a large program.desc may not be flushed immediately.
# A better solution will be considered later.
program = program.clone()
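# A small numpy check (added for illustration; none of this is used by the
# transpiler) of the fusion algebra from the docstring above: folding the
# batch-norm statistics into the preceding layer's weight and bias leaves
# the output unchanged.
import numpy as np

x = np.random.rand(4, 3).astype('float32')              # layer input
w = np.random.rand(3, 2).astype('float32')              # layer weight
b = np.random.rand(2).astype('float32')                 # layer bias
mean, var = np.random.rand(2), np.random.rand(2)        # batch-norm statistics
a, b_bn = np.random.rand(2), np.random.rand(2)          # batch-norm scale / bias
std = np.sqrt(var + 1e-5)

y_ref = (x.dot(w) + b - mean) / std * a + b_bn          # layer -> batch_norm
w_fused = w * (a / std)                                  # folded weight
b_fused = (b - mean) / std * a + b_bn                    # folded bias
y_fused = x.dot(w_fused) + b_fused
assert np.allclose(y_ref, y_fused, atol=1e-5)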
# ====================== private transpiler functions =====================
def _insert_bias_op(self, index, current_op, bn_op):
'''
Construct elementwise_add operator for adding bias
and insert it into program.
:param index: insert location of bias_op
:type index: Int
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:return: bias_op
:rtype: Operator
'''
# The input of bias_op is current_op's output and Bias of bn_op
# The output of bias_op is bn_op's output
x_var = self.block.var(current_op.output("Output")[0])
y_var = self.block.var(bn_op.input("Bias")[0])
out_var = self.block.var(bn_op.output("Y")[0])
bias_op = self.block.insert_op(
index,
type="elementwise_add",
inputs={"X": x_var,
"Y": y_var},
outputs={"Out": out_var},
attrs={"axis": 1}) # dim_start=1
return bias_op
def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
'''
Fuse the batch_norm op's parameters into current_op (conv or fc).
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:param bias_op: elementwise_add operator for adding bias
:type bias_op: Operator
:param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
:type with_bias: Int
'''
def _update_param(op, old_param_name, new_param):
# To keep the original variables unchanged,
# create new variables in the scope to store the new parameters.
old_param_name = old_param_name[0]
old_var = self.block.vars[old_param_name]
new_param_name = old_param_name + '_fuse_bn'
new_var = self.block.create_parameter(
name=new_param_name.encode('ascii'),
type=old_var.type,
dtype=old_var.dtype,
shape=old_var.shape)
op.rename_input(old_param_name, new_param_name)
self.scope.var(new_param_name)
tensor = self.scope.find_var(new_param_name).get_tensor()
tensor.set(np.array(new_param), self.place)
def _load_param(param_name):
return np.array(self.scope.find_var(param_name[0]).get_tensor())
bias_bn = _load_param(bn_op.input("Bias")) #Bias
scale_bn = _load_param(bn_op.input("Scale")) #Scale
mean_bn = _load_param(bn_op.input("Mean")) #Mean
var_bn = _load_param(bn_op.input("Variance")) #Variance
# TODO(luotao1): only conv2d is considered now. fc will be dealt with later.
current_param = _load_param(current_op.input("Filter"))
std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
tmp = np.float32(np.divide(scale_bn, std_bn))
# add bias of batch_norm_op to conv2d
if with_bias:
bias = _load_param(bias_op.input("Y"))
else:
bias = np.zeros(bias_bn.shape)
bias = np.float32(
np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
# re-compute weight of conv2d
tmp = tmp.reshape(tmp.shape[0], -1)
dst_param = current_param.reshape((tmp.shape[0], -1))
dst_param = np.float32(np.multiply(dst_param, tmp))
dst_param = dst_param.reshape(current_param.shape)
# update parameters
_update_param(current_op, current_op.input("Filter"), dst_param)
_update_param(bias_op, bias_op.input("Y"), bias)
# collect the renamed input
self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
def _adjust_input(self):
for i in range(len(self.block.ops)):
current_op = self.block.ops[i]
for input_arg in current_op.input_arg_names:
if input_arg in self.input_map:
current_op.rename_input(input_arg,
self.input_map[input_arg])
def _remove_unused_var(self):
'''
Remove unused variables from the program.
'''
args = []
for i in range(len(self.block.ops)):
current_op = self.block.ops[i]
args += current_op.input_arg_names
args += current_op.output_arg_names
args = list(set(args)) # unique the input and output arguments
for var in self.block.vars.keys():
if var not in args:
self.block.remove_var(var)
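# A minimal usage sketch (added for illustration) of the transpiler above;
# `inference_program` is a hypothetical Program loaded or built elsewhere:
#
#     t = InferenceTranspiler()
#     t.transpile(inference_program, core.CPUPlace())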
...@@ -13,11 +13,9 @@ ...@@ -13,11 +13,9 @@
# limitations under the License. # limitations under the License.
from collections import defaultdict from collections import defaultdict
import framework from .. import core
from framework import Program, default_main_program, Parameter, Variable from ..framework import Program, default_main_program, Parameter, Variable
import backward from ..backward import _rename_arg_
from backward import _rename_arg_
from . import core
dtype_to_size = { dtype_to_size = {
core.VarDesc.VarType.FP16: 2, core.VarDesc.VarType.FP16: 2,
......