diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da92bc8c44ca75b267a768ba8ea22bd8b..90c25e435083d78ad4c123999a588aaf9092f719 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,18 +7,14 @@
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+ sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
-# TODO(yuyang): trailing whitespace has some bugs on markdown
-# files now, please not add it to pre-commit hook now
-# - id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements not fit for Paddle, because
-# not all of our python code is runnable. Some are used for
-# documenation
-# - id: debug-statements
+- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+ hooks:
+ - id: clang-formater
diff --git a/.travis.yml b/.travis.yml
index ffe3bc193b49eb3b3318cbbc7f1c3d86dc205c14..effcf90769647960d55b971af0939496dc850e7a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ addons:
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
- if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
+ if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7e7e49e9a038acc6ca272433cd39b08c812eccc..af193c27ae7d802a8724fdc1e23b4b5b583e9f7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
-set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b3)
+set(PADDLE_MINOR_VERSION 9)
+set(PADDLE_PATCH_VERSION 0a0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -115,7 +116,6 @@ else()
endif(WITH_AVX)
if(WITH_DSO)
- set(CUDA_LIBRARIES "")
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
@@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
+if(NOT WITH_PROFILER)
+ add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
diff --git a/README.md b/README.md
index 81ff8c7122ab8f1e39ef14a056532bb85cc57c77..8a8e15841586ae6a01bb93e94f6074189f556f5a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
# PaddlePaddle
-[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
-[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
+[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
+[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
Welcome to the PaddlePaddle GitHub.
@@ -14,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
## Features
@@ -26,15 +29,15 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
connection.
- **Efficiency**
-
+
In order to unleash the power of heterogeneous computing resource,
optimization occurs at different levels of PaddlePaddle, including
computing, memory, architecture and communication. The following are some
examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
- (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
- - Highly optimized recurrent networks which can handle **variable-length**
+ (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+ - Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
- Optimized local and distributed training for models with high dimensional
sparse data.
@@ -57,41 +60,39 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
## Installation
Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
-pre-built packages (**docker image**, **deb package**) or
+pre-built packages (**docker image**, **deb package**) or
directly build on **Linux** and **Mac OS X** from the source code.
-
+
## Documentation
Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en)
You can follow the quick start tutorial to learn how use PaddlePaddle
step-by-step.
-
+
- [Example and Demo](http://paddlepaddle.org/doc/demo/)
We provide five demos, including: image classification, sentiment analysis,
- sequence to sequence model, recommendation, semantic role labeling.
-
+ sequence to sequence model, recommendation, semantic role labeling.
+
- [Distributed Training](http://paddlepaddle.org/doc/cluster)
This system supports training deep learning models on multiple machines
with data parallelism.
-
+
- [Python API](http://paddlepaddle.org/doc/ui/)
PaddlePaddle supports using either Python interface or C++ to build your
system. We also use SWIG to wrap C++ source code to create a user friendly
interface for Python. You can also use SWIG to create interface for your
favorite programming language.
-
+
- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html)
We sincerely appreciate your interest and contributions. If you would like to
- contribute, please read the contribution guide.
+ contribute, please read the contribution guide.
- [Source Code Documents](http://paddlepaddle.org/doc/source/)
## Ask Questions
-Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to
-**paddle-dev@baidu.com** to ask questions and talk about methods and models.
-Framework development discussions and
-bug reports are collected on [Issues](https://github.com/baidu/paddle/issues).
+
+You are welcome to submit questions and bug reports as [GitHub Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index ed9b5220fff6a434cd332f0972d39c4149b3ebfe..db0a057bf35b4ad04a08a1e3f1fad3bd6a486350 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -24,7 +24,7 @@ paddle train \
--test_all_data_in_one_period=1 \
--use_gpu=1 \
--trainer_count=1 \
---num_passes=200 \
+--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index 7821850fb25cc5b87aa305c2113efbf50b093ed1..6d647f5dd9368eaf81c19386511c7d231e4799e3 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -18,7 +18,5 @@ set -x
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
- # following is the google drive address
- # you can also directly download from https://pan.baidu.com/s/1o8q577s
- wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
+ wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
done
diff --git a/demo/model_zoo/resnet/get_model.sh b/demo/model_zoo/resnet/get_model.sh
index 89312d43edf8e4e7d639be73d5b3983ea916b902..133d08fca431540f2ed5cd6e63b51d9ce3a1b344 100755
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
- # following is the google drive address
- # you can also directly download from https://pan.baidu.com/s/1o8q577s
- wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
+ wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh
index 58a72147c5e41351634395e770e9a214ed3cb01d..c9190e2dd2ef754bf3c7287006322b52493dc3a0 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -23,7 +23,7 @@ set -e
export LC_ALL=C
UNAME_STR=`uname`
-if [[ ${UNAME_STR} == 'Linux' ]]; then
+if [ "${UNAME_STR}" = 'Linux' ]; then # quoted operand and POSIX '=' so single-bracket test is portable
SHUF_PROG='shuf'
else
SHUF_PROG='gshuf'
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index 2982e54c665b41400aab0a893ff3c76335404988..daca5f01cf2b3bd231bf530f17ec760272ce93e0 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -17,24 +17,15 @@ import os
from optparse import OptionParser
-def extract_dict_features(pair_file, feature_file, src_dict_file,
- tgt_dict_file):
- src_dict = set()
- tgt_dict = set()
-
- with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open(
- src_dict_file, 'w') as src_dict_out, open(tgt_dict_file,
- 'w') as tgt_dict_out:
+def extract_dict_features(pair_file, feature_file):
+
+ with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
for line in fin:
- sentence, labels = line.strip().split('\t')
+ sentence, predicate, labels = line.strip().split('\t')
sentence_list = sentence.split()
labels_list = labels.split()
- src_dict.update(sentence_list)
- tgt_dict.update(labels_list)
-
verb_index = labels_list.index('B-V')
- verb_feature = sentence_list[verb_index]
mark = [0] * len(labels_list)
if verb_index > 0:
@@ -42,47 +33,50 @@ def extract_dict_features(pair_file, feature_file, src_dict_file,
ctx_n1 = sentence_list[verb_index - 1]
else:
ctx_n1 = 'bos'
- ctx_n1_feature = ctx_n1
+
+ if verb_index > 1:
+ mark[verb_index - 2] = 1
+ ctx_n2 = sentence_list[verb_index - 2]
+ else:
+ ctx_n2 = 'bos'
mark[verb_index] = 1
- ctx_0_feature = sentence_list[verb_index]
+ ctx_0 = sentence_list[verb_index]
if verb_index < len(labels_list) - 2:
mark[verb_index + 1] = 1
ctx_p1 = sentence_list[verb_index + 1]
else:
ctx_p1 = 'eos'
- ctx_p1_feature = ctx_p1
+
+ if verb_index < len(labels_list) - 3:
+ mark[verb_index + 2] = 1
+ ctx_p2 = sentence_list[verb_index + 2]
+ else:
+ ctx_p2 = 'eos'
+
feature_str = sentence + '\t' \
- + verb_feature + '\t' \
- + ctx_n1_feature + '\t' \
- + ctx_0_feature + '\t' \
- + ctx_p1_feature + '\t' \
+ + predicate + '\t' \
+ + ctx_n2 + '\t' \
+ + ctx_n1 + '\t' \
+ + ctx_0 + '\t' \
+ + ctx_p1 + '\t' \
+ + ctx_p2 + '\t' \
+ ' '.join([str(i) for i in mark]) + '\t' \
+ labels
feature_out.write(feature_str + '\n')
- src_dict_out.write('\n')
- src_dict_out.write('\n'.join(list(src_dict)))
-
- tgt_dict_out.write('\n'.join(list(tgt_dict)))
if __name__ == '__main__':
- usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary '
+ usage = '-p pair_file -f feature_file'
parser = OptionParser(usage)
parser.add_option('-p', dest='pair_file', help='the pair file')
- parser.add_option(
- '-f', dest='feature_file', help='the file to store feature')
- parser.add_option(
- '-s', dest='src_dict', help='the file to store source dictionary')
- parser.add_option(
- '-t', dest='tgt_dict', help='the file to store target dictionary')
+ parser.add_option('-f', dest='feature_file', help='the feature file')
(options, args) = parser.parse_args()
- extract_dict_features(options.pair_file, options.feature_file,
- options.src_dict, options.tgt_dict)
+ extract_dict_features(options.pair_file, options.feature_file)
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
index 4d1bef8f958a62be9941d474a0b67542dcc5cfab..86ab00ce41723169de035a841d9e129a1b9e82a3 100644
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -51,7 +51,7 @@ def read_sentences(words_file):
for line in fin:
line = line.strip()
if line == '':
- sentences.append(s.lower())
+ sentences.append(s)
s = ''
else:
s += line + ' '
@@ -64,6 +64,11 @@ def transform_labels(sentences, labels):
if len(labels[i]) == 1:
continue
else:
+ verb_list = []
+ for x in labels[i][0]:
+ if x !='-':
+ verb_list.append(x)
+
for j in xrange(1, len(labels[i])):
label_list = labels[i][j]
current_tag = 'O'
@@ -88,8 +93,7 @@ def transform_labels(sentences, labels):
is_in_bracket = True
else:
print 'error:', ll
-
- sen_lab_pair.append((sentences[i], label_seq))
+ sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
return sen_lab_pair
@@ -97,9 +101,9 @@ def write_file(sen_lab_pair, output_file):
with open(output_file, 'w') as fout:
for x in sen_lab_pair:
sentence = x[0]
- label_seq = ' '.join(x[1])
- assert len(sentence.split()) == len(x[1])
- fout.write(sentence + '\t' + label_seq + '\n')
+ label_seq = ' '.join(x[2])
+ assert len(sentence.split()) == len(x[2])
+ fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n')
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
index 268c0995e27006ec62f38bdda9b0a0994dab096c..55e33f4685627ed483aa6642c518a33558091531 100644
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -14,6 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
@@ -22,4 +26,4 @@ gunzip test.wsj.words.gz
gunzip test.wsj.props.gz
python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict
+python extract_dict_feature.py -p test.wsj.seq_pair -f feature
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index 5c003584a52d459f13b7942ebe3a7147ac58a42f..2c8e13462730a2e980fa1c3fe342ef0e062ab5d7 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -17,41 +17,52 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
-def hook(settings, word_dict, label_dict, **kwargs):
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
settings.word_dict = word_dict
settings.label_dict = label_dict
+ settings.predicate_dict = predicate_dict
+
#all inputs are integral and sequential type
settings.slots = [
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)), integer_value_sequence(2),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
+ integer_value_sequence(2),
integer_value_sequence(len(label_dict))
]
-@provider(init_hook=hook)
-def process(obj, file_name):
+def get_batch_size(yeild_data):
+ return len(yeild_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
with open(file_name, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
line.strip().split('\t')
-
+
words = sentence.split()
sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ predicate_slot = [settings.predicate_dict.get(predicate, UNK_IDX)] * sen_len  # unknown predicates fall back to UNK_IDX, as in predict.py, instead of None
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index e3f6edad6972112ed04e173a9b714e3fec13d402..54ceff0e724220cc9ea96b9e0ec6844947a8343e 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -18,8 +18,9 @@ import sys
from paddle.trainer_config_helpers import *
#file paths
-word_dict_file = './data/src.dict'
-label_dict_file = './data/tgt.dict'
+word_dict_file = './data/wordDict.txt'
+label_dict_file = './data/targetDict.txt'
+predicate_file= './data/verbDict.txt'
train_list_file = './data/train.list'
test_list_file = './data/test.list'
@@ -30,8 +31,10 @@ if not is_predict:
#load dictionaries
word_dict = dict()
label_dict = dict()
+ predicate_dict = dict()
with open(word_dict_file, 'r') as f_word, \
- open(label_dict_file, 'r') as f_label:
+ open(label_dict_file, 'r') as f_label, \
+ open(predicate_file, 'r') as f_pre:
for i, line in enumerate(f_word):
w = line.strip()
word_dict[w] = i
@@ -40,6 +43,11 @@ if not is_predict:
w = line.strip()
label_dict[w] = i
+ for i, line in enumerate(f_pre):
+ w = line.strip()
+ predicate_dict[w] = i
+
+
if is_test:
train_list_file = None
@@ -50,91 +58,157 @@ if not is_predict:
module='dataprovider',
obj='process',
args={'word_dict': word_dict,
- 'label_dict': label_dict})
+ 'label_dict': label_dict,
+ 'predicate_dict': predicate_dict })
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
+ pred_len = len(predicate_dict)
else:
word_dict_len = get_config_arg('dict_len', int)
label_dict_len = get_config_arg('label_len', int)
+ pred_len = get_config_arg('pred_len', int)
+############################## Hyper-parameters ##################################
mark_dict_len = 2
word_dim = 32
mark_dim = 5
-hidden_dim = 128
+hidden_dim = 512
depth = 8
-emb_lr = 1e-2
-fc_lr = 1e-2
-lstm_lr = 2e-2
+
+
+
+########################### Optimizer #######################################
+
settings(
batch_size=150,
- learning_method=AdamOptimizer(),
- learning_rate=1e-3,
+ learning_method=MomentumOptimizer(momentum=0),
+ learning_rate=2e-2,
regularization=L2Regularization(8e-4),
- gradient_clipping_threshold=25)
+ is_async=False,
+ model_average=ModelAverage(average_window=0.5,
+ max_average_window=10000),
+
+)
-#6 features
+
+
+
+####################################### network ##############################
+#8 features and 1 target
word = data_layer(name='word_data', size=word_dict_len)
-predicate = data_layer(name='verb_data', size=word_dict_len)
+predicate = data_layer(name='verb_data', size=pred_len)
+
+ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
+ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
mark = data_layer(name='mark_data', size=mark_dict_len)
+
if not is_predict:
target = data_layer(name='target', size=label_dict_len)
-ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr)
-layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-fc_para_attr = ParameterAttribute(learning_rate=fc_lr)
-lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr)
-para_attr = [fc_para_attr, lstm_para_attr]
-word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt)
-predicate_embedding = embedding_layer(
- size=word_dim, input=predicate, param_attr=ptt)
-ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt)
-ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt)
-ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt)
-mark_embedding = embedding_layer(size=mark_dim, input=mark)
+default_std=1/math.sqrt(hidden_dim)/3.0
+
+emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
+std_0 = ParameterAttribute(initial_std=0.)
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
+mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
+emb_layers.append(predicate_embedding)
+emb_layers.append(mark_embedding)
hidden_0 = mixed_layer(
+ name='hidden0',
size=hidden_dim,
- input=[
- full_matrix_projection(input=word_embedding),
- full_matrix_projection(input=predicate_embedding),
- full_matrix_projection(input=ctx_n1_embedding),
- full_matrix_projection(input=ctx_0_embedding),
- full_matrix_projection(input=ctx_p1_embedding),
- full_matrix_projection(input=mark_embedding),
- ])
+ bias_attr=std_default,
+ input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
+
-lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr)
+mix_hidden_lr = 1e-3
+lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
+hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(name='lstm0',
+ input=hidden_0,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
#stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
+
for i in range(1, depth):
- fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr)
+ mix_hidden = mixed_layer(name='hidden'+str(i),
+ size=hidden_dim,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+ ]
+ )
+
+ lstm = lstmemory(name='lstm'+str(i),
+ input=mix_hidden,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ reverse=((i % 2)==1),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
+
+ input_tmp = [mix_hidden, lstm]
+
+feature_out = mixed_layer(name='output',
+ size=label_dict_len,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+ ],
+ )
- lstm = lstmemory(
- input=fc,
- act=ReluActivation(),
- reverse=(i % 2) == 1,
- layer_attr=layer_attr)
- input_tmp = [fc, lstm]
-prob = fc_layer(
- input=input_tmp,
- size=label_dict_len,
- act=SoftmaxActivation(),
- param_attr=para_attr)
if not is_predict:
- cls = classification_cost(input=prob, label=target)
- outputs(cls)
+ crf_l = crf_layer( name = 'crf',
+ size = label_dict_len,
+ input = feature_out,
+ label = target,
+ param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
+
+ )
+
+
+ crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
+ size = label_dict_len,
+ input = feature_out,
+ label = target,
+ param_attr=ParameterAttribute(name='crfw')
+ )
+
+
+ eval = sum_evaluator(input=crf_dec_l)
+
+ outputs(crf_l)
+
else:
- outputs(prob)
+ crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
+ size = label_dict_len,
+ input = feature_out,
+ param_attr=ParameterAttribute(name='crfw')
+ )
+
+ outputs(crf_dec_l)
+
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
index f051d4175cf6fff43bd7f84b457ab9dd12405a15..a7f1e8f81f59f6fe95fd29593ef1a826e652e570 100644
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -26,7 +26,7 @@ UNK_IDX = 0
class Prediction():
- def __init__(self, train_conf, dict_file, model_dir, label_file):
+ def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
@@ -35,26 +35,37 @@ class Prediction():
self.dict = {}
self.labels = {}
+ self.predicate_dict={}
self.labels_reverse = {}
- self.load_dict_label(dict_file, label_file)
+ self.load_dict_label(dict_file, label_file, predicate_dict_file)
len_dict = len(self.dict)
len_label = len(self.labels)
-
- conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
- ',label_len=' + str(len_label) + ',is_predict=True')
+ len_pred = len(self.predicate_dict)
+
+ conf = parse_config(
+ train_conf,
+ 'dict_len=' + str(len_dict) +
+ ',label_len=' + str(len_label) +
+ ',pred_len=' + str(len_pred) +
+ ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
- integer_value_sequence(len_dict), integer_value_sequence(len_dict),
- integer_value_sequence(len_dict), integer_value_sequence(len_dict),
- integer_value_sequence(len_dict), integer_value_sequence(2)
- ]
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_pred),
+ integer_value_sequence(2)
+ ]
self.converter = DataProviderConverter(slots)
- def load_dict_label(self, dict_file, label_file):
+ def load_dict_label(self, dict_file, label_file, predicate_dict_file):
"""
Load dictionary from self.dict_file.
"""
@@ -65,39 +76,42 @@ class Prediction():
self.labels[line.strip()] = line_count
self.labels_reverse[line_count] = line.strip()
+ for line_count, line in enumerate(open(predicate_dict_file, 'r')):
+ self.predicate_dict[line.strip()] = line_count
def get_data(self, data_file):
"""
Get input data of paddle format.
"""
with open(data_file, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip(
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
).split('\t')
words = sentence.split()
sen_len = len(words)
-
+
word_slot = [self.dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len
+ predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+ ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
+
+ yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot
-
- def predict(self, data_file):
+ def predict(self, data_file, output_file):
"""
data_file: file name of input data.
"""
input = self.converter(self.get_data(data_file))
output = self.network.forwardTest(input)
- prob = output[0]["value"]
- lab = list(np.argsort(-prob)[:, 0])
+ lab = output[0]["id"].tolist()
- with open(data_file, 'r') as fin, open('predict.res', 'w') as fout:
+ with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
index = 0
for line in fin:
sen = line.split('\t')[0]
@@ -109,8 +123,8 @@ class Prediction():
def option_parser():
- usage = ("python predict.py -c config -w model_dir "
- "-d word dictionary -l label_file -i input_file")
+ usage = ("python predict.py -c config -w model_dir "
+ "-d word dictionary -l label_file -i input_file -p pred_dict_file -o output_file")
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-c",
@@ -131,6 +145,13 @@ def option_parser():
dest="label_file",
default=None,
help="label file")
+ parser.add_option(
+ "-p",
+ "--predict_dict_file",
+ action="store",
+ dest="predict_dict_file",
+ default=None,
+ help="predict_dict_file")
parser.add_option(
"-i",
"--data",
@@ -144,6 +165,14 @@ def option_parser():
dest="model_path",
default=None,
help="model path")
+
+ parser.add_option(
+ "-o",
+ "--output_file",
+ action="store",
+ dest="output_file",
+ default="predict.res",  # keep the previously hard-coded output name when -o is omitted
+ help="output file")
return parser.parse_args()
@@ -154,10 +183,12 @@ def main():
dict_file = options.dict_file
model_path = options.model_path
label_file = options.label_file
+ predict_dict_file = options.predict_dict_file
+ output_file = options.output_file
swig_paddle.initPaddle("--use_gpu=0")
- predict = Prediction(train_conf, dict_file, model_path, label_file)
- predict.predict(data_file)
+ predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
+ predict.predict(data_file,output_file)
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
index a545b9a5d591b41bdbd54905cbbffc410abc8fb0..88ab5898f7d41056f4fe549b3145760783b27bf9 100644
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
- sort | head -n 1
+ sort -n | head -n 1
}
log=train.log
@@ -26,15 +26,18 @@ LOG=`get_best_pass $log`
LOG=(${LOG})
best_model_path="output/pass-${LOG[1]}"
-
config_file=db_lstm.py
-dict_file=./data/src.dict
-label_file=./data/tgt.dict
+dict_file=./data/wordDict.txt
+label_file=./data/targetDict.txt
+predicate_dict_file=./data/verbDict.txt
input_file=./data/feature
+output_file=predict.res
python predict.py \
-c $config_file \
-w $best_model_path \
-l $label_file \
+ -p $predicate_dict_file \
-d $dict_file \
- -i $input_file
+ -i $input_file \
+ -o $output_file
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 844649e8c0f6867dc0766e4ec6f250c5a4a004d9..f9e1bdcd4c752474329d36c4de3378f7d58e7b4b 100644
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
- sort | head -n 1
+ sort -n | head -n 1
}
log=train.log
@@ -36,4 +36,5 @@ paddle train \
--job=test \
--use_gpu=false \
--config_args=is_test=1 \
+ --test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index c3a22b644be0ca08a2af73a57c09657014e49bfc..420768bb2b4ebed7b135a49c5eee5e5538426ae1 100644
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -16,11 +16,14 @@
set -e
paddle train \
--config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
--save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
+ 2>&1 | tee 'train.log'
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 098fbb91389b89c8b69ccf2f5d308e4e715ac950..c8b12a0e89dbddea56b4ee069ebf66f8d8630615 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -17,7 +17,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
- sort | head -n 1
+ sort -n | head -n 1
}
log=train.log
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
index 894070e7c97dcb29e8c0df31437a374be5f5d691..114a9138ebfef054c7d3ba99b4a510a452f8f2cd 100644
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
@@ -29,6 +29,7 @@ settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
+ average_window=0.5,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh
index ea1f8dbcfad35699189f6cd4efc81d97e8c89148..1b3f1d45e11fbd5e600e58f583e503a603e484ff 100755
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
@@ -16,9 +16,7 @@ set -e
set -x
# download the in-house paraphrase dataset
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
# untar the dataset
tar -zxvf paraphrase.tar.gz
diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh
index 2cec30688d27a57902cdf64d7be5712d12c69bdd..d6e7a732644dc188a165215ddf3f69e1514425eb 100755
--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
@@ -16,9 +16,7 @@ set -e
set -x
# download the pretrained model
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
# untar the model
tar -zxvf wmt14_model.tar.gz
diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst
index 399c5da5fffc20dda78b9eefb2629308cabd748e..01d2caefb5cdf4e949511fd0f5bbafe0e604e881 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/algorithm/rnn/rnn.rst
@@ -17,7 +17,7 @@ PaddlePaddle does not need any preprocessing to sequence data, such as padding.
.. code-block:: python
- settings.slots = [
+ settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index b8f26f431eb7a04147fe791a8c805427c827fe09..b932fbc0fa4443d2fd8abfc9d8a78e68c44f667c 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -6,10 +6,10 @@ Installing from Sources
* [3. Build on Ubuntu](#ubuntu)
## Download and Setup
-You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).
+You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
```bash
-git clone https://github.com/baidu/Paddle paddle
+git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle
```
@@ -95,7 +95,7 @@ As a simple example, consider the following:
```bash
# necessary
sudo apt-get update
- sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+ sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
# optional
sudo apt-get install libgoogle-glog-dev
sudo apt-get install libgflags-dev
@@ -149,15 +149,15 @@ If still not found, you can manually set it based on CMake error information fro
As a simple example, consider the following:
-- **Only CPU**
+- **Only CPU with swig**
```bash
- cmake .. -DWITH_GPU=OFF
+ cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
```
-- **GPU**
+- **GPU with swig**
```bash
- cmake .. -DWITH_GPU=ON
+ cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
```
- **GPU with doc and swig**
@@ -170,15 +170,13 @@ Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=
+cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= -DWITH_SWIG_PY=ON
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=/bin:$PATH
```
-**Note:**
-
If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install python dependencies
at first time when user run paddle commands, such as `paddle version`, `paddle train`.
diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md
index 80d816a768a71156ce72cda6ea92b749fbcdbe1f..659485d9be1b6a3e9759a2fd040cb09d1f2a3005 100644
--- a/doc/demo/quick_start/index_en.md
+++ b/doc/demo/quick_start/index_en.md
@@ -477,7 +477,7 @@ The scripts of data downloading, network configurations, and training scrips are
Word embedding |
15MB |
8.484% |
-trainer_config.bow.py |
+trainer_config.emb.py |
diff --git a/doc/demo/semantic_role_labeling/curve.jpg b/doc/demo/semantic_role_labeling/curve.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..baa35ae7f0a0b6c246f3a0d331735477ab8bcd70
Binary files /dev/null and b/doc/demo/semantic_role_labeling/curve.jpg differ
diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
index 890f7314582c65e9add50664006b57aa4e0709eb..e2793b2b3494160a7a80f07ec2127bd1f1a4f2e4 100644
--- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md
+++ b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
@@ -1,183 +1,200 @@
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]:
-
- [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ].
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank.
-
-To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem.
-
-## Data Description
-The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website.
-
-To download and process the original data, user just need to execute the following command:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data `directory as follows.
-```bash
-conll05st-release:the test data set of CoNll-2005 shared task
-test.wsj.words:the Wall Street Journal data sentences
-test.wsj.props: the propositional arguments
-src.dict:the dictionary of words in sentences
-tgt.dict:the labels dictionary
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
-
-Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model.
-
-The following figure shows a temporal expanded 2-layer DB-LSTM network.
-
-![pic](./network_arch.png)
-
-
-### Features
-Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
-
-![pic](./feature.jpg)
-
-
-In this sample, the coresponding labelled sentence is:
-
-[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] .
-
-In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
- settings.word_dict = word_dict
- settings.label_dict = label_dict
- #all inputs are integral and sequential type
- settings.slots = [
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(2),
- integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as following:
-```
-@provider(use_seq=True, init_hook=hook)
-def process(obj, file_name):
- with open(file_name, 'r') as fdata:
- for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip().split('\t')
- words = sentence.split()
- sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
-
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX) ] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX) ] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX) ] * sen_len
-
- marks = mark.split()
- mark_slot = [int(w) for w in marks]
-
- label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
-```
-The `process`function yield 7 lists which are six features and labels.
-
-### Neural Network Config
-`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure.
-
-Seven `data_layer` load instances from data provider. Six features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
-
-### Run Training
-The script for training is `train.sh`, user just need to execute:
-```bash
- ./train.sh
-```
-The content in `train.sh`:
-```
-paddle train \
- --config=./db_lstm.py \
- --save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
- --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
-- \--config=./db_lstm.py : network config file.
-- \--save_di=./output: output path to save models.
-- \--trainer_count=4 : set thread number (or GPU count).
-- \--log_period=10 : print log every 20 batches.
-- \--num_passes=500: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
-- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train.
-- \--show_parameter_stats_period=10: show parameter statistic every 100 batches.
-- \--test_all_data_in_one_period=1: test all data in every testing.
-
-
-After training, the models will be saved in directory `output`.
-
-### Run testing
-The script for testing is `test.sh`, user just need to execute:
-```bash
- ./test.sh
-```
-The main part in `tesh.sh`
-```
-paddle train \
- --config=./db_lstm.py \
- --model_list=$model_list \
- --job=test \
- --config_args=is_test=1 \
-```
-
- - \--config=./db_lstm.py: network config file
- - \--model_list=$model_list.list: model list file
- - \--job=test: indicate the test job
- - \--config_args=is_test=1: flag to indicate test
-
-
-### Run prediction
-The script for prediction is `predict.sh`, user just need to execute:
-```bash
- ./predict.sh
-
-```
-In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
-```
-python predict.py
- -c $config_file
- -w $model_path
- -l $label_file
- -d $dict_file
- -i $input_file
-```
-
-`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
-
-After prediction, the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005.
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+# Semantic Role labeling Tutorial #
+
+Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization and question answering. An example is as follows [1]:
+
+ [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ].
+
+- V: verb
+- A0: acceptor
+- A1: thing accepted
+- A2: accepted-from
+- A3: Attribute
+- AM-MOD: modal
+- AM-NEG: negation
+
+Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank.
+
+To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem.
+
+## Data Description
+The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. According to the data license, the demo adopts the test data set of CoNLL-2005, which is available on the website.
+
+To download and process the original data, user just need to execute the following command:
+
+```bash
+cd data
+./get_data.sh
+```
+Several new files appear in the `data` directory as follows.
+```bash
+conll05st-release:the test data set of CoNLL-2005 shared task
+test.wsj.words:the Wall Street Journal data sentences
+test.wsj.props: the propositional arguments
+feature: the extracted features from data set
+```
+
+## Training
+### DB-LSTM
+Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
+
+Unlike the bidirectional LSTM used in the Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layers. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input and processed in the reversed direction. These two standard LSTM layers compose a pair of LSTMs. Then we stack LSTM layers pair after pair to obtain the deep LSTM model.
+
+The following figure shows a temporal expanded 2-layer DB-LSTM network.
+
+![pic](./network_arch.png)
+
+
+### Features
+Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted. A single predicate word cannot exactly describe the predicate information, especially when the same word appears more than once in a sentence; with the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote that the argument position is located in the predicate context region, and mr = 0 if it is not. These four simple features are all we need for our SRL system. The features of one sample with context size set to 1 are shown below [2]:
+
+![pic](./feature.jpg)
+
+
+In this sample, the corresponding labelled sentence is:
+
+[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] .
+
+In the demo, we adopt the feature template above, which consists of `argument`, `predicate`, `ctx-p (p=-2,-1,0,1,2)` and `mark`, and we use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`.
+
+### Data Provider
+
+`dataprovider.py` is the python file to wrap data. The `hook()` function defines the data slots for the network. The eight features and the label are all IndexSlots.
+```
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
+ settings.word_dict = word_dict
+ settings.label_dict = label_dict
+ #all inputs are integral and sequential type
+ settings.slots = [
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(2),
+ integer_value_sequence(len(label_dict))]
+```
+The corresponding data iterator is as following:
+```
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+ with open(file_name, 'r') as fdata:
+ for line in fdata:
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
+ line.strip().split('\t')
+
+ words = sentence.split()
+ sen_len = len(words)
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
+
+ predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+ marks = mark.split()
+ mark_slot = [int(w) for w in marks]
+
+ label_list = label.split()
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+```
+The `process` function yields 9 lists, which are the 8 features and the label.
+
+### Neural Network Config
+`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure.
+
+Nine `data_layer`s load instances from the data provider. The eight features are transformed into embeddings respectively, and mixed by `mixed_layer`. Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is the cross entropy of labels.
+
+### Run Training
+The script for training is `train.sh`, user just need to execute:
+```bash
+ ./train.sh
+```
+The content in `train.sh`:
+```
+paddle train \
+ --config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
+ --save_dir=./output \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
+ --test_all_data_in_one_period=1 \
+2>&1 | tee 'train.log'
+```
+
+- \--config=./db_lstm.py : network config file.
+- \--use_gpu=0: use CPU to train. Set it to 1 only if you have installed the GPU version of PaddlePaddle and want to train on GPU; note that crf_layer does not support GPU yet.
+- \--log_period=5000: print log every 5000 batches.
+- \--trainer_count=1: set thread number (or GPU count).
+- \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
+- \--save_dir=./output: output path to save models.
+- \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
+- \--average_test_period=10000000: do test on average parameter every average_test_period batches
+- \--init_model_path=./data: parameter initialization path
+- \--load_missing_parameter_strategy=rand: randomly initialize parameters that are missing from the initial model
+- \--test_all_data_in_one_period=1: test all data in one period
+
+
+After training, the models will be saved in directory `output`. Our training curve is as following:
+
+![pic](./curve.jpg)
+
+
+### Run testing
+The script for testing is `test.sh`, user just need to execute:
+```bash
+ ./test.sh
+```
+The main part in `test.sh`
+```
+paddle train \
+ --config=./db_lstm.py \
+ --model_list=$model_list \
+ --job=test \
+ --config_args=is_test=1 \
+```
+
+ - \--config=./db_lstm.py: network config file
+ - \--model_list=$model_list.list: model list file
+ - \--job=test: indicate the test job
+ - \--config_args=is_test=1: flag to indicate test
+ - \--test_all_data_in_one_period=1: test all data in 1 period
+
+
+### Run prediction
+The script for prediction is `predict.sh`, user just need to execute:
+```bash
+ ./predict.sh
+
+```
+In `predict.sh`, the user should provide the network config file, model path, label file, predicate dictionary file, word dictionary file, feature file and output file
+```
+python predict.py \
+ -c $config_file \
+ -w $best_model_path \
+ -l $label_file \
+ -p $predicate_dict_file \
+ -d $dict_file \
+ -i $input_file \
+ -o $output_file
+```
+
+`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
+
+After prediction, the result is saved in `predict.res`.
+
+## Reference
+[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005.
+
+[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/demo/sentiment_analysis/sentiment_analysis.md b/doc/demo/sentiment_analysis/sentiment_analysis.md
index 385f49891dcd840c525f7d1c3aaf7f08a7e4903f..c53952c544de9fa88a6318432e34b0d05b149445 100644
--- a/doc/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/demo/sentiment_analysis/sentiment_analysis.md
@@ -6,7 +6,7 @@ Sentiment analysis is also used to monitor social media based on large amount of
On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products.
-This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the [Internet Movie Database (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
+This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
## Data Preparation
@@ -39,7 +39,7 @@ imdbEr.txt imdb.vocab README test train
* imdbEr.txt: expected rating for each token in imdb.vocab.
* README: data documentation.
-Both train and test set directory contains:
+The files in the train set directory are as follows. The test set directory contains the same files except `unsup` and `urls_unsup.txt`.
```
labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt
@@ -151,6 +151,7 @@ settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
+ average_window=0.5,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
@@ -163,17 +164,18 @@ stacked_lstm_net(dict_dim, class_dim=class_dim,
* **Data Definition**:
* get\_config\_arg(): get arguments setted by `--config_args=xx` in commandline argument.
- * Define TrainData and TestData provider, here using Python interface (PyDataProviderWrapper) of PaddlePaddle to load data. For details, you can refer to the document of PyDataProvider.
+ * Define data provider, here using Python interface to load data. For details, you can refer to the document of PyDataProvider2.
* **Algorithm Configuration**:
- * use sgd algorithm.
- * use adam optimization.
* set batch size of 128.
- * set average sgd window.
* set global learning rate.
+ * use adam optimization.
+ * set average sgd window.
+ * set L2 regularization.
+ * set gradient clipping threshold.
* **Network Configuration**:
- * dict_dim: get dictionary dimension.
- * class_dim: set category number, IMDB has two label, namely positive and negative label.
+ * dict_dim: dictionary dimension.
+ * class_dim: category number, IMDB has two labels, namely positive and negative.
* `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default.
* `bidirectional_lstm_net`: predefined network as shown in Figure 2.
diff --git a/doc/dev/new_layer/new_layer.rst b/doc/dev/new_layer/new_layer.rst
index 2fa00730486dbe1f2c9585872068a77efa09f004..af8b76a3075194ead9be40d2c943238b2cfadecc 100644
--- a/doc/dev/new_layer/new_layer.rst
+++ b/doc/dev/new_layer/new_layer.rst
@@ -60,7 +60,7 @@ Implement C++ Class
The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.
-It needs to derive the base class :code:`paddle::BaseLayer`, and it needs to override the following functions:
+It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions:
- constructor and destructor.
- :code:`init` function. It is used to initialize the parameters and settings.
diff --git a/doc/index.rst b/doc/index.rst
index 668ad75a902bdd14c6198c41380ae93e29cec0d3..76fb7a3ace8057d9cd34e03134c63ef0cd298cae 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -8,3 +8,4 @@ PaddlePaddle Documentation
user_guide.rst
dev/index.rst
algorithm/index.rst
+ optimization/index.rst
diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst
new file mode 100644
index 0000000000000000000000000000000000000000..667bf1364e7cd4c9098caba72a127228d78ca38b
--- /dev/null
+++ b/doc/optimization/gpu_profiling.rst
@@ -0,0 +1,237 @@
+Profiling on PaddlePaddle
+=========================
+
+This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
+
+- What's profiling?
+- Why we need profiling?
+- How to do profiling?
+- Profiler Tools
+- Hands-on Approach
+- Profiling tips
+
+What's profiling?
+=================
+In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time
+complexity of a program, the usage of particular instructions, or the frequency and duration of function calls.
+Most commonly, profiling information serves to aid program optimization.
+
+Briefly, a profiler is used to measure application performance. Program analysis tools are extremely important for
+understanding program behavior. Simple profiling can tell you how long an operation takes. Advanced
+profiling can explain why an operation takes a long time.
+
+Why we need profiling?
+======================
+Since training a deep neural network typically takes a very long time to finish, performance is gradually becoming
+the most important thing in the deep learning field. The first step to improve performance is to understand which parts
+are slow. There is no point in improving the performance of a region which doesn’t take much time!
+
+
+How to do profiling?
+====================
+To achieve maximum performance, there are five steps you can take to reach your goals.
+
+- Profile the code
+- Find the slow parts
+- Work out why they’re slow
+- Make them fast
+- Profile the code again
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs more parallelism to fulfill its potential.
+This is why GPUs can be so fast.
+
+Profiler Tools
+==============
+For general GPU profiling, a bunch of tools are provided by both NVIDIA and third parties.
+
+**nvprof** is the Nvidia profiler and **nvvp** is the (GUI based) Nvidia visual profiler.
+In this tutorial, we will focus on nvprof and nvvp.
+
+:code:`test_GpuProfiler` from the :code:`paddle/math/tests` directory will be used to evaluate
+the above profilers.
+
+.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+ :language: c++
+ :lines: 111-124
+ :linenos:
+
+The above code snippet includes two methods; you can use either of them to profile the regions of interest.
+
+1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+You can find more details about how to use both of them in the next section.
+
+Hands-on Approach
+=================
+
+Built-in Timer
+--------------
+
+To enable the built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of your interest.
+Then, all information can be printed to the console via the :code:`printStatus` or :code:`printAllStatus` function.
+As a simple example, consider the following:
+
+1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
+
+ .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+ :language: c++
+ :lines: 111-124
+ :emphasize-lines: 8-10,13
+ :linenos:
+
+2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
+
+ .. code-block:: bash
+
+ cmake .. -DWITH_TIMER=ON
+ make
+
+3. Execute your code and observe the results (see the emphasize-lines).
+
+ .. code-block:: bash
+ :emphasize-lines: 1,12-15
+
+ > ./paddle/math/tests/test_GpuProfiler
+ I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+ I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+ I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+ [==========] Running 1 test from 1 test case.
+ [----------] Global test environment set-up.
+ [----------] 1 test from Profiler
+ [ RUN ] Profiler.BilinearFwdBwd
+ I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+ gSizeX = 64, imgSizeY = 64"
+ I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+ I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+ I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1
+ I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+ I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+ [ OK ] Profiler.BilinearFwdBwd (136 ms)
+ [----------] 1 test from Profiler (136 ms total)
+
+ [----------] Global test environment tear-down
+ [==========] 1 test from 1 test case ran. (136 ms total)
+ [ PASSED ] 1 test.
+
+nvprof profiler
+---------------
+
+To use the command line profiler **nvprof**, follow these steps:
+
+1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
+
+ .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+ :language: c++
+ :lines: 111-124
+ :emphasize-lines: 6-7
+ :linenos:
+
+2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
+
+ .. code-block:: bash
+
+ cmake .. -DWITH_PROFILER=ON
+ make
+
+3. Use Nvidia profiler **nvprof** to profile the binary.
+
+ .. code-block:: bash
+
+ nvprof ./paddle/math/tests/test_GpuProfiler
+
+Then, you can get the following profiling result:
+
+.. code-block:: bash
+
+ ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+ ==78544== Profiling result:
+ Time(%) Time Calls Avg Min Max Name
+ 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
+ 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw
+ 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw
+ 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH]
+
+ ==78544== API calls:
+ Time(%) Time Calls Avg Min Max Name
+ 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags
+ 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree
+ 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate
+ 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy
+ 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize
+ 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc
+ 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc
+ 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice
+ 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags
+ 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute
+ 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount
+ 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties
+ 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch
+ 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName
+ 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem
+ 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice
+ 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate
+ 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute
+ 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart
+ 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall
+ 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError
+ 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument
+ 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet
+ 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount
+ 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion
+ 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit
+ 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion
+
+
+nvvp profiler
+-------------
+
+For the visual profiler **nvvp**, you can either import the output of :code:`nvprof -o ...` or
+run the application through the GUI.
+
+**Note: nvvp also supports CPU profiling** (Click the box in nvvp to enable profile execution on CPU).
+
+.. image:: nvvp1.png
+ :align: center
+ :scale: 33%
+
+From the perspective of kernel functions, **nvvp** can even illustrate why an operation takes a long time.
+As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp`
+allow us to fully utilize all warps on the GPU.
+
+.. image:: nvvp2.png
+ :align: center
+ :scale: 33%
+
+From the perspective of the application, **nvvp** can give you some suggestions to address performance bottlenecks.
+For instance, the advice on data movement and compute utilization in the figures below can guide you in tuning performance.
+
+.. image:: nvvp3.png
+ :align: center
+ :scale: 33%
+
+.. image:: nvvp4.png
+ :align: center
+ :scale: 33%
+
+Profiling tips
+==============
+
+- The **nvprof** and **nvvp** output is a very good place to start.
+- The timeline is a good place to go next.
+- Only dig deep into a kernel if it’s taking a significant amount of your time.
+- Where possible, try to match profiler output with theory.
+ 1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s.
+ 2) Discrepancies are likely to mean your application isn’t doing what you thought it was.
+- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster!
+
+
+Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance.
+Your mileage may vary!
+
+Reference
+=========
+Jeremy Appleyard, *GPU Profiling for Deep Learning*, 2015
diff --git a/doc/optimization/index.rst b/doc/optimization/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c9e87e0778dfe44fa3d1bb84d0ad340aa6f25d08
--- /dev/null
+++ b/doc/optimization/index.rst
@@ -0,0 +1,7 @@
+Performance Tuning
+==================
+
+.. toctree::
+ :maxdepth: 3
+
+ gpu_profiling.rst
diff --git a/doc/optimization/nvvp1.png b/doc/optimization/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/optimization/nvvp1.png differ
diff --git a/doc/optimization/nvvp2.png b/doc/optimization/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/optimization/nvvp2.png differ
diff --git a/doc/optimization/nvvp3.png b/doc/optimization/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/optimization/nvvp3.png differ
diff --git a/doc/optimization/nvvp4.png b/doc/optimization/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/optimization/nvvp4.png differ
diff --git a/doc/source/api/api.rst b/doc/source/api.rst
similarity index 90%
rename from doc/source/api/api.rst
rename to doc/source/api.rst
index 6fc450202df73f5ca99c2c52f257243aa37c90d4..30396c26b61827847cc5acc29cee1c3c8e7b226e 100644
--- a/doc/source/api/api.rst
+++ b/doc/source/api.rst
@@ -1,5 +1,5 @@
API
-========
+===
.. doxygenfile:: paddle/api/PaddleAPI.h
.. doxygenfile:: paddle/api/Internal.h
diff --git a/doc/source/cuda/cuda/cuda.rst b/doc/source/cuda/cuda/cuda.rst
deleted file mode 100644
index 52f17c2b2e48aec8e6fc8d5a7e4f443ad72d96a6..0000000000000000000000000000000000000000
--- a/doc/source/cuda/cuda/cuda.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Cuda
-=============
-
-Dynamic Link Libs
---------------------------
-
-hl_dso_loader.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
-
-GPU Resources
-----------------
-
-hl_cuda.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
-
-hl_cuda.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.h
-
-CUDA Wrapper
---------------
-
-hl_cuda_cublas.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
-
-
-
-
diff --git a/doc/source/cuda/cuda/index.rst b/doc/source/cuda/cuda/index.rst
deleted file mode 100644
index 5fa38ff0fc8cea2b97262ea5493dea27b322dc1c..0000000000000000000000000000000000000000
--- a/doc/source/cuda/cuda/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-CUDA
-====================
-
-.. toctree::
- :maxdepth: 3
-
- cuda.rst
diff --git a/doc/source/cuda/index.rst b/doc/source/cuda/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b0fed2e7f72c9a9671e56e114edfc88d72504dbe
--- /dev/null
+++ b/doc/source/cuda/index.rst
@@ -0,0 +1,9 @@
+CUDA
+====
+
+.. toctree::
+ :maxdepth: 2
+
+ matrix.rst
+ nn.rst
+ utils.rst
diff --git a/doc/source/cuda/matrix/matrix.rst b/doc/source/cuda/matrix.rst
similarity index 76%
rename from doc/source/cuda/matrix/matrix.rst
rename to doc/source/cuda/matrix.rst
index dd4f06599c5af29a0278617ffd1bd9f6ae6b222e..b7699c83eda15d9003506f5fc57b51d52e7af823 100644
--- a/doc/source/cuda/matrix/matrix.rst
+++ b/doc/source/cuda/matrix.rst
@@ -1,61 +1,59 @@
Matrix
-=======
+======
-Base Matrix
--------------
+Base
+----
hl_matrix.h
-``````````````````
+```````````
.. doxygenfile:: paddle/cuda/include/hl_matrix.h
hl_matrix_base.h
-``````````````````
+````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_base.cuh
hl_matrix_apply.cuh
-``````````````````````
+```````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_apply.cuh
hl_matrix_ops.cuh
-``````````````````````
+`````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_ops.cuh
hl_matrix_type.cuh
-``````````````````````
+``````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_type.cuh
hl_sse_matrix_kernel.cuh
-``````````````````````````
+````````````````````````
.. doxygenfile:: paddle/cuda/include/hl_sse_matrix_kernel.cuh
+Matrix Function
+---------------
+
hl_batch_transpose.h
-``````````````````````````
+````````````````````
.. doxygenfile:: paddle/cuda/include/hl_batch_transpose.h
-Sparse Matrix
---------------
-
-hl_sparse.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.h
-
-hl_sparse.ph
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
-
-Others
----------------
-
hl_aggregate.h
-``````````````````
+``````````````
.. doxygenfile:: paddle/cuda/include/hl_aggregate.h
+hl_top_k.h
+``````````
+.. doxygenfile:: paddle/cuda/include/hl_top_k.h
+
hl_table_apply.h
-``````````````````
+````````````````
.. doxygenfile:: paddle/cuda/include/hl_table_apply.h
-hl_top_k.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_top_k.h
+Sparse Matrix
+-------------
+hl_sparse.h
+```````````
+.. doxygenfile:: paddle/cuda/include/hl_sparse.h
+hl_sparse.ph
+````````````
+.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
diff --git a/doc/source/cuda/matrix/index.rst b/doc/source/cuda/matrix/index.rst
deleted file mode 100644
index 63f95eb46618fd43a1140e4d857ae7e2fc89a6ae..0000000000000000000000000000000000000000
--- a/doc/source/cuda/matrix/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Matrix
-====================
-
-.. toctree::
- :maxdepth: 3
-
- matrix.rst
diff --git a/doc/source/cuda/rnn/rnn.rst b/doc/source/cuda/nn.rst
similarity index 79%
rename from doc/source/cuda/rnn/rnn.rst
rename to doc/source/cuda/nn.rst
index ce8ed96692bcb79eec0e5e6ae52a8bf5f6573418..5577d01e72a5b22847bda40528c46a28cacc1490 100644
--- a/doc/source/cuda/rnn/rnn.rst
+++ b/doc/source/cuda/nn.rst
@@ -1,36 +1,39 @@
-Neural Networks
-==================
+Neural Network
+==============
Base
--------
+----
+
.. doxygenfile:: paddle/cuda/include/hl_gpu.h
-.. doxygenfile:: paddle/cuda/include/hl_cnn.h
.. doxygenfile:: paddle/cuda/include/hl_functions.h
.. doxygenfile:: paddle/cuda/include/hl_avx_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_functions.cuh
-
-Activation Functions
------------------------
.. doxygenfile:: paddle/cuda/include/hl_activation_functions.h
+
+CNN Related APIs
+----------------
+.. doxygenfile:: paddle/cuda/include/hl_cnn.h
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
+
RNN Related APIs
------------------
+----------------
.. doxygenfile:: paddle/cuda/include/hl_recurrent_apply.cuh
.. doxygenfile:: paddle/cuda/include/hl_sequence.h
LSTM Model
-``````````````
+``````````
+
.. doxygenfile:: paddle/cuda/include/hl_lstm.h
.. dpxygenfile:: paddle/cuda/include/hl_cpu_lstm.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_lstm.cuh
.. doxygenfile:: paddle/cuda/include/hl_lstm_ops.cuh
GRU Model
-````````````````
+`````````
+
.. doxygenfile:: paddle/cuda/include/hl_gru_ops.cuh
.. doxygenfile:: paddle/cuda/include/hl_cpu_gru.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_gru.cuh
-
-
diff --git a/doc/source/cuda/rnn/index.rst b/doc/source/cuda/rnn/index.rst
deleted file mode 100644
index 4913e47ba1cbc1c2b93fe3e128626a8e66aedc62..0000000000000000000000000000000000000000
--- a/doc/source/cuda/rnn/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-RNN
-====================
-
-.. toctree::
- :maxdepth: 3
-
- rnn.rst
diff --git a/doc/source/cuda/utils.rst b/doc/source/cuda/utils.rst
new file mode 100644
index 0000000000000000000000000000000000000000..850e8bd1c6670947e2a5f1b6f9b0d5b252117cbf
--- /dev/null
+++ b/doc/source/cuda/utils.rst
@@ -0,0 +1,37 @@
+Utils
+=====
+
+Dynamic Link Libs
+-----------------
+.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
+
+GPU Resources
+-------------
+
+hl_cuda.ph
+``````````
+.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
+
+hl_cuda.h
+`````````
+.. doxygenfile:: paddle/cuda/include/hl_cuda.h
+
+HPPL Base
+---------
+.. doxygenfile:: paddle/cuda/include/hl_base.h
+
+CUBLAS Wrapper
+--------------
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
+
+Timer
+-----
+.. doxygenfile:: paddle/cuda/include/hl_time.h
+
+Thread Resource
+---------------
+.. doxygenfile:: paddle/cuda/include/hl_thread.ph
+
+Device Function
+---------------
+.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
diff --git a/doc/source/cuda/utils/index.rst b/doc/source/cuda/utils/index.rst
deleted file mode 100644
index 7a84cbe27dd21e326add1a0a1774cbaa089e195f..0000000000000000000000000000000000000000
--- a/doc/source/cuda/utils/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Utils
-====================
-
-.. toctree::
- :maxdepth: 3
-
- utils.rst
diff --git a/doc/source/cuda/utils/utils.rst b/doc/source/cuda/utils/utils.rst
deleted file mode 100644
index 1ea3e5404aa5fc792075aa09c7fd7a1986332c79..0000000000000000000000000000000000000000
--- a/doc/source/cuda/utils/utils.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-Utilities
-===========
-
-HPPL Base
-------------
-
-hl_base.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_base.h
-
-Timer
------------
-
-hl_time.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_time.h
-
-Thread Resource
------------
-
-hl_thread.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_thread.ph
diff --git a/doc/source/gserver/activations/index.rst b/doc/source/gserver/activations.rst
similarity index 83%
rename from doc/source/gserver/activations/index.rst
rename to doc/source/gserver/activations.rst
index ccdae41128cd6b4edddda0ac44a825082d7495c9..55b9d3be383c07842d7066280cc0e174788db1fb 100644
--- a/doc/source/gserver/activations/index.rst
+++ b/doc/source/gserver/activations.rst
@@ -1,5 +1,5 @@
Activations
-=============
+===========
.. doxygenclass:: paddle::ActivationFunction
:members:
diff --git a/doc/source/gserver/dataprovider/index.rst b/doc/source/gserver/dataprovider/index.rst
deleted file mode 100644
index 4f6077f1224f90f693515d3414da4d96dc652345..0000000000000000000000000000000000000000
--- a/doc/source/gserver/dataprovider/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Data Providers Documents
-==========================
-
-.. toctree::
- :maxdepth: 3
-
- dataproviders.rst
diff --git a/doc/source/gserver/dataprovider/dataproviders.rst b/doc/source/gserver/dataproviders.rst
similarity index 87%
rename from doc/source/gserver/dataprovider/dataproviders.rst
rename to doc/source/gserver/dataproviders.rst
index e8aa4bc35634a0c6ede192a15b276564f7a2c13e..c30d9d6a36a6fbb664ae001274b6a7b0e721070f 100644
--- a/doc/source/gserver/dataprovider/dataproviders.rst
+++ b/doc/source/gserver/dataproviders.rst
@@ -1,23 +1,27 @@
+==============
Data Providers
-================
+==============
-Base DataProvider
-------------------
+DataProviders
+=============
+
+Base
+----
.. doxygenclass:: paddle::DataProvider
:members:
DataProviderGroup
--------------------
+-----------------
.. doxygenclass:: paddle::DataProviderGroup
:members:
MultiDataProvider
--------------------
+-----------------
.. doxygenclass:: paddle::MultiDataProvider
:members:
PyDataProvider
-===================
+==============
IFieldScanner
-------------
@@ -45,7 +49,7 @@ SparseValueScanner
:members:
SequenceScanner
-------------------
+---------------
.. doxygenclass:: paddle::SparseValueScanner
:members:
@@ -69,8 +73,8 @@ IPyDataProvider
.. doxygenclass:: paddle::PyDataProvider2
:members:
-Proto Data Provider
-===================
+ProtoDataProvider
+=================
ProtoDataProvider
----------------
@@ -78,6 +82,6 @@ ProtoDataProvider
:members:
ProtoSequenceDataProvider
-----------------
+-------------------------
.. doxygenclass:: paddle::ProtoSequenceDataProvider
:members:
diff --git a/doc/source/gserver/evaluators/evaluators.rst b/doc/source/gserver/evaluators.rst
similarity index 96%
rename from doc/source/gserver/evaluators/evaluators.rst
rename to doc/source/gserver/evaluators.rst
index 0c5cc85e7dff31693bdc9d2ee44ef470a0fc5f90..f5361f76cd2b1c9c004221c03ea05b2c1f3a652e 100644
--- a/doc/source/gserver/evaluators/evaluators.rst
+++ b/doc/source/gserver/evaluators.rst
@@ -1,14 +1,15 @@
-Base Evaluator
-==============
+==========
+Evaluators
+==========
+
+Base
+====
-Evaluator
----------
.. doxygenclass:: paddle::Evaluator
:members:
-
-Utils
-=====
+Sum
+===
SumEvaluator
------------
diff --git a/doc/source/gserver/evaluators/index.rst b/doc/source/gserver/evaluators/index.rst
deleted file mode 100644
index 298de3e1a32d36b9102f5ad64cc1b968f418041b..0000000000000000000000000000000000000000
--- a/doc/source/gserver/evaluators/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Evaluators
-==========
-
-.. toctree::
- :maxdepth: 3
-
- evaluators.rst
diff --git a/doc/source/gserver/gradientmachines/gradientmachines.rst b/doc/source/gserver/gradientmachines.rst
similarity index 54%
rename from doc/source/gserver/gradientmachines/gradientmachines.rst
rename to doc/source/gserver/gradientmachines.rst
index 3607664c850cdf4df4e10151b05f15e275adceaf..04c8e91d0316a45ad10b0ed0513d3e8916b7c3d9 100644
--- a/doc/source/gserver/gradientmachines/gradientmachines.rst
+++ b/doc/source/gserver/gradientmachines.rst
@@ -1,18 +1,18 @@
Gradient Machines
-================
+=================
GradientMachine
----------------------
+---------------
.. doxygenclass:: paddle::GradientMachine
:members:
-GradientMachineModel
---------------------
+GradientMachineMode
+-------------------
.. doxygenclass:: paddle::IGradientMachineMode
:members:
MultiGradientMachine
----------------------
+--------------------
.. doxygenclass:: paddle::MultiGradientMachine
:members:
@@ -21,20 +21,7 @@ TrainerThread
.. doxygenclass:: paddle::TrainerThread
:members:
-Recurrent Gradient Machines
----------------------------
+RecurrentGradientMachine
+------------------------
.. doxygenclass:: paddle::RecurrentGradientMachine
:members:
-
-Networks
-========
-
-NeuralNetwork
--------------
-.. doxygenclass:: paddle::NeuralNetwork
- :members:
-
-ParallelNeuralNetwork
----------------------
-.. doxygenclass:: paddle::ParallelNeuralNetwork
- :members:
diff --git a/doc/source/gserver/gradientmachines/index.rst b/doc/source/gserver/gradientmachines/index.rst
deleted file mode 100644
index 997c29a102f53c165c70ff11cd9650b83bcecf44..0000000000000000000000000000000000000000
--- a/doc/source/gserver/gradientmachines/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Gradient Machines Documents
-=============================
-
-.. toctree::
- :maxdepth: 3
-
- gradientmachines.rst
diff --git a/doc/source/gserver/index.rst b/doc/source/gserver/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..223b00b9a9dbf1db40ce702cf0e154e5e53a8644
--- /dev/null
+++ b/doc/source/gserver/index.rst
@@ -0,0 +1,12 @@
+GServer
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ activations.rst
+ dataproviders.rst
+ evaluators.rst
+ gradientmachines.rst
+ layers.rst
+ neworks.rst
diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers.rst
similarity index 95%
rename from doc/source/gserver/layers/layer.rst
rename to doc/source/gserver/layers.rst
index 4b8e149505f0695ad2fa4be967a50d1a0ac48b43..191b2bdff26ed17437370a12036f9dbb174dae15 100644
--- a/doc/source/gserver/layers/layer.rst
+++ b/doc/source/gserver/layers.rst
@@ -1,6 +1,10 @@
-Base
+======
+Layers
======
+Base
+====
+
Layer
-----
.. doxygenclass:: paddle::Layer
@@ -17,7 +21,7 @@ Operator
:members:
Data Layer
-===========
+==========
.. doxygenclass:: paddle::DataLayer
:members:
@@ -58,6 +62,11 @@ CudnnConvLayer
.. doxygenclass:: paddle::CudnnConvLayer
:members:
+ExpandConvBaseLayer
+-------------------
+.. doxygenclass:: paddle::ExpandConvBaseLayer
+ :members:
+
ExpandConvLayer
---------------
.. doxygenclass:: paddle::ExpandConvLayer
@@ -86,6 +95,16 @@ CudnnPoolLayer
.. doxygenclass:: paddle::CudnnPoolLayer
:members:
+SpatialPyramidPoolLayer
+-----------------------
+.. doxygenclass:: paddle::SpatialPyramidPoolLayer
+ :members:
+
+MaxOutLayer
+-----------
+.. doxygenclass:: paddle::MaxOutLayer
+ :members:
+
Norm Layers
===========
@@ -402,6 +421,11 @@ TransLayer
Sampling Layers
===============
+BilinearInterpLayer
+-------------------
+.. doxygenclass:: paddle::BilinearInterpLayer
+ :members:
+
MultinomialSampler
------------------
.. doxygenclass:: paddle::MultinomialSampler
diff --git a/doc/source/gserver/layers/index.rst b/doc/source/gserver/layers/index.rst
deleted file mode 100644
index 559c5436b10a5977ac347611639b32d43f1ed123..0000000000000000000000000000000000000000
--- a/doc/source/gserver/layers/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Layers Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- layer.rst
diff --git a/doc/source/gserver/neworks.rst b/doc/source/gserver/neworks.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73fb60d549cc88f61d2e2d18c9ec31c37cf4fa9a
--- /dev/null
+++ b/doc/source/gserver/neworks.rst
@@ -0,0 +1,12 @@
+Networks
+========
+
+NeuralNetwork
+-------------
+.. doxygenclass:: paddle::NeuralNetwork
+ :members:
+
+ParallelNeuralNetwork
+---------------------
+.. doxygenclass:: paddle::ParallelNeuralNetwork
+ :members:
diff --git a/doc/source/index.md b/doc/source/index.md
deleted file mode 100644
index 55fcdeb3dfcedd8589bf7986682708a957c05746..0000000000000000000000000000000000000000
--- a/doc/source/index.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Source Code Documents
-
-## cuda
-
-- [CUDA](cuda/cuda/index.rst)
-- [Matrix](cuda/matrix/index.rst)
-- [RNN](cuda/rnn/index.rst)
-- [Utils](cuda/utils/index.rst)
-
-## gserver
-
-- [Activations](gserver/activations/index.rst)
-- [Data Providers](gserver/dataprovider/index.rst)
-- [Evaluators](gserver/evaluators/index.rst)
-- [Gradient Machines](gserver/gradientmachines/index.rst)
-- [Layers](gserver/layers/index.rst)
-
-## math
-
-- [Matrix](math/matrix/index.rst)
-- [Utils](math/utils/index.rst)
-
-## parameter
-
-- [Parameter](parameter/parameter/index.rst)
-- [Update](parameter/update/index.rst)
-- [Optimizer](parameter/optimizer/index.rst)
-
-## pserver
-
-- [Client](pserver/client/index.rst)
-- [Network](pserver/network/index.rst)
-- [Server](pserver/server/index.rst)
-
-## trainer
-
-- [Trainer](trainer/trainer.rst)
-
-## api
-
-- [API](api/api.rst)
-
-## utils
-
-- [CustomStackTrace](utils/customStackTrace.rst)
-- [Enumeration wrapper](utils/enum.rst)
-- [Lock](utils/lock.rst)
-- [Queue](utils/queue.rst)
-- [Thread](utils/thread.rst)
diff --git a/doc/source/index.rst b/doc/source/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..36323c888ee65147f59f28160dc26ca29235ba63
--- /dev/null
+++ b/doc/source/index.rst
@@ -0,0 +1,14 @@
+Source Code Documents
+=====================
+
+.. toctree::
+ :maxdepth: 1
+
+ gserver/index.rst
+ trainer.rst
+ parameter/index.rst
+ pserver/index.rst
+ api.rst
+ cuda/index.rst
+ math/index.rst
+ utils/index.rst
diff --git a/doc/source/math/functions.rst b/doc/source/math/functions.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aef12e0f005226c6d40d74d0e858a11585339758
--- /dev/null
+++ b/doc/source/math/functions.rst
@@ -0,0 +1,10 @@
+Functions
+=========
+
+MathFunctions
+-------------
+.. doxygenfile:: paddle/math/MathFunctions.h
+
+SIMDFunctions
+-------------
+.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/index.rst b/doc/source/math/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2ec16f2b4450c870f9590aea4ad4ca7dc415b75d
--- /dev/null
+++ b/doc/source/math/index.rst
@@ -0,0 +1,10 @@
+Math
+====
+
+.. toctree::
+ :maxdepth: 2
+
+ vector.rst
+ matrix.rst
+ functions.rst
+ utils.rst
diff --git a/doc/source/math/matrix.rst b/doc/source/math/matrix.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9bb20f618d229e1baea15e26378bf40d7c6e1783
--- /dev/null
+++ b/doc/source/math/matrix.rst
@@ -0,0 +1,76 @@
+Matrix
+======
+
+Base
+----
+
+BaseMatrix Template
+```````````````````
+.. doxygenclass:: paddle::BaseMatrixT
+ :members:
+
+Matrix
+``````
+.. doxygenclass:: paddle::Matrix
+ :members:
+
+MatrixOffset
+````````````
+.. doxygenclass:: paddle::MatrixOffset
+ :members:
+
+CpuMatrix
+---------
+
+CpuMatrix
+`````````
+.. doxygenclass:: paddle::CpuMatrix
+ :members:
+
+SharedCpuMatrix
+```````````````
+.. doxygenclass:: paddle::SharedCpuMatrix
+ :members:
+
+GpuMatrix
+---------
+.. doxygenclass:: paddle::GpuMatrix
+ :members:
+
+CpuSparseMatrix
+---------------
+
+CpuSparseMatrix
+```````````````
+.. doxygenclass:: paddle::CpuSparseMatrix
+ :members:
+
+SparseRowCpuMatrix
+``````````````````
+.. doxygenclass:: paddle::SparseRowCpuMatrix
+ :members:
+
+SparseAutoGrowRowCpuMatrix
+``````````````````````````
+.. doxygenclass:: paddle::SparseAutoGrowRowCpuMatrix
+ :members:
+
+SparsePrefetchRowCpuMatrix
+``````````````````````````
+.. doxygenclass:: paddle::SparsePrefetchRowCpuMatrix
+ :members:
+
+SparseRowIdsCpuMatrix
+`````````````````````
+.. doxygenclass:: paddle::SparseRowIdsCpuMatrix
+ :members:
+
+CacheRowCpuMatrix
+`````````````````
+.. doxygenclass:: paddle::CacheRowCpuMatrix
+ :members:
+
+GpuSparseMatrix
+---------------
+.. doxygenclass:: paddle::GpuSparseMatrix
+ :members:
diff --git a/doc/source/math/matrix/index.rst b/doc/source/math/matrix/index.rst
deleted file mode 100644
index 68410f2a27b68c87087f2c17de351495ac6a6cd0..0000000000000000000000000000000000000000
--- a/doc/source/math/matrix/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Matrix Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- matrix.rst
diff --git a/doc/source/math/matrix/matrix.rst b/doc/source/math/matrix/matrix.rst
deleted file mode 100644
index b12e3934f4705d4a2b7d3d790873701ddfe27d9f..0000000000000000000000000000000000000000
--- a/doc/source/math/matrix/matrix.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Matrix
-=======
-
-Base
---------
-.. doxygenfile:: paddle/math/BaseMatrix.h
-
-Sparse Matrix
-----------------
-.. doxygenfile:: paddle/math/Matrix.h
-.. doxygenfile:: paddle/math/Vector.h
-.. doxygenfile:: paddle/math/MathUtils.h
-.. doxygenfile:: paddle/math/SparseMatrix.h
-.. doxygenfile:: paddle/math/SparseRowMatrix.h
-.. doxygenfile:: paddle/math/CpuSparseMatrix.h
-
-Others
-----------
-.. doxygenfile:: paddle/math/MathFunctions.h
-.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/utils/utils.rst b/doc/source/math/utils.rst
similarity index 62%
rename from doc/source/math/utils/utils.rst
rename to doc/source/math/utils.rst
index 3df721a47b93bce950185f2d6ffe22d4a801af30..55d9961a390c205563a9ae4fbd87ac4ae90fc314 100644
--- a/doc/source/math/utils/utils.rst
+++ b/doc/source/math/utils.rst
@@ -1,9 +1,18 @@
-Utils
-=======
+Memory Manager
+==============
Memory Handle
---------------
+-------------
.. doxygenfile:: paddle/math/MemoryHandle.h
+
+Allocator
+---------
.. doxygenfile:: paddle/math/Allocator.h
+
+PoolAllocator
+`````````````
.. doxygenfile:: paddle/math/PoolAllocator.h
+
+Storage
+-------
.. doxygenfile:: paddle/math/Storage.h
diff --git a/doc/source/math/utils/index.rst b/doc/source/math/utils/index.rst
deleted file mode 100644
index e5fe335da29b957706ed52662682d11c425e5908..0000000000000000000000000000000000000000
--- a/doc/source/math/utils/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Utils Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- utils.rst
diff --git a/doc/source/math/vector.rst b/doc/source/math/vector.rst
new file mode 100644
index 0000000000000000000000000000000000000000..07f7062abaf4f30b8967b594f4e16ab881f5414f
--- /dev/null
+++ b/doc/source/math/vector.rst
@@ -0,0 +1,37 @@
+Vector
+======
+
+BaseVector
+``````````
+.. doxygenclass:: paddle::BaseVector
+ :members:
+
+Vector Template
+```````````````
+.. doxygenclass:: paddle::VectorT
+ :members:
+
+CpuVector Template
+``````````````````
+.. doxygenclass:: paddle::CpuVectorT
+ :members:
+
+GpuVector Template
+``````````````````
+.. doxygenclass:: paddle::GpuVectorT
+ :members:
+
+ParallelCpuVector Template
+``````````````````````````
+.. doxygenclass:: paddle::ParallelCpuVectorT
+ :members:
+
+ParallelGpuVector Template
+``````````````````````````
+.. doxygenclass:: paddle::ParallelGpuVectorT
+ :members:
+
+CpuGpuVector Template
+`````````````````````
+.. doxygenclass:: paddle::CpuGpuVectorT
+ :members:
diff --git a/doc/source/parameter/index.rst b/doc/source/parameter/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3bf6948dc3478574d8d125d8461235f8827e4e42
--- /dev/null
+++ b/doc/source/parameter/index.rst
@@ -0,0 +1,9 @@
+Parameter
+=========
+
+.. toctree::
+ :maxdepth: 2
+
+ parameter.rst
+ optimizer.rst
+ updater.rst
diff --git a/doc/source/parameter/optimizer.rst b/doc/source/parameter/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b5b8b850b349d547c9e5508d3ebec3d7e00ea310
--- /dev/null
+++ b/doc/source/parameter/optimizer.rst
@@ -0,0 +1,22 @@
+Optimizer
+=========
+
+ParameterOptimizer
+------------------
+.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
+
+Regularizer
+-----------
+.. doxygenfile:: paddle/parameter/Regularizer.h
+
+FirstOrderOptimizer
+-------------------
+.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
+
+AverageOptimizer
+----------------
+.. doxygenfile:: paddle/parameter/AverageOptimizer.h
+
+OptimizerWithRegularizer
+------------------------
+.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/optimizer/index.rst b/doc/source/parameter/optimizer/index.rst
deleted file mode 100644
index 3338af5608a03ee853e3a5f16d2483b810215514..0000000000000000000000000000000000000000
--- a/doc/source/parameter/optimizer/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- optimizer.rst
diff --git a/doc/source/parameter/optimizer/optimizer.rst b/doc/source/parameter/optimizer/optimizer.rst
deleted file mode 100644
index 3d9e49217eb17541c14d8d64715278e62c99d2b4..0000000000000000000000000000000000000000
--- a/doc/source/parameter/optimizer/optimizer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Optimizer
-============
-
-.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
-.. doxygenfile:: paddle/parameter/AverageOptimizer.h
-.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
-.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/parameter/parameter.rst b/doc/source/parameter/parameter.rst
similarity index 66%
rename from doc/source/parameter/parameter/parameter.rst
rename to doc/source/parameter/parameter.rst
index 2b7afdb4093753598d73c686b1dc81b970d199d5..2daa62d4e63b952cd93bba35ee32ce35ce768a0d 100644
--- a/doc/source/parameter/parameter/parameter.rst
+++ b/doc/source/parameter/parameter.rst
@@ -1,16 +1,12 @@
Parameter
-=============
-
-Weight
---------
-.. doxygenfile:: paddle/parameter/Weight.h
-
-Regularizer
-------------
-.. doxygenfile:: paddle/parameter/Regularizer.h
+=========
Parameter
--------------
+---------
.. doxygenfile:: paddle/parameter/Argument.h
.. doxygenfile:: paddle/parameter/Parameter.h
.. doxygenfile:: paddle/parameter/ParallelParameter.h
+
+Weight
+------
+.. doxygenfile:: paddle/parameter/Weight.h
diff --git a/doc/source/parameter/parameter/index.rst b/doc/source/parameter/parameter/index.rst
deleted file mode 100644
index e7ed70ec4c87b3613cd8450f1e7fca1fb974afca..0000000000000000000000000000000000000000
--- a/doc/source/parameter/parameter/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- parameter.rst
diff --git a/doc/source/parameter/update/index.rst b/doc/source/parameter/update/index.rst
deleted file mode 100644
index 1bbd73319396e7b8ea32c78e0fe3569919bacf2d..0000000000000000000000000000000000000000
--- a/doc/source/parameter/update/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- update.rst
diff --git a/doc/source/parameter/update/update.rst b/doc/source/parameter/updater.rst
similarity index 75%
rename from doc/source/parameter/update/update.rst
rename to doc/source/parameter/updater.rst
index c417602f0338dbd84ae2bd2ca4eb09330202a0e8..dfa22e8e7d1d6f0713974835de93194d2cc58e6f 100644
--- a/doc/source/parameter/update/update.rst
+++ b/doc/source/parameter/updater.rst
@@ -1,7 +1,14 @@
-Update
-==========
+Updater
+=======
+Base
+----
.. doxygenfile:: paddle/parameter/ParameterUpdaterBase.h
+
+Hook
+----
.. doxygenfile:: paddle/parameter/ParameterUpdaterHook.h
-.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
+Functions
+---------
+.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
diff --git a/doc/source/pserver/client.rst b/doc/source/pserver/client.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e5bba0706a1d919104b85e23861ba490a2c828db
--- /dev/null
+++ b/doc/source/pserver/client.rst
@@ -0,0 +1,12 @@
+Client
+======
+
+BaseClient
+----------
+.. doxygenclass:: paddle::BaseClient
+ :members:
+
+ParameterClient2
+----------------
+.. doxygenclass:: paddle::ParameterClient2
+ :members:
diff --git a/doc/source/pserver/client/client.rst b/doc/source/pserver/client/client.rst
deleted file mode 100644
index fc7ed90d3dc8beb0baa30d63ccc956fbba2a4e4c..0000000000000000000000000000000000000000
--- a/doc/source/pserver/client/client.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Client
-=========
-
-.. doxygenclass:: paddle::BaseClient
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-.. doxygenclass:: paddle::ParameterClient2
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/pserver/client/index.rst b/doc/source/pserver/client/index.rst
deleted file mode 100644
index dc924c9ca8e7b9965638fd299dc2f5e78591c91b..0000000000000000000000000000000000000000
--- a/doc/source/pserver/client/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Client Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- client.rst
diff --git a/doc/source/pserver/index.rst b/doc/source/pserver/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0031e9476bd063511cc2f0a8c209f35627cf44ba
--- /dev/null
+++ b/doc/source/pserver/index.rst
@@ -0,0 +1,10 @@
+PServer
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ client.rst
+ network.rst
+ server.rst
+ utils.rst
diff --git a/doc/source/pserver/network.rst b/doc/source/pserver/network.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7004c9d91fa9f2af11e15791ef682c108761027e
--- /dev/null
+++ b/doc/source/pserver/network.rst
@@ -0,0 +1,27 @@
+Network
+=======
+
+SocketServer
+------------
+.. doxygenclass:: paddle::SocketServer
+ :members:
+
+SocketWorker
+------------
+.. doxygenclass:: paddle::SocketWorker
+ :members:
+
+SocketClient
+------------
+.. doxygenclass:: paddle::SocketClient
+ :members:
+
+SocketChannel
+-------------
+.. doxygenclass:: paddle::SocketChannel
+ :members:
+
+MessageReader
+-------------
+.. doxygenclass:: paddle::MsgReader
+ :members:
diff --git a/doc/source/pserver/network/index.rst b/doc/source/pserver/network/index.rst
deleted file mode 100644
index 2fdf95e17d339d69de8e027d92cbb385e2bd51ec..0000000000000000000000000000000000000000
--- a/doc/source/pserver/network/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Network Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- network.rst
diff --git a/doc/source/pserver/network/network.rst b/doc/source/pserver/network/network.rst
deleted file mode 100644
index e000ff8dbbdc37e9d638d18d20a8ba53e21dd245..0000000000000000000000000000000000000000
--- a/doc/source/pserver/network/network.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-Network
-==========
-
-Socket Server
-----------------
-.. doxygenclass:: paddle::SocketServer
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Worker
-----------------
-.. doxygenclass:: paddle::SocketWorker
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Client
-----------------
-.. doxygenclass:: paddle::SocketClient
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Channel
----------------
-.. doxygenclass:: paddle::SocketChannel
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Message Reader
----------------
-.. doxygenclass:: paddle::MsgReader
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/pserver/server.rst b/doc/source/pserver/server.rst
new file mode 100644
index 0000000000000000000000000000000000000000..35301acf8ffe3d97e6124c37cf8fe1b43071e14e
--- /dev/null
+++ b/doc/source/pserver/server.rst
@@ -0,0 +1,12 @@
+Server
+======
+
+ProtoServer
+-----------
+.. doxygenclass:: paddle::ProtoServer
+ :members:
+
+ParameterServer2
+----------------
+.. doxygenclass:: paddle::ParameterServer2
+ :members:
diff --git a/doc/source/pserver/server/index.rst b/doc/source/pserver/server/index.rst
deleted file mode 100644
index 09e3530bfeaf56ebbadb1694a69a036813e8970f..0000000000000000000000000000000000000000
--- a/doc/source/pserver/server/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Server Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- server.rst
diff --git a/doc/source/pserver/server/server.rst b/doc/source/pserver/server/server.rst
deleted file mode 100644
index f3110fdd731d246ce4211d05e32ddd98584bdbb7..0000000000000000000000000000000000000000
--- a/doc/source/pserver/server/server.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Server
-==========
-
-.. doxygenclass:: paddle::ProtoServer
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-.. doxygenclass:: paddle::ParameterServer2
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/trainer/trainer.rst b/doc/source/trainer.rst
similarity index 94%
rename from doc/source/trainer/trainer.rst
rename to doc/source/trainer.rst
index 12c24597e7f99cd489204602ae25a89d7b960630..85f1feb4fc941f94e65a6b1d037445d2367f65ec 100644
--- a/doc/source/trainer/trainer.rst
+++ b/doc/source/trainer.rst
@@ -14,7 +14,7 @@ RemoteParameterUpdater
:members:
ConcurrentRemoteParameterUpdater
----------------------------------
+--------------------------------
.. doxygenclass:: paddle::ConcurrentRemoteParameterUpdater
:members:
diff --git a/doc/source/utils/customStackTrace.rst b/doc/source/utils/customStackTrace.rst
index a4e6f05a406f33256548fc0ef32bbbf3daff1536..cdc8930739eb4b4d6308ff1fbce170d2977d42e8 100644
--- a/doc/source/utils/customStackTrace.rst
+++ b/doc/source/utils/customStackTrace.rst
@@ -1,9 +1,4 @@
CustomStackTrace
================
-
-
-class CustomStackTrace
-----------------------
-
.. doxygenclass:: paddle::CustomStackTrace
:members:
diff --git a/doc/source/utils/enum.rst b/doc/source/utils/enum.rst
index 17166d35f7cfa63e51058cc5f86165b1e22bbe1e..e0da75afe164f9dab59b862faa7230fc57423e50 100644
--- a/doc/source/utils/enum.rst
+++ b/doc/source/utils/enum.rst
@@ -1,9 +1,3 @@
-enumeration_wrapper
+Enumeration wrapper
===================
-
-
-namespace paddle::enumeration_wrapper
--------------------------------------
-
.. doxygennamespace:: paddle::enumeration_wrapper
-
diff --git a/doc/source/utils/index.rst b/doc/source/utils/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7ddc47d1726f7627852be922d2b769d0752aa799
--- /dev/null
+++ b/doc/source/utils/index.rst
@@ -0,0 +1,11 @@
+Utils
+=====
+
+.. toctree::
+ :maxdepth: 2
+
+ lock.rst
+ queue.rst
+ thread.rst
+ customStackTrace.rst
+ enum.rst
diff --git a/doc/source/utils/lock.rst b/doc/source/utils/lock.rst
index 0b027e403f49fc1720904cf4b502d81e4148e1e3..f011acb9431f0f3dc3b2ba27fcfe71fe6eb07ae9 100644
--- a/doc/source/utils/lock.rst
+++ b/doc/source/utils/lock.rst
@@ -1,37 +1,32 @@
-Thread
-======
+Lock
+====
-
-class Thread
-------------
-
-.. doxygenclass:: paddle::Thread
+RWLock
+------
+.. doxygenclass:: paddle::RWLock
:members:
-
-class ThreadWorker
-------------------
-
-.. doxygenclass:: paddle::ThreadWorker
+ReadLockGuard
+-------------
+.. doxygenclass:: paddle::ReadLockGuard
:members:
-
-class SyncThreadPool
---------------------
-
-.. doxygenclass:: paddle::SyncThreadPool
+SpinLock
+--------
+.. doxygenclass:: paddle::SpinLock
:members:
-
-
-class MultiThreadWorker
------------------------
-.. doxygenclass:: paddle::MultiThreadWorker
+Semaphore
+---------
+.. doxygenclass:: paddle::Semaphore
:members:
-
-class AsyncThreadPool
----------------------
+ThreadBarrier
+-------------
+.. doxygenclass:: paddle::ThreadBarrier
+ :members:
-.. doxygenclass:: paddle::AsyncThreadPool
+LockedCondition
+---------------
+.. doxygenclass:: paddle::LockedCondition
:members:
diff --git a/doc/source/utils/queue.rst b/doc/source/utils/queue.rst
index 72a464ca67288d0d0e24980d59c3bbc85f111081..98192648e2d61e622c2337d10ba024dd676ee685 100644
--- a/doc/source/utils/queue.rst
+++ b/doc/source/utils/queue.rst
@@ -1,16 +1,12 @@
Queue
=====
-
-class Queue
-------------
-
+Queue
+-----
.. doxygenclass:: paddle::Queue
:members:
-
-class BlockingQueue
--------------------
-
+BlockingQueue
+-------------
.. doxygenclass:: paddle::BlockingQueue
:members:
diff --git a/doc/source/utils/thread.rst b/doc/source/utils/thread.rst
index 2eb67dde6a945cc8e250989f7fc8cefed942950e..23d379a9894e5fc22bc6795a480a53d768e608e6 100644
--- a/doc/source/utils/thread.rst
+++ b/doc/source/utils/thread.rst
@@ -1,40 +1,27 @@
-Lock
-====
+Thread
+======
-
-class RWLock
-------------
-
-.. doxygenclass:: paddle::RWLock
+Thread
+------
+.. doxygenclass:: paddle::Thread
:members:
-class ReadLockGuard
--------------------
-
-.. doxygenclass:: paddle::ReadLockGuard
+ThreadWorker
+------------
+.. doxygenclass:: paddle::ThreadWorker
:members:
-class SpinLock
+SyncThreadPool
--------------
-
-.. doxygenclass:: paddle::SpinLock
+.. doxygenclass:: paddle::SyncThreadPool
:members:
-
-class Semaphore
----------------
-
-.. doxygenclass:: paddle::Semaphore
- :members:
-
-class ThreadBarrier
--------------------
-
-.. doxygenclass:: paddle::ThreadBarrier
+
+MultiThreadWorker
+-----------------
+.. doxygenclass:: paddle::MultiThreadWorker
:members:
-class LockedCondition
----------------------
-
-.. doxygenclass:: paddle::LockedCondition
+AsyncThreadPool
+---------------
+.. doxygenclass:: paddle::AsyncThreadPool
:members:
-
diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.rst
similarity index 50%
rename from doc_cn/algorithm/rnn/hierarchical-layer.md
rename to doc_cn/algorithm/rnn/hierarchical-layer.rst
index 519653df081d6e7919ada3cbff6aaf4d2a2f6115..a9906b8b9c2036ae349f30d7edee770884f73f99 100644
--- a/doc_cn/algorithm/rnn/hierarchical-layer.md
+++ b/doc_cn/algorithm/rnn/hierarchical-layer.rst
@@ -1,6 +1,11 @@
-# 支持双层序列作为输入的Layer
+###########################
+支持双层序列作为输入的Layer
+###########################
-## 概述
+.. contents::
+
+概述
+====
在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。
@@ -12,55 +17,79 @@
+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息
+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列
-
在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。
-## pooling_layer
-
-pooling_layer的使用示例如下,详细见配置API。
-```python
-seq_pool = pooling_layer(input=layer,
- pooling_type=AvgPooling(),
- agg_level=AggregateLevel.EACH_SEQUENCE)
-```
+
+pooling_layer
+==============
+
+pooling_layer 的使用示例如下,详细见 `pooling_layer`_ 配置API。
+
+.. code-block:: python
+
+ seq_pool = pooling_layer(input=layer,
+ pooling_type=AvgPooling(),
+ agg_level=AggregateLevel.EACH_SEQUENCE)
+
- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。
-- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
+
+- `agg_level=AggregateLevel.TIMESTEP` 时(默认值):
+
- 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列
- 输入:一个双层序列,或一个单层序列
- 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值)
-- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
+
+- `agg_level=AggregateLevel.EACH_SEQUENCE` 时:
+
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值)
-## last_seq 和 first_seq
+last_seq 和 first_seq
+=====================
+
+last_seq 的使用示例如下( `first_seq`_ 类似),详细见 `last_seq`_ 配置API。
+
+.. code-block:: python
+
+ last = last_seq(input=layer,
+ agg_level=AggregateLevel.EACH_SEQUENCE)
+
+- `agg_level=AggregateLevel.TIMESTEP` 时(默认值):
-last_seq的使用示例如下(first_seq类似),详细见配置API。
-```python
-last = last_seq(input=layer,
- agg_level=AggregateLevel.EACH_SEQUENCE)
-```
-- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列
- 输入:一个双层序列或一个单层序列
- 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。
-- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
+
+- `agg_level=AggregateLevel.EACH_SEQUENCE` 时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。
-## expand_layer
+expand_layer
+============
+
+expand_layer 的使用示例如下,详细见 `expand_layer`_ 配置API。
+
+.. code-block:: python
+
+ expand = expand_layer(input=layer1,
+ expand_as=layer2,
+ expand_level=ExpandLevel.FROM_TIMESTEP)
+
+- `expand_level=ExpandLevel.FROM_TIMESTEP` 时(默认值):
-expand_layer的使用示例如下,详细见配置API。
-```python
-expand = expand_layer(input=layer1,
- expand_as=layer2,
- expand_level=ExpandLevel.FROM_TIMESTEP)
-```
-- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值):
- 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列
- - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
- - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
-- `expand_level=ExpandLevel.FROM_SEQUENCE`时:
+ - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2 可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
+ - 输出:一个单层序列或一个双层序列,输出序列的类型(双层序列或单层序列)和序列中含有元素的数目同 layer2 一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
+
+- `expand_level=ExpandLevel.FROM_SEQUENCE` 时:
+
- 作用:一个单层序列经过运算扩展成一个双层序列
- - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息
- - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。
+ - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2 必须是一个双层序列,提供扩展的长度信息
+ - 输出:一个双层序列,序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目(0层序列)和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个 subseq 。
+
+
+.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
+.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
+.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
+.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc_cn/build_and_install/cmake/cblas_settings.csv
index d804c0a662cb652dbefb0d09fb18538308c20aec..a6356baf16a0d3d2499e39d2055d8ee878dcaef2 100644
--- a/doc_cn/build_and_install/cmake/cblas_settings.csv
+++ b/doc_cn/build_and_install/cmake/cblas_settings.csv
@@ -1,4 +1,5 @@
-MKL_ROOT,mkl的路径,在${MKL_ROOT}/include下需要包含mkl.h,在${MKL_ROOT}/lib目录下需要包含 mkl_core,mkl_sequential和mkl_intel_lp64三个库
-ATLAS_ROOT,ATLAS库的路径,在${ATLAS_ROOT}/include下需要包含cblas.h,而在${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库
-OPENBLAS_ROOT,在${OPENBLAS_ROOT}/include下需要包含cblas.h,而在${OPENBLAS_ROOT}/lib下需要包含openblas库
-REFERENCE_CBLAS_ROOT,在${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,在${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库
\ No newline at end of file
+编译选项,描述,注意
+MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。
+ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
+OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。
+REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc_cn/build_and_install/cmake/compile_options.csv
index 0b8015aaee4d7b9068cb4a8de5d9967569e37f0c..12b45eebb2822d77447fa1bc754360605971dcab 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc_cn/build_and_install/cmake/compile_options.csv
@@ -1,15 +1,14 @@
-选项,说明,默认值
-WITH_GPU,是否编译GPU支持。,是否寻找到cuda工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否使用运行时动态加载cuda动态库,而非静态加载cuda动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制,是
-WITH_PYTHON,是否内嵌python解释器。可以方便嵌入式工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA支持,否
-WITH_GLOG,是否使用GLOG,如果不使用则会使用一个简化版的日志实现。可以方便嵌入式工作。,取决于是否寻找到GLOG
-WITH_GFLAGS,是否使用GFLAGS,如果不使用则会使用一个简化版的命令行参数解析。可以方便嵌入式工作。,取决于是否寻找到GFLAGS
-WITH_TIMER,是否开启计时功能开启计时功能会导致运行略慢,打印的日志变多。但是方便调试和benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到gtest
-WITH_DOC,是否编译英文文档,否
-WITH_DOC_CN,是否编译中文文档,否
-WITH_SWIG_PY,是否编译python的swig接口,python的swig接口可以方便进行预测和定制化训练,取决于是否找到swig
+选项,说明,默认值
+WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
+WITH_DOUBLE,是否使用双精度浮点数。,否
+WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是
+WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
+WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
+WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
+WITH_RDMA,是否开启RDMA,否
+WITH_GLOG,是否开启GLOG。如果不开启,则会使用一个简化版的日志,同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG
+WITH_GFLAGS,是否使用GFLAGS。如果不开启,则会使用一个简化版的命令行参数解析器,同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS
+WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否
+WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
+WITH_DOC,是否编译中英文文档,否
+WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc_cn/build_and_install/cmake/compile_options.rst
index bb5b18a073803662774cb6b7bcbdbafe3ad51112..f345ead2bf851bdad7be2fb8185d16fd2a318a66 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc_cn/build_and_install/cmake/compile_options.rst
@@ -1,62 +1,43 @@
-设置PaddlePaddle的编译选项
-==========================
-
-PaddlePaddle的编译选项可以在调用cmake的时候设置。cmake是一个跨平台的编译脚本,调用
-cmake可以将cmake项目文件,生成各个平台的makefile。详细的cmake使用方法可以参考
-`cmake的官方文档 `_ 。
-
-PaddlePaddle的编译选项是可以控制PaddlePaddle生成CPU/GPU版本二进制,链接何种blas等等。所有的
-编译选项列表如下
-
-PaddlePaddle的编译选项
-----------------------
-
-bool型的编译选项
-++++++++++++++++
-设置下列编译选项时,可以在cmake的命令行设置。使用 -D命令即可。例如
-:code:`cmake -D WITH_GPU=OFF`
-
-.. csv-table:: PaddlePaddle的bool型编译选项
- :widths: 1, 7, 2
- :file: compile_options.csv
-
-blas相关的编译选项
-++++++++++++++++++
-
-PaddlePaddle可以使用 `MKL `_ ,
-`Atlas `_ ,
-`OpenBlas `_ 和
-`refference Blas `_ ,任意一种cblas实现。
-通过编译时指定路径来实现引用各种blas。
-
-cmake编译时会首先在系统路径(/usr/lib\:/usr/local/lib)中寻找这些blas的实现。同时
-也会读取相关路径变量来进行搜索。路径变量为\:
-
-
-.. csv-table:: PaddlePaddle的cblas编译选项
- :widths: 1, 9
- :header: "编译选项", "描述"
- :file: cblas_settings.csv
-
-这些变量均可以使用 -D命令指定。例如 :code:`cmake -D MKL_ROOT=/opt/mkl/`。这些变
-量也可以通过调用cmake命令前通过环境变量指定。例如
-
-.. code-block:: bash
-
- export MKL_ROOT=/opt/mkl
- cmake
-
-需要注意的是,这些变量只在第一次cmake的时候有效。如果在第一次cmake之后想要重新设
-置这些变量,推荐清理( :code:`rm -rf` )掉编译目录后,再指定。
-
-cuda/cudnn相关的编译选项
-++++++++++++++++++++++++
-
-PaddlePaddle可以使用 cudnn v2之后的任何一个cudnn版本来编译运行。但需要注意的是编译和
-运行使用的cudnn尽量是同一个版本。推荐使用最新版本的cudnn v5.1。
-
-在cmake配置时可以使用 :code:`CUDNN_ROOT` 来配置CUDNN的安装路径。使用的命令也是
--D,例如 :code:`cmake -D CUDNN_ROOT=/opt/cudnnv5` 。
-
-需要注意的是,这些变量只在第一次cmake的时候有效。如果在第一次cmake之后想要重新设
-置这些变量,推荐清理( :code:`rm -rf` )掉编译目录后,再指定。
+PaddlePaddle的编译选项
+======================
+
+PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+Bool型的编译选项
+----------------
+用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool型的编译选项
+ :widths: 1, 7, 2
+ :file: compile_options.csv
+
+BLAS/CUDA/Cudnn的编译选项
+--------------------------
+BLAS
++++++
+
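+PaddlePaddle支持以下任意一种BLAS库:`MKL <https://software.intel.com/en-us/intel-mkl/>`_ ,`ATLAS <http://math-atlas.sourceforge.net/>`_ ,`OpenBLAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。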
+
+.. csv-table:: BLAS路径相关的编译选项
+ :widths: 1, 2, 7
+ :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但请尽量保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
+
+.. code-block:: bash
+
+ cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
+注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。
\ No newline at end of file
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc_cn/build_and_install/install/ubuntu_install.rst
index 70ac5225bd82e40838875b49f67e70ff08eff853..0fb59e25f6932214a3f1c67b12b426e388c3fc5d 100644
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc_cn/build_and_install/install/ubuntu_install.rst
@@ -11,7 +11,7 @@ PaddlePaddle的ubuntu安装包分为四个版本,他们是 cpu、gpu、cpu-noa
.. code-block:: shell
- gdebi paddle-*-cpu.deb
+ gdebi paddle-*-cpu*.deb
如果 :code:`gdebi` 没有安装,则需要使用 :code:`sudo apt-get install gdebi`, 来安装 :code:`gdebi` 。
@@ -20,7 +20,7 @@ PaddlePaddle的ubuntu安装包分为四个版本,他们是 cpu、gpu、cpu-noa
.. code-block:: shell
- dpkg -i paddle-*-cpu.deb
+ dpkg -i paddle-*-cpu*.deb
apt-get install -f
在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的,
diff --git a/doc_cn/build_and_install/paddle_on_kubernetes.md b/doc_cn/build_and_install/paddle_on_kubernetes.md
new file mode 100644
index 0000000000000000000000000000000000000000..f8c9f19a9fef50c03f6ffee639a580adbf29844a
--- /dev/null
+++ b/doc_cn/build_and_install/paddle_on_kubernetes.md
@@ -0,0 +1,205 @@
+# Paddle On Kubernetes:单机训练
+
+在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。
+
+## 制作Docker镜像
+
+在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在
+Paddle的Docker image里。为此,我们需要制作一个包含训练数据的Paddle镜像。
+
+Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html)
+里介绍了用Paddle源码中的脚本下载训练数据的过程。
+而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo,( 请注意,默认的
+Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ),所以我们使用这个镜像来下载训练数据到Docker container中,然后把这个包含了训练数据的container保存为一个新的镜像。
+
+### 运行容器
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### 下载数据
+
+进入容器`/root/paddle/demo/quick_start/data`目录,使用`get_data.sh`下载数据
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======> ] 874,279 64.7KB/s eta 2h 13m
+
+```
+
+### 修改启动脚本
+
+下载完数据后,修改`/root/paddle/demo/quick_start/train.sh`文件,内容如下(增加了一条cd命令)
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+ --config=$cfg \
+ --save_dir=./output \
+ --trainer_count=4 \
+ --log_period=20 \
+ --num_passes=15 \
+ --use_gpu=false \
+ --show_parameter_stats_period=100 \
+ --test_all_data_in_one_period=1 \
+ 2>&1 | tee 'train.log'
+```
+
+### 提交镜像
+
+修改启动脚本后,退出容器,使用`docker commit`命令创建新镜像。
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## 使用 Kubernetes 进行训练
+
+>针对任务运行完成后容器自动退出的场景,Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。
+
+### 编写yaml文件
+
+在训练时,输出结果可能会随着容器的销毁而被删除,需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像,可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job),简单的yaml文件如下:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: quickstart
+spec:
+ parallelism: 1
+ completions: 1
+ template:
+ metadata:
+ name: quickstart
+ spec:
+ volumes:
+ - name: output
+ hostPath:
+ path: /home/work/paddle_output
+ containers:
+ - name: pi
+ image: mypaddle/paddle:quickstart
+ command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"]
+ volumeMounts:
+ - name: output
+ mountPath: /root/paddle/demo/quick_start/output
+ restartPolicy: Never
+```
+
+### 创建Paddle Job
+
+使用上文创建的yaml文件创建Kubernetes Job,命令为:
+
+```
+$ kubectl create -f paddle.yaml
+```
+
+查看job的详细情况:
+
+```
+$ kubectl get job
+NAME DESIRED SUCCESSFUL AGE
+quickstart 1 0 58s
+
+$ kubectl describe job quickstart
+Name: quickstart
+Namespace: default
+Image(s): registry.baidu.com/public/paddle:cpu-demo-latest
+Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism: 1
+Completions: 1
+Start Time: Mon, 31 Oct 2016 11:20:16 +0800
+Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses: 0 Running / 1 Succeeded / 0 Failed
+Volumes:
+ output:
+ Type: HostPath (bare host directory volume)
+ Path: /home/work/paddle_output
+Events:
+ FirstSeen LastSeen Count From SubobjectPath Type Reason Message
+ --------- -------- ----- ---- ------------- -------- ------ -------
+ 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx
+```
+
+### 查看训练结果
+
+根据Job对应的Pod信息,可以查看此Pod运行的宿主机。
+
+```
+kubectl describe pod quickstart-fa0wx
+Name: quickstart-fa0wx
+Namespace: default
+Node: paddle-demo-let02/10.206.202.44
+Start Time: Mon, 31 Oct 2016 11:20:17 +0800
+Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status: Succeeded
+IP: 10.0.0.9
+Controllers: Job/quickstart
+Containers:
+ quickstart:
+ Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+ Image: registry.baidu.com/public/paddle:cpu-demo-latest
+ Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+ Port:
+ Command:
+ bin/bash
+ -c
+ /root/paddle/demo/quick_start/train.sh
+ QoS Tier:
+ cpu: BestEffort
+ memory: BestEffort
+ State: Terminated
+ Reason: Completed
+ Exit Code: 0
+ Started: Mon, 31 Oct 2016 11:20:20 +0800
+ Finished: Mon, 31 Oct 2016 11:21:46 +0800
+ Ready: False
+ Restart Count: 0
+ Environment Variables:
+Conditions:
+ Type Status
+ Ready False
+Volumes:
+ output:
+ Type: HostPath (bare host directory volume)
+ Path: /home/work/paddle_output
+```
+
+我们还可以登录到宿主机上查看训练结果。
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc_cn/cluster/k8s/Dockerfile b/doc_cn/cluster/k8s/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3a73606c61432329b4cc2d2f8daadc5af8735c96
--- /dev/null
+++ b/doc_cn/cluster/k8s/Dockerfile
@@ -0,0 +1,7 @@
+# Image for one PaddlePaddle node of a distributed training job on Kubernetes.
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+# start.sh stages the training files and then runs start_paddle.py.
+COPY start.sh /root/
+COPY start_paddle.py /root/
+# Exec-form arguments reach bash verbatim, so the flag must be exactly "-c"
+# (a leading space would make bash treat it as a script file name).
+CMD ["bash", "-c", "/root/start.sh"]
\ No newline at end of file
diff --git a/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md b/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
new file mode 100644
index 0000000000000000000000000000000000000000..d9ed431ec0566cf90f11ebaeec56560ff69e71fe
--- /dev/null
+++ b/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
@@ -0,0 +1,309 @@
+
+# PaddlePaddle on Kubernetes:分布式训练
+
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。
+
+## Kubernetes 基本概念
+
+[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、 扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。
+
+- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。
+
+- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
+
+- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 是Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods。
+
+- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。
+
+- [*Namespaces*](http://kubernetes.io/docs/user-guide/namespaces/) 命名空间,在Kubernetes中创建的所有资源对象(例如上文的pod,job等)都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同命名空间中的资源名可以重复。命名空间主要用于对资源对象进行逻辑上的分组,便于管理。本文只使用了默认命名空间。
+
+## 整体方案
+
+### 部署Kubernetes集群
+
+首先,我们需要拥有一个Kubernetes集群,在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建,可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/),在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机,并且可以按照官方文档在上面部署Kubernetes。在本文的环境中,Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)(Moose filesystem,一种分布式文件系统)共享目录,我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署,可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前,用户将配置与训练数据切分好放在MFS目录中,训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下:
+
+![paddle on kubernetes结构图](k8s-paddle-arch.png)
+
+上图描述了一个3节点的分布式训练场景,Kubernetes集群的每个node上都挂载了一个MFS目录,这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。
+
+### 使用 Job
+
+我们使用Kubernetes中的job这个概念来代表一次分布式训练。Job表示一次性作业,在作业完成后,Kubernetes会销毁job产生的容器并且释放相关资源。
+
+在Kubernetes中,可以通过编写一个YAML文件,来描述这个job,在这个文件中,主要包含了一些配置信息,例如PaddlePaddle的节点个数,`paddle pserver`开放的端口个数与端口号,使用的网卡设备等,这些信息通过环境变量的形式传递给容器内的程序使用。
+
+在一次分布式训练中,用户确定好本次训练需要的PaddlePaddle节点个数,将切分好的训练数据与配置文件上传到MFS共享目录中。然后编写这次训练的job YAML文件,提交给Kubernetes集群创建并开始作业。
+
+### 创建PaddlePaddle节点
+
+当Kubernetes master收到请求,解析完YAML文件后,会创建出多个pod(个数为PaddlePaddle节点数),Kubernetes会把这些pod调度到集群的node上运行。一个pod就代表一个PaddlePaddle节点,当pod被成功分配到一台物理/虚拟机上后,Kubernetes会启动pod内的容器,这个容器会根据YAML文件中的环境变量,启动`paddle pserver`与`paddle train`进程。
+
+### 启动训练
+
+在容器启动后,会通过脚本来启动这次分布式训练,我们知道`paddle train`进程启动时需要知道其他节点的IP地址以及本节点的trainer_id,由于PaddlePaddle本身不提供类似服务发现的功能,所以在本文的启动脚本中,每个节点会根据job name向Kubernetes apiserver查询这个job对应的所有pod信息(Kubernetes默认会在每个容器的环境变量中写入apiserver的地址)。
+
+根据这些pod信息,就可以通过某种方式,为每个pod分配一个唯一的trainer_id。本文把所有pod的IP地址进行排序,将顺序作为每个PaddlePaddle节点的trainer_id。启动脚本的工作流程大致如下:
+
+ 1. 查询Kubernetes apiserver获取pod信息,根据IP分配trainer_id
+ 1. 从MFS共享目录中拷贝训练文件到容器内
+ 1. 根据环境变量,解析出`paddle pserver`与`paddle train`的启动参数,启动进程
+ 1. 训练时,PaddlePaddle会自动将结果保存在trainer_id为0的节点上,将输出路径设置为MFS目录,保存输出的文件
+
+
+## 搭建过程
+
+根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,主要分为以下几个步骤:
+
+1. 制作PaddlePaddle镜像
+1. 将训练文件与切分好的数据上传到共享存储
+1. 编写本次训练的YAML文件,创建一个Kubernetes job
+1. 训练结束后查看输出结果
+
+下面就根据这几个步骤分别介绍。
+
+
+### 制作镜像
+
+PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境,用这个镜像创建的容器需要有以下两个功能:
+
+- 拷贝训练文件到容器内
+
+- 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练
+
+因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。镜像的*Dockerfile*如下:
+
+```Dockerfile
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash", "-c", "/root/start.sh"]
+```
+
+[`start.sh`](start.sh)文件拷贝训练文件到容器内,然后执行[`start_paddle.py`](start_paddle.py)脚本启动训练,前文提到的获取其他节点IP地址,分配`trainer_id`等都在`start_paddle.py`脚本中完成。
+
+`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+ description='simple tool for k8s')
+ args, train_args_list = parser.parse_known_args()
+ train_args = refine_unknown_args(train_args_list)
+ train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+ podlist = getPodList()
+```
+
+然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(即pod内的容器都已启动运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。
+
+```python
+ podlist = getPodList()
+ # need to wait until all pods are running
+ while not isPodAllRunning(podlist):
+ time.sleep(10)
+ podlist = getPodList()
+ idMap = getIdMap(podlist)
+```
+
+在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。
+
+```python
+def getIdMap(podlist):
+ '''
+ generate trainer_id by ip
+ '''
+ ips = []
+ for pod in podlist["items"]:
+ ips.append(pod["status"]["podIP"])
+ ips.sort()
+ idMap = {}
+ for i in range(len(ips)):
+ idMap[ips[i]] = i
+ return idMap
+```
+
+在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+ program = 'paddle train'
+ args = " --nics=" + PADDLE_NIC
+ args += " --port=" + str(PADDLE_PORT)
+ args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+ args += " --comment=" + "paddle_process_by_paddle"
+ ip_string = ""
+ for ip in idMap.keys():
+ ip_string += (ip + ",")
+ ip_string = ip_string.rstrip(",")
+ args += " --pservers=" + ip_string
+ args_ext = ""
+ for key, value in train_args_dict.items():
+ args_ext += (' --' + key + '=' + value)
+ localIP = socket.gethostbyname(socket.gethostname())
+ trainerId = idMap[localIP]
+ args += " " + args_ext + " --trainer_id=" + \
+ str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
+
+使用 `docker build` 构建镜像:
+
+```bash
+docker build -t your_repo/paddle:mypaddle .
+```
+
+然后将构建成功的镜像上传到镜像仓库。
+
+```bash
+docker push your_repo/paddle:mypaddle
+```
+
+### 上传训练文件
+
+本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容,我们将训练文件与数据放在一个job name命名的目录中,上传到MFS共享存储。完成后MFS上的文件内容大致如下:
+
+```bash
+[root@paddle-kubernetes-node0 mfs]# tree -d
+.
+└── paddle-cluster-job
+ ├── data
+ │ ├── 0
+ │ │
+ │ ├── 1
+ │ │
+ │ └── 2
+ ├── output
+ └── recommendation
+```
+
+目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,在paddle-cluster-job/data目录中存放切分好的数据,文件夹0,1,2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件,output文件夹存放训练结果与日志。
+
+### 创建Job
+
+Kubernetes可以通过YAML文件来创建相关对象,然后可以使用命令行工具创建job。
+
+Job YAML文件描述了这次训练使用的Docker镜像,需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数,也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义,可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如,本次训练的YAML文件可以写成:
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: paddle-cluster-job
+spec:
+ parallelism: 3
+ completions: 3
+ template:
+ metadata:
+ name: paddle-cluster-job
+ spec:
+ volumes:
+ - name: jobpath
+ hostPath:
+ path: /home/work/mfs
+ containers:
+ - name: trainer
+ image: your_repo/paddle:mypaddle
+ command: ["bin/bash", "-c", "/root/start.sh"]
+ env:
+ - name: JOB_NAME
+ value: paddle-cluster-job
+ - name: JOB_PATH
+ value: /home/jobpath
+ - name: JOB_NAMESPACE
+ value: default
+ - name: TRAIN_CONFIG_DIR
+ value: recommendation
+ - name: CONF_PADDLE_NIC
+ value: eth0
+ - name: CONF_PADDLE_PORT
+ value: "7164"
+ - name: CONF_PADDLE_PORTS_NUM
+ value: "2"
+ - name: CONF_PADDLE_PORTS_NUM_SPARSE
+ value: "2"
+ - name: CONF_PADDLE_GRADIENT_NUM
+ value: "3"
+ volumeMounts:
+ - name: jobpath
+ mountPath: /home/jobpath
+ restartPolicy: Never
+```
+
+文件中,`metadata`下的`name`表示这个job的名字。`parallelism`和`completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后声明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。
+
+`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+
+`JOB_PATH`表示共享存储挂载的路径,`JOB_NAME`表示job名字,`TRAIN_CONFIG_DIR`表示本次训练文件所在目录,这三个变量组合就可以找到本次训练需要的文件路径。
+
+`CONF_PADDLE_NIC`表示`paddle pserver`进程需要的`--nics`参数,即网卡名。
+
+`CONF_PADDLE_PORT`表示`paddle pserver`的`--port`参数,`CONF_PADDLE_PORTS_NUM`则表示稠密更新的端口数量,也就是`--ports_num`参数。
+
+`CONF_PADDLE_PORTS_NUM_SPARSE`表示稀疏更新的端口数量,也就是`--ports_num_for_sparse`参数。
+
+`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量,即`--num_gradient_servers`参数。
+
+编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。
+
+```bash
+kubectl create -f job.yaml
+```
+
+创建成功后,Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像,启动容器开始训练。
+
+
+### 查看输出
+
+在训练过程中,可以在共享存储上查看输出的日志和模型,例如output目录下就存放了输出结果。注意node_0,node_1,node_2这几个目录表示PaddlePaddle节点与trainer_id,并不是Kubernetes中的node概念。
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│ ├── server.log
+│ └── train.log
+├── node_1
+│ ├── server.log
+│ └── train.log
+├── node_2
+......
+├── pass-00002
+│ ├── done
+│ ├── ___embedding_0__.w0
+│ ├── ___embedding_1__.w0
+......
+```
+
+我们可以通过日志查看容器训练的情况,例如:
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121 50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+ --nics=eth0 --port=7164
+ --ports_num=2 --comment=paddle_process_by_paddle
+ --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+ --ports_num_for_sparse=2 --config=./trainer_config.py
+ --trainer_count=4 --num_passes=10 --use_gpu=0
+ --log_period=50 --dot_period=10 --saving_period=1
+ --local=0 --trainer_id=0
+ --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
\ No newline at end of file
diff --git a/doc_cn/cluster/k8s/job.yaml b/doc_cn/cluster/k8s/job.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1e0ac464b2ec71e98c28f090124690b01b0755ce
--- /dev/null
+++ b/doc_cn/cluster/k8s/job.yaml
@@ -0,0 +1,43 @@
+# Kubernetes Job that runs a three-node distributed PaddlePaddle training.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  # Three pods run at the same time; the job is done once three have succeeded.
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      # Host directory exposed to the container as the job's shared path.
+      - name: jobpath
+        hostPath:
+          path: /home/work/paddle_output
+      containers:
+      - name: trainer
+        image: registry.baidu.com/public/paddle:mypaddle
+        # Absolute interpreter path: a relative "bin/bash" only resolves when
+        # the container's working directory happens to be "/".
+        command: ["/bin/bash", "-c", "/root/start.sh"]
+        # Read by start.sh and start_paddle.py inside the container.
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+
\ No newline at end of file
diff --git a/doc_cn/cluster/k8s/k8s-paddle-arch.png b/doc_cn/cluster/k8s/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..a8c64550b1fa7f41de1eaa9a037c65cddc0cd30e
Binary files /dev/null and b/doc_cn/cluster/k8s/k8s-paddle-arch.png differ
diff --git a/doc_cn/cluster/k8s/start.sh b/doc_cn/cluster/k8s/start.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3a1334174a20b018d35de3b01b149fc5b10d49d
--- /dev/null
+++ b/doc_cn/cluster/k8s/start.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# Stage the training configuration from the shared job directory into /root
+# and launch this node through start_paddle.py.
+#
+# Required environment: JOB_PATH, JOB_NAME, TRAIN_CONFIG_DIR and
+# CONF_PADDLE_PORTS_NUM_SPARSE ("set -u" aborts when one of them is unset).
+set -eu
+
+jobconfig="${JOB_PATH}/${JOB_NAME}/${TRAIN_CONFIG_DIR}"
+cd /root
+cp -rf "$jobconfig" .
+cd "$TRAIN_CONFIG_DIR"
+
+
+# start_paddle.py declares no options of its own; it forwards these to
+# `paddle train`. The sparse port count must come from the *_SPARSE variable
+# so that it matches what start_paddle.py passes to `paddle pserver`.
+python /root/start_paddle.py \
+  --dot_period=10 \
+  --ports_num_for_sparse="$CONF_PADDLE_PORTS_NUM_SPARSE" \
+  --log_period=50 \
+  --num_passes=10 \
+  --trainer_count=4 \
+  --saving_period=1 \
+  --local=0 \
+  --config=./trainer_config.py \
+  --use_gpu=0
diff --git a/doc_cn/cluster/k8s/start_paddle.py b/doc_cn/cluster/k8s/start_paddle.py
new file mode 100755
index 0000000000000000000000000000000000000000..bc0112a77fb84db8965a09716006377c127ad4db
--- /dev/null
+++ b/doc_cn/cluster/k8s/start_paddle.py
@@ -0,0 +1,159 @@
+#!/usr/bin/python
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import time
+import socket
+import os
+import argparse
+
+
+# configuration for cluster
+# Everything below is read from environment variables when the module loads.
+# URL path prefix used by getPodList() when querying the API server.
+API = "/api/v1/namespaces/"
+# Query-string fragment selecting pods by their "job-name" label.
+JOBSELECTOR = "labelSelector=job-name="
+# Per-job directory $JOB_PATH/$JOB_NAME. os.getenv returns None for an unset
+# variable, so this concatenation raises TypeError if either one is missing.
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+# Training data; startPaddle() copies the sub-directory named after trainer_id.
+JOB_PATH_DATA = JOB_PATH + "/data"
+# Passed to `paddle train` as --save_dir; also holds the node_<id> log dirs.
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+# Passed as --nics to both pserver and trainer.
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+# Passed as --port to both pserver and trainer.
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+# Passed as --ports_num to both pserver and trainer.
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+# Passed as --ports_num_for_sparse to the pserver.
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+# Passed as --num_gradient_servers to the pserver.
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+
+
+def refine_unknown_args(cmd_args):
+    '''
+    Flatten unrecognised command-line tokens into a plain list of strings.
+
+    "--key=value" becomes the two items "key" and "value". Only the first
+    "=" separates them, so a value that itself contains "=" or spaces stays
+    in one piece. "--flag" becomes "flag". Tokens that do not start with
+    "--" are passed through unchanged.
+
+    :param cmd_args: list of argument strings, e.g. the second element
+        returned by ArgumentParser.parse_known_args().
+    :return: a new list of strings; cmd_args itself is not modified.
+    '''
+    new_args = []
+    for arg in cmd_args:
+        if not arg.startswith("--"):
+            new_args.append(arg)
+            continue
+        # partition() cuts at the first "=" only and never at spaces, which
+        # keeps the key/value pairing done by the caller aligned.
+        name, sep, value = arg.partition("=")
+        new_args.append(name.lstrip("-"))
+        if sep:
+            new_args.append(value)
+    return new_args
+
+
+def isPodAllRunning(podlist):
+    '''
+    Tell whether every pod of a pod-list response is in the "Running" phase.
+
+    :param podlist: dict with an "items" list; each item provides
+        item["status"]["phase"].
+    :return: True only when the list is non-empty and all pods are running.
+        An empty list yields False so that the caller keeps waiting instead
+        of going on with an empty trainer-id map.
+    '''
+    pods = podlist["items"]
+    if not pods:
+        return False
+    return all(pod["status"]["phase"] == "Running" for pod in pods)
+
+
+def getPodList():
+    '''
+    Fetch the pods carrying this job's "job-name" label from the API server.
+
+    The server address is built from the KUBERNETES_SERVICE_HOST and
+    KUBERNETES_SERVICE_PORT_HTTPS environment variables. The request is
+    sent with TLS certificate verification turned off (verify=False).
+
+    :return: the decoded JSON body of the response.
+    '''
+    host = os.getenv("KUBERNETES_SERVICE_HOST")
+    port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS")
+    url = "https://" + host + ":" + port + \
+        API + NAMESPACE + "/pods?" + JOBSELECTOR + JOBNAME
+    response = requests.get(url, verify=False)
+    return response.json()
+
+
+def getIdMap(podlist):
+    '''
+    Derive a trainer_id for every pod from its IP address.
+
+    The pod IPs are sorted as plain strings and each IP is mapped to its
+    position in that ordering.
+
+    :param podlist: dict with an "items" list; each item provides
+        item["status"]["podIP"].
+    :return: dict mapping IP string -> trainer_id (int, starting at 0).
+    '''
+    ordered_ips = sorted(pod["status"]["podIP"] for pod in podlist["items"])
+    return dict((ip, rank) for rank, ip in enumerate(ordered_ips))
+
+
+def startPaddle(idMap={}, train_args_dict=None):
+    '''
+    Start the local `paddle pserver` in the background, then run
+    `paddle train` and wait for it to finish.
+
+    :param idMap: dict mapping pod IP -> trainer_id (see getIdMap). It must
+        contain the IP this host's name resolves to, otherwise KeyError is
+        raised.
+    :param train_args_dict: extra options for `paddle train` as a dict of
+        strings (name -> value); None means no extra options.
+
+    Side effects: creates JOB_PATH_OUTPUT/node_<trainer_id> for the logs,
+    copies JOB_PATH_DATA/<trainer_id> to ./data, and runs shell commands
+    through os.system (their exit status is not checked).
+    '''
+    # Function-scope import: only the EEXIST check below needs it.
+    import errno
+
+    if train_args_dict is None:
+        train_args_dict = {}
+
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+
+    # List the pservers in trainer_id order so that the list is the same on
+    # every node instead of following arbitrary dict iteration order.
+    ip_string = ",".join(sorted(idMap, key=idMap.get))
+
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+
+    # makedirs also creates JOB_PATH_OUTPUT. An already existing directory is
+    # fine: another node may have created the parent first on the shared
+    # volume, or an earlier run of the same job may have left it behind.
+    logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
+    try:
+        os.makedirs(logDir)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
+        "/" + str(trainerId) + " ./data"
+    os.system(copyCommand)
+    startPserver = 'nohup paddle pserver' + \
+        " --port=" + str(PADDLE_PORT) + \
+        " --ports_num=" + str(PADDLE_PORTS_NUM) + \
+        " --ports_num_for_sparse=" + str(PADDLE_PORTS_NUM_SPARSE) + \
+        " --nics=" + PADDLE_NIC + \
+        " --comment=" + "paddle_process_by_paddle" + \
+        " --num_gradient_servers=" + str(PADDLE_SERVER_NUM) +\
+        " > " + logDir + "/server.log 2>&1 &"
+    print startPserver
+    os.system(startPserver)
+    # wait until pservers completely start
+    time.sleep(10)
+    startTrainer = program + args + " > " + \
+        logDir + "/train.log 2>&1 < /dev/null"
+    print startTrainer
+    os.system(startTrainer)
+
+
+if __name__ == '__main__':
+    # The parser declares no options of its own, so the training options
+    # given on the command line come back in the "unknown" list.
+    cli = argparse.ArgumentParser(prog="start_paddle.py",
+                                  description='simple tool for k8s')
+    _, unknown = cli.parse_known_args()
+    flat = refine_unknown_args(unknown)
+    # Pair the flattened list up as name, value, name, value, ...
+    train_args_dict = dict(zip(flat[:-1:2], flat[1::2]))
+    # need to wait until all pods are running
+    while True:
+        podlist = getPodList()
+        if isPodAllRunning(podlist):
+            break
+        time.sleep(10)
+    startPaddle(getIdMap(podlist), train_args_dict)
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc_cn/howto/how_to_write_docs/index.rst
index 869ef747f9f88c7dbb5efdf6e03111a3f76c4014..a1f983b3405fa40f436885e40fca2ebbb4695491 100644
--- a/doc_cn/howto/how_to_write_docs/index.rst
+++ b/doc_cn/howto/how_to_write_docs/index.rst
@@ -2,32 +2,19 @@
如何贡献/修改PaddlePaddle的文档
###############################
-PaddlePaddle的文档使用 `cmake`_ 驱动 `sphinx`_ 生成。公有两个文档,:code:`doc` 和 :code:`doc_cn` 。这两者会在 `cmake`_ 中进行编译,生成后的文档会存储在服务器的 :code:`doc` 和 :code:`doc_cn` 两个目录下。
+PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-下面分几个部分介绍一下PaddlePaddle文档的贡献方法。
-
-如何书写PaddlePaddle的文档
-==========================
-
-TBD
如何构建PaddlePaddle的文档
==========================
-构建PaddlePaddle文档,需要使用构建Paddle的全部环境。准备这个环境相对来说比较复杂,所以本文档提供两种方式构建PaddlePaddle的文档,即
-
-* 使用Docker构建PaddlePaddle的文档
-* 直接构建PaddlePaddle的文档。
-
-并且,我们推荐使用Docker来构建PaddlePaddle的文档。
+PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐使用基于Docker的方式来构建PaddlePaddle的文档。
使用Docker构建PaddlePaddle的文档
--------------------------------
-使用Docker构建PaddlePaddle的文档,首先要求在系统里安装好Docker工具包。安装Docker请参考 `Docker的官网 `_ 。
-
-安装好Docker之后可以使用源码目录下的脚本构建文档,即
+使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即
.. code-block:: bash
@@ -35,10 +22,10 @@ TBD
cd paddle/scripts/tools/build_docs
bash build_docs.sh
-执行完这个脚本后,该目录下会生成两个目录,分别是\:
+编译完成后,该目录下会生成如下两个子目录\:
-* doc 目录,英文文档地址
-* doc_cn 目录,中文文档地址
+* doc 英文文档目录
+* doc_cn 中文文档目录
打开浏览器访问对应目录下的index.html即可访问本地文档。
@@ -52,6 +39,10 @@ TBD
TBD
+如何书写PaddlePaddle的文档
+==========================
+
+TBD
如何更新www.paddlepaddle.org文档
================================
diff --git a/doc_cn/ui/cmd/index.rst b/doc_cn/ui/cmd/index.rst
index 6d62180a6a5e3f2490cccd2a90213050aa3c172e..f975d432c07f8f0cdc725af2b29c25b7bd6a0657 100644
--- a/doc_cn/ui/cmd/index.rst
+++ b/doc_cn/ui/cmd/index.rst
@@ -1,17 +1,22 @@
-命令行参数
-==========
+PaddlePaddle的命令行参数
+========================
-安装好的PaddlePaddle脚本包括多条命令,他们是
+安装好PaddlePaddle后,在命令行直接敲击 ``paddle`` 或 ``paddle --help`` 会显示如下一些命令行参数。
-* paddle train即为PaddlePaddle的训练进程。可以使用paddle train完成单机多显卡多线程的训
- 练。也可以和paddle pserver组合使用,完成多机训练。
-* paddle pserver为PaddlePaddle的parameter server进程。负责多机训练中的参数聚合工作。
-* paddle version可以打印出PaddlePaddle的版本和编译时信息。
-* merge_model 可以将PaddlePaddle的模型和配置打包成一个文件。方便部署分发。
-* dump_config 可以将PaddlePaddle的训练模型以proto string的格式打印出来
-* make_diagram 可以使用graphviz对PaddlePaddle的网络模型进行绘制,方便调试使用。
+* ``train`` Start a paddle_trainer
+ 启动一个PaddlePaddle训练进程。 ``paddle train`` 可以通过命令行参数 ``-local=true`` 启动一个单机的训练进程;也可以和 ``paddle pserver`` 一起使用启动多机的分布式训练进程。
+* ``pserver`` Start a paddle_pserver_main
+ 在多机分布式训练下启动PaddlePaddle的parameter server进程。
+* ``version`` Print paddle version
+ 用于打印当前PaddlePaddle的版本和编译选项相关信息。
+* ``merge_model`` Start a paddle_merge_model
+ 用于将PaddlePaddle的模型参数文件和模型配置文件打包成一个文件,方便做部署分发。
+* ``dump_config`` Dump the trainer config as proto string
+ 用于将PaddlePaddle的模型配置文件以proto string的格式打印出来。
+* ``make_diagram``
+ 使用graphviz对PaddlePaddle的模型配置文件进行绘制。
-更详细的介绍请参考各个命令的命令行参数文档。
+更详细的介绍请参考各命令行参数文档。
.. toctree::
:glob:
diff --git a/doc_cn/ui/cmd/paddle_pserver.rst b/doc_cn/ui/cmd/paddle_pserver.rst
index 891975c34af5c34dddc754b79bd3e1adda9d9671..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/doc_cn/ui/cmd/paddle_pserver.rst
+++ b/doc_cn/ui/cmd/paddle_pserver.rst
@@ -1,2 +0,0 @@
-paddle pserver的命令行参数
-==========================
diff --git a/doc_cn/ui/cmd/paddle_train.rst b/doc_cn/ui/cmd/paddle_train.rst
index 87b84f5cbdbbe016d9bcdbda2cb30d93d2ad8022..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/doc_cn/ui/cmd/paddle_train.rst
+++ b/doc_cn/ui/cmd/paddle_train.rst
@@ -1,2 +0,0 @@
-paddle train的命令行参数
-========================
diff --git a/doc_cn/ui/cmd/paddle_version.rst b/doc_cn/ui/cmd/paddle_version.rst
index 0a4f8dd472a6009ef6832df75be043c24bb32ba0..537c23df75ea8eee5d17cc3f05bf17ed1bdfcb73 100644
--- a/doc_cn/ui/cmd/paddle_version.rst
+++ b/doc_cn/ui/cmd/paddle_version.rst
@@ -1,9 +1,7 @@
paddle version的命令行参数
==========================
-paddle version可以打印出paddle的版本信息和编译的选项。常见的输出格式为
+paddle version用于打印当前的版本信息和相关编译选项。常见的输出格式如下。第一行说明了PaddlePaddle的版本信息,后面跟着一些主要的编译选项。编译选项的具体意义可以参考
+`编译参数选项文件 <../../build_and_install/cmake/compile_options.html>`_
.. literalinclude:: paddle_version.txt
-
-其第一行说明了paddle的版本,后面跟着一系列编译参数。这里可以参考paddle的
-`编译参数选项文件 <../../build/cmake/compile_options.html>`_
diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst
index 5aba272c627204110a56337f0f120f3f2cd37ae9..8079bd9180cf02db944535829baca2dbaa1c4205 100644
--- a/doc_cn/ui/index.rst
+++ b/doc_cn/ui/index.rst
@@ -1,8 +1,9 @@
+########
用户接口
-========
+########
数据提供
-''''''''
+========
.. toctree::
:maxdepth: 1
@@ -11,14 +12,19 @@
命令行参数
-''''''''''
-* `Use Case <../../doc/ui/cmd_argument/use_case.html>`_
-* `Argument Outline <../../doc/ui/cmd_argument/argument_outline.html>`_
-* `Detail Description <../../doc/ui/cmd_argument/detail_introduction.html>`_
+==========
+
+.. toctree::
+
+ cmd/index.rst
+
+* `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_
+* `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_
+* `参数用例 <../../doc/ui/cmd_argument/use_case.html>`_
预测
-''''
+====
.. toctree::
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 6f51d551200696ebafade2a46243b78086975265..b539374cd4aa5a9510cdb728c1b22edf65a9f880 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
}
void Arguments::setSlotSubSequenceStartPositions(
- size_t idx, IVector *vec) throw(RangeError) {
+ size_t idx, IVector* vec) throw(RangeError) {
auto& a = m->getArg(idx);
auto& v = m->cast(vec->getSharedPtr());
a.subSequenceStartPositions = std::make_shared(v);
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 25d94f5a6a1255f3e2faff9816cfd003b20c0418..bc40d871d180a6bfe21200c866181dc161f5f078 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
@@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
return retv;
}
-TrainerConfig* TrainerConfig::createFromProtoString(
- const std::string& str) {
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared(trainerConfigProto);
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index bef499c67858b8e2d5432155a8defca56af6019c..9a4846d80980e23e97f89b6134e15af71207ae6b 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
const std::vector& types) {
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector realTypes;
@@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr(
}
GradientMachine* GradientMachine::createByConfigProtoStr(
- const std::string& protoStr, GradientMatchineCreateMode mode,
+ const std::string& protoStr,
+ GradientMatchineCreateMode mode,
const std::vector& types) {
paddle::ModelConfig conf;
conf.ParseFromString(protoStr);
@@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
}
GradientMachine* GradientMachine::createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode,
const std::vector& types) {
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
-void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs,
+void GradientMachine::forward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType) {
auto& in =
m->cast>(inArgs.getInternalArgumentsPtr());
@@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) {
}
void GradientMachine::forwardBackward(const Arguments& inArgs,
- Arguments* outArgs, PassType passType,
+ Arguments* outArgs,
+ PassType passType,
const UpdateCallback& callback) {
auto& in =
m->cast>(inArgs.getInternalArgumentsPtr());
@@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
- throw(UnsupportError) {
+ throw(UnsupportError) {
auto nn = std::dynamic_pointer_cast(m->machine);
if (nn) {
auto mat = nn->getLayerOutput(layerName);
@@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
}
SequenceGenerator* GradientMachine::asSequenceGenerator(
- const std::vector& dict, size_t begin_id, size_t end_id,
- size_t max_length, size_t beam_size) {
+ const std::vector& dict,
+ size_t begin_id,
+ size_t end_id,
+ size_t max_length,
+ size_t beam_size) {
SequenceGenerator* r =
SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
r->setDict(dict);
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index b990f650be9fa401898a8c6d10c21d9c90eb728a..66a13bc603ed5098997f168d3f527160ac3822ef 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "PaddleAPI.h"
@@ -23,7 +22,8 @@ limitations under the License. */
template
void staticCastVector(std::vector* dest, const std::vector& src) {
dest->resize(src.size());
- std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){
- return static_cast(t);
- });
+ std::transform(src.begin(),
+ src.end(),
+ dest->begin(),
+ [](T1 t) { return static_cast(t); });
}
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index e5493a381a6f9e3d135c14649a8e1e438494d363..f257ee65aa4a12dfcd1914ddbf0e16461a9b128c 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
@@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
return m;
}
-Matrix* Matrix::createDense(const std::vector& data, size_t height,
- size_t width, bool useGpu) {
+Matrix* Matrix::createDense(const std::vector& data,
+ size_t height,
+ size_t width,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::create(height, width, useGpu);
m->m->mat->copyFrom(data.data(), data.size());
return m;
}
-Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy, bool useGpu)
- throw (UnsupportError) {
+Matrix* Matrix::createDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// Gpu mode only supports copy=True
if (!copy) {
@@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
}
}
-Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy) {
auto m = new Matrix();
if (copy) {
@@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
return m;
}
-Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal, bool isTrans, bool useGpu) {
+Matrix* Matrix::createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal,
+ bool isTrans,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::createSparseMatrix(
- height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
- isTrans, useGpu);
+ height,
+ width,
+ nnz,
+ isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+ isTrans,
+ useGpu);
return m;
}
@@ -221,7 +234,8 @@ FloatArray Matrix::getData() const {
}
void Matrix::sparseCopyFrom(
- const std::vector& rows, const std::vector& cols,
+ const std::vector& rows,
+ const std::vector& cols,
const std::vector& vals) throw(UnsupportError) {
auto cpuSparseMat =
std::dynamic_pointer_cast(m->mat);
@@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom(
void* Matrix::getSharedPtr() const { return &m->mat; }
-void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
+void Matrix::toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
auto cpuMat = std::dynamic_pointer_cast(m->mat);
if (cpuMat) {
@@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
throw UnsupportError();
}
}
-void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
+void Matrix::copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
static_assert(sizeof(paddle::real) == sizeof(float),
"Currently PaddleAPI only support for single "
@@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
} else if (auto gpuMat = dynamic_cast(m->mat.get())) {
auto src = gpuMat->getData();
auto dest = *view_m_data;
- hl_memcpy_device2host(dest, src,
- sizeof(paddle::real) * (*dim1) * (*dim2));
+ hl_memcpy_device2host(
+ dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
} else {
LOG(WARNING) << "Unexpected Situation";
throw UnsupportError();
@@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
}
}
-void Matrix::copyFromNumpyMat(float* data, int dim1,
+void Matrix::copyFromNumpyMat(float* data,
+ int dim1,
int dim2) throw(UnsupportError, RangeError) {
if (isSparse()) {
throw UnsupportError();
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 5688ece44d2d58a2184a9f23d4af26c51c319579..c07facdb1292b34ac31247160a4347ea359e718b 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -61,8 +60,8 @@ class RangeError {};
/// Not support Error, such as access GPU memory directly, etc.
class UnsupportError : public std::runtime_error {
public:
- UnsupportError() : std::runtime_error(" ") {};
- UnsupportError(const std::string& message) : std::runtime_error(message) {};
+ UnsupportError() : std::runtime_error(" "){};
+ UnsupportError(const std::string& message) : std::runtime_error(message){};
};
/// This type will map to python's list of float.
@@ -112,7 +111,8 @@ public:
/**
* Create A Matrix with height,width, which is filled by zero.
*/
- static Matrix* createZero(size_t height, size_t width,
+ static Matrix* createZero(size_t height,
+ size_t width,
bool useGpu = isUsingGpu());
/**
@@ -124,8 +124,11 @@ public:
*
* @note the default sparse type is SPARSE_CSR.
*/
- static Matrix* createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal = true, bool trans = false,
+ static Matrix* createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal = true,
+ bool trans = false,
bool useGpu = isUsingGpu());
/**
@@ -134,13 +137,17 @@ public:
* @param data list of float should be passed in python.
* @note the value will be copy into a new matrix.
*/
- static Matrix* createDense(const std::vector& data, size_t height,
- size_t width, bool useGpu = isUsingGpu());
-
- static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Matrix* createDense(const std::vector& data,
+ size_t height,
+ size_t width,
+ bool useGpu = isUsingGpu());
+
+ static Matrix* createDenseFromNumpy(
+ float* data,
+ int dim1,
+ int dim2,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -151,7 +158,9 @@ public:
* @param copy true if copy into a new matrix, false will create
* matrix inplace.
*/
- static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+ static Matrix* createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy = false);
/// Create Gpu Dense Matrix from numpy matrix, dtype=float32
@@ -171,11 +180,13 @@ public:
* numpy_mat = m.toNumpyMat()
* @endcode
*/
- void toNumpyMatInplace(float** view_data, int* dim1,
+ void toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy To numpy mat.
- void copyToNumpyMat(float** view_m_data, int* dim1,
+ void copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy From Numpy Mat
@@ -248,15 +259,18 @@ public:
static Vector* create(const std::vector& data,
bool useGpu = isUsingGpu());
- static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Vector* createVectorFromNumpy(
+ float* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Vector from numpy array, which dtype=float32
*
* If copy is false, it will create vector inplace.
*/
- static Vector* createCpuVectorFromNumpy(float* data, int dim,
+ static Vector* createCpuVectorFromNumpy(float* data,
+ int dim,
bool copy = false);
/// Create Gpu Vector from numpy array, which dtype=float32
@@ -312,16 +326,19 @@ public:
static IVector* create(const std::vector& data,
bool useGpu = isUsingGpu());
- static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static IVector* createVectorFromNumpy(
+ int* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu IVector from numpy array, which dtype=int32
*
* If copy is false, it will create vector inplace
*/
- static IVector* createCpuVectorFromNumpy(int* data, int dim,
+ static IVector* createCpuVectorFromNumpy(int* data,
+ int dim,
bool copy = false);
/**
* Create Gpu IVector from numpy array, which dtype=int32
@@ -605,7 +622,8 @@ class ParameterTraverseCallback {
public:
~ParameterTraverseCallback();
- void apply(const std::vector& vecs, const ParameterConfig& config,
+ void apply(const std::vector& vecs,
+ const ParameterConfig& config,
size_t sparseId);
private:
@@ -638,7 +656,8 @@ public:
void finishBatch();
- void update(const std::vector& vecs, const ParameterConfig& conf,
+ void update(const std::vector& vecs,
+ const ParameterConfig& conf,
size_t sparseId = NO_SPARSE_ID);
std::vector getParameterTypes() const;
@@ -678,7 +697,8 @@ public:
* model config by TrainerConfig
*/
static GradientMachine* createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
const std::vector& parameterTypes = defaultParamTypes);
/**
@@ -701,7 +721,8 @@ public:
/**
* Combine forward/backward
*/
- void forwardBackward(const Arguments& inArgs, Arguments* outArgs,
+ void forwardBackward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType,
const UpdateCallback& callback = UpdateCallback());
@@ -722,14 +743,17 @@ public:
*/
SequenceGenerator* asSequenceGenerator(
const std::vector& dict = std::vector(),
- size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL,
+ size_t begin_id = 0UL,
+ size_t end_id = 0UL,
+ size_t max_length = 100UL,
size_t beam_size = -1UL);
private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
const std::vector& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
@@ -751,8 +775,8 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
- static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
- throw(IOError);
+ static Trainer* create(TrainerConfig* optConfig,
+ GradientMachine* gm) throw(IOError);
/// Start training
void startTrain();
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 8b56adc97c2d6178a9e0b272a9af89732a3573f6..c5876bb1c71438578831ffffd85840c706b6224c 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/parameter/Parameter.h"
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index b13761ab0900d57008c17094c5199ef31a040f54..21d031e4bcb897eb693e5cff56bc77a637dc6bd2 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
@@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate {
const paddle::ParameterOptimizer::TraverseCallback& callback)
: callback(callback) {}
- void apply(const std::vector& vecs, const ParameterConfig& conf,
+ void apply(const std::vector& vecs,
+ const ParameterConfig& conf,
size_t sparseId) {
std::vector real_vecs;
real_vecs.resize(vecs.size());
- std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
- if (v) {
- return *(paddle::VectorPtr*)(v->getSharedPtr());
- } else {
- return paddle::VectorPtr();
- }
- });
+ std::transform(vecs.begin(),
+ vecs.end(),
+ real_vecs.begin(),
+ [](Vector* v) {
+ if (v) {
+ return *(paddle::VectorPtr*)(v->getSharedPtr());
+ } else {
+ return paddle::VectorPtr();
+ }
+ });
paddle::ParameterConfig& real_conf =
*(paddle::ParameterConfig*)(const_cast(conf)
@@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
void ParameterOptimizer::update(const std::vector& vecs,
- const ParameterConfig& conf, size_t sparseId) {
- ParameterTraverseCallbackPrivate invoker([&](
- const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config,
- size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+ const ParameterConfig& conf,
+ size_t sparseId) {
+ ParameterTraverseCallbackPrivate invoker(
+ [&](const paddle::VectorPtr _vecs[],
+ const paddle::ParameterConfig& config,
+ size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
invoker.apply(vecs, conf, sparseId);
}
@@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector& vecs,
ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
const ParameterConfig& config) const {
- auto& param_config = *(paddle::ParameterConfig*)const_cast(
- config).getRawPtr();
+ auto& param_config =
+ *(paddle::ParameterConfig*)const_cast(config)
+ .getRawPtr();
auto callback = m->optimizer->needSpecialTraversal(param_config);
if (callback) {
auto retCallback = new ParameterTraverseCallback();
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 9d353ccc8e281e72a207ba19f45517fd256d6df2..d51be78d45902967107f4bf0af995958faed931a 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
// position
static void findNBest(paddle::GradientMachine* gradMachine,
std::vector& inArgs,
- std::vector& finalPaths, size_t bos_id,
- size_t eos_id, size_t max_length) {
+ std::vector& finalPaths,
+ size_t bos_id,
+ size_t eos_id,
+ size_t max_length) {
std::vector paths;
Path emptyPath;
paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
if (id < getSize()) {
Path& p = (*path_)[id];
std::ostringstream sout;
- std::transform(p.ids.begin(), p.ids.end(),
+ std::transform(p.ids.begin(),
+ p.ids.end(),
std::ostream_iterator(sout, split ? " " : ""),
[&](int id) { return (*dict_)[id]; });
return sout.str();
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index b61f36f740d47fe785b30361f26059bf0b64829d..7a6aa69fb652313748b1fa787847ffd74fda7a22 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
- m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
+ m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
}
-Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
- throw(IOError)
-{
+Trainer* Trainer::create(TrainerConfig* config,
+ GradientMachine* gm) throw(IOError) {
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
@@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
auto nn = std::dynamic_pointer_cast(
- this->m->getGradientMachine());
+ this->m->getGradientMachine());
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m);
}
-void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
+void Trainer::forwardOneBatch(size_t batchSize) {
+ m->forwardOneBatch(batchSize);
+}
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
@@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
void TrainerPrivate::forwardOneDataBatch(
const std::vector& inArgs) {
-
std::vector& outArgs = forwardOutput_;
if (config_->getOptConfig().use_sparse_remote_updater()) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index a8932351a685474a756c3f5b0e5e8c42bbf58237..1bba1df2e1c0a2d3cd2d8307ed3a0d784bb949b4 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l)
IntArray::IntArray(const int* b, const size_t l, bool f)
: buf(b), length(l), needFree(f) {}
-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+ const int* i,
+ size_t l,
bool f)
: valBuf(v), idxBuf(i), length(l), needFree(f) {}
-bool isUsingGpu() {return FLAGS_use_gpu;}
+bool isUsingGpu() { return FLAGS_use_gpu; }
-void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;}
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
bool isGpuVersion() {
#ifdef PADDLE_ONLY_CPU
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index d44cdefc35bd09e04412b52fb9981947caf89588..cc1c098223826a06fea291a95730d7fc1fd1beb3 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Vector.h"
@@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector& data, bool useGpu) {
return v;
}
-IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+IVector* IVector::createVectorFromNumpy(int* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=true is supported
if (!copy) {
@@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
} else if (auto gpuVec = dynamic_cast(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(int) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
@@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
}
}
-Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+Vector* Vector::createVectorFromNumpy(float* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=True is supported
if (!copy) {
@@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
} else if (auto gpuVec = dynamic_cast(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(float) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index cdb730bb3cec7a32fa42cf4c6738d575b76c6032..11dbfb54b268774405ade1e532bef9a0e8c7ada9 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -81,5 +81,8 @@ else()
add_library(paddle_cuda ${CUDA_SOURCES})
endif()
-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+ ${CUDA_SOURCES}
+ ${CUDA_HEADERS}
+ ${CUDA_DSO_SOURCES}
+ ${CUDA_CXX_WITH_GPU_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index c8aabc7844cd48d7ebdd0077684f9efa50f941a2..03e15b2223a50625c6999f6b081ae984e76b182b 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
@@ -21,11 +20,8 @@ limitations under the License. */
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
-#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \
- hppl::relu, \
- hppl::tanh, \
- hppl::linear \
- }
+#define HPPL_ACTIVE_FUNCTION \
+ { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
@@ -42,18 +38,18 @@ public:
#ifdef __NVCC__
namespace gpu {
-static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
static __device__ Active::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#else
namespace cpu {
-static Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#ifdef __AVX__
namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#endif
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index db75809f5de195d41577ed6569e8508f48241b69..a6d9ff8483eee28b2c8a380f0aca097c7662a02e 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_H_
#define HL_AGGREGATE_H_
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index cf062dd969bf79554e00369367e3b85c2ae7fc0d..ed339e312a7639cf9b78f130a43d67a7446576bb 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include
namespace hppl {
- __m256 relu(const __m256 a);
- __m256 sigmoid(const __m256 a);
- __m256 tanh(const __m256 a);
- __m256 linear(const __m256 a);
-
- __m256 relu(const __m256 a, const __m256 b);
- __m256 sigmoid(const __m256 a, const __m256 b);
- __m256 tanh(const __m256 a, const __m256 b);
- __m256 linear(const __m256 a, const __m256 b);
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 9f80898a1f927a0e8bbf86108567a04ccecc38f5..a076952467a5ce10dc1f58007dda2170aa694fbb 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-
#ifndef HL_BASE_H_
#define HL_BASE_H_
@@ -33,36 +31,36 @@ limitations under the License. */
* HPPL_STREAM_DEFAULT is HPPL default stream.
*/
typedef enum {
- HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
- HPPL_STREAM_1 = 1,
- HPPL_STREAM_2 = 2,
- HPPL_STREAM_3 = 3,
- HPPL_STREAM_4 = 4,
- HPPL_THREAD_STREAM_1 = 5,
- HPPL_THREAD_STREAM_2 = 6,
- HPPL_THREAD_STREAM_3 = 7,
- HPPL_THREAD_STREAM_4 = 8,
- HPPL_STREAM_END
+ HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+ HPPL_STREAM_1 = 1,
+ HPPL_STREAM_2 = 2,
+ HPPL_STREAM_3 = 3,
+ HPPL_STREAM_4 = 4,
+ HPPL_THREAD_STREAM_1 = 5,
+ HPPL_THREAD_STREAM_2 = 6,
+ HPPL_THREAD_STREAM_3 = 7,
+ HPPL_THREAD_STREAM_4 = 8,
+ HPPL_STREAM_END
} hl_stream_t;
/**
* @brief HPPL activation mode.
*/
typedef enum {
- HL_ACTIVATION_SIGMOID = 0,
- HL_ACTIVATION_RELU = 1,
- HL_ACTIVATION_TANH = 2,
- HL_ACTIVATION_LINEAR = 3,
- HL_ACTIVATION_END
+ HL_ACTIVATION_SIGMOID = 0,
+ HL_ACTIVATION_RELU = 1,
+ HL_ACTIVATION_TANH = 2,
+ HL_ACTIVATION_LINEAR = 3,
+ HL_ACTIVATION_END
} hl_activation_mode_t;
/**
* @brief Transpose type.
*/
typedef enum {
- HPPL_OP_N = 0, /* transpose */
- HPPL_OP_T = 1, /* non transpose */
- HPPL_OP_END
+ HPPL_OP_N = 0, /* non transpose */
+ HPPL_OP_T = 1, /* transpose */
+ HPPL_OP_END
} hl_trans_op_t;
/**
@@ -148,23 +146,21 @@ typedef struct {
* @brief Sparse matrix value type.
*/
typedef enum {
- HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
- HL_FLOAT_VALUE = 1,
- HL_VALUE_END
+ HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+ HL_FLOAT_VALUE = 1,
+ HL_VALUE_END
} hl_matrix_value_t;
-
/**
* @brief HPPL matrix format.
*/
typedef enum {
- HL_SPARSE_CSR = 0,
- HL_SPARSE_CSC = 1,
- HL_SPARSE_END
+ HL_SPARSE_CSR = 0,
+ HL_SPARSE_CSC = 1,
+ HL_SPARSE_END
} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;
/**
* @brief HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
* @param nnz nonzero values of sparse matrix.
*/
typedef struct {
- hl_matrix_s matrix;
- hl_matrix_format_t format;
- hl_matrix_value_t type;
- int rows;
- int cols;
- size_t nnz;
+ hl_matrix_s matrix;
+ hl_matrix_format_t format;
+ hl_matrix_value_t type;
+ int rows;
+ int cols;
+ size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
*
* HL_FLOAT_MIN: 1.17549435e-38F
*/
-#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MAX 3.40282347e+38F
/**
* if real == double
*
@@ -203,20 +199,18 @@ typedef struct {
*
* HL_FLOAT_MIN: 2.2250738585072014e-308
*/
-#define HL_FLOAT_MIN 1.17549435e-38F
+#define HL_FLOAT_MIN 1.17549435e-38F
#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
#endif
-
/**
* The maximum input value for exp, used to avoid overflow problem.
*
* Currently only used for tanh function.
*/
-#define EXP_MAX_INPUT 40.0
-
+#define EXP_MAX_INPUT 40.0
/**
* @brief DIVUP(x, y) is similar to ceil(x / y).
@@ -224,7 +218,7 @@ typedef struct {
* the size of blockDim.
*/
#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
#endif
#ifdef __NVCC__
@@ -233,7 +227,7 @@ typedef struct {
#include "hl_cuda.h"
#include "cuda_runtime.h"
-extern __thread bool g_sync_flag;
+extern __thread bool g_sync_flag;
extern __thread cudaStream_t default_stream;
#define STREAM_DEFAULT default_stream
@@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream;
* @brief Check cuda kernel execution.
* @param msg error string
*/
-#define CHECK_SYNC(msg) \
- if (true == g_sync_flag) { \
- hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
- cudaError_t err \
- = (cudaError_t)hl_get_device_last_error(); \
- CHECK_EQ(cudaSuccess, err) << "[" << msg << "] " \
- << "CUDA error: " \
- << hl_get_device_error_string((size_t)err); \
+#define CHECK_SYNC(msg) \
+ if (true == g_sync_flag) { \
+ hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
+ cudaError_t err = (cudaError_t)hl_get_device_last_error(); \
+ CHECK_EQ(cudaSuccess, err) \
+ << "[" << msg << "] " \
+ << "CUDA error: " << hl_get_device_error_string((size_t)err); \
}
-#endif /* __NVCC__ */
+#endif /* __NVCC__ */
-#endif /* HL_BASE_H_ */
+#endif /* HL_BASE_H_ */
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index 414c7996acee4ccbe2d7dbd093e25a23119fea3c..f3630e9762508fd39935e62e0007de04f9140fff 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_BATCH_TRANSPOSE_H_
#define HL_BATCH_TRANSPOSE_H_
@@ -31,10 +30,7 @@ limitations under the License. */
* order. Each batch has height * width data, which are
* arranged in height-first (or row-first) manner.
*/
-extern void batchTranspose(const real* input,
- real* output,
- int width,
- int height,
- int batchSize);
+extern void batchTranspose(
+ const real* input, real* output, int width, int height, int batchSize);
#endif // HL_BATCH_TRANSPOSE_H_
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 70b5be6fda2509853029a68d31129df28d580942..cffaac634f0f64be5ddab961d549ae43775bb7b0 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_H_
#define HL_CNN_H_
@@ -37,15 +36,21 @@ limitations under the License. */
* @param[in] alpha
* @param[in] beta
*/
-extern void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha = 1.0f, real beta = 0.0f);
+extern void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha = 1.0f,
+ real beta = 0.0f);
/**
* @brief Expand feature to column.
@@ -65,14 +70,19 @@ extern void hl_shrink_col2feature(
* @param[out] dataCol expand data.
*
*/
-extern void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol);
+extern void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol);
/**
* @brief Maximum pool forward.
@@ -94,15 +104,21 @@ extern void hl_expand_feature2col(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Maximum pool backward.
@@ -125,20 +141,28 @@ extern void hl_maxpool_forward(
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] targetGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride);
+extern void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride);
/**
* @brief Averge pool forward.
@@ -160,15 +184,21 @@ extern void hl_maxpool_backward(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Maximum pool backward.
@@ -189,19 +219,26 @@ extern void hl_avgpool_forward(
* @param[in] scaleA scale.
* @param[in] scaleB scale.
* @param[out] backGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride);
+extern void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride);
/**
* @brief Cross-map-respose normalize forward.
@@ -218,10 +255,16 @@ extern void hl_avgpool_backward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Cross-map-respose normalize backward.
@@ -240,11 +283,18 @@ extern void hl_CMRNorm_forward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Bilinear interpolation forward.
@@ -278,24 +328,24 @@ extern void hl_bilinear_forward(const real* inData,
const real ratioH,
const real ratioW);
- /**
- * @brief Bilinear interpolation backward.
- *
- * @param[out] inGrad input gradient.
- * @param[in] inImgH input image height.
- * @param[in] inImgW input image width.
- * @param[in] inputH input batchSize.
- * @param[in] inputW input image data dim.
- * @param[in] outGrad output gradient.
- * @param[in] outImgH output image height.
- * @param[in] outImgW output image width.
- * @param[in] outputH output batchSize.
- * @param[in] outputW output image data dim.
- * @param[in] numChannels number of channels.
- * @param[in] ratioH inImgH / outImgH.
- * @param[in] ratioW inImgW / outImgW.
- *
- */
+/**
+* @brief Bilinear interpolation backward.
+*
+* @param[out] inGrad input gradient.
+* @param[in] inImgH input image height.
+* @param[in] inImgW input image width.
+* @param[in] inputH input batchSize.
+* @param[in] inputW input image data dim.
+* @param[in] outGrad output gradient.
+* @param[in] outImgH output image height.
+* @param[in] outImgW output image width.
+* @param[in] outputH output batchSize.
+* @param[in] outputW output image data dim.
+* @param[in] numChannels number of channels.
+* @param[in] ratioH inImgH / outImgH.
+* @param[in] ratioW inImgW / outImgW.
+*
+*/
extern void hl_bilinear_backward(real* inGrad,
const size_t inImgH,
const size_t inImgW,
@@ -321,9 +371,13 @@ extern void hl_bilinear_backward(real* inGrad,
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
/**
* @brief MaxOut backward.
@@ -336,8 +390,12 @@ extern void hl_maxout_forward(
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
#endif /* HL_CNN_H_ */
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index 3196db67f61fd2e6b75df4abb3652df4456a0366..2c7d665101f36f9c32ab132ca279abf3ac062a8f 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_H_
#define HL_CUDA_H_
-#include "hl_base.h"
#include
+#include "hl_base.h"
/**
* @brief HPPL event.
*/
-typedef struct _hl_event_st * hl_event_t;
-
+typedef struct _hl_event_st *hl_event_t;
/**
* @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
* if device is NULL, will start all GPU.
* @param[in] number number of devices.
*/
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);
/**
* @brief Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
*
* @return dest_d pointer to device memory.
*/
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);
/**
* @brief Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
*
* @return dest_h pointer to host memory.
*/
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);
/**
* @brief Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
* @param[in] stream stream id.
*/
extern void hl_memcpy_async(void *dst,
- void *src,
- size_t size,
- hl_stream_t stream);
+ void *src,
+ size_t size,
+ hl_stream_t stream);
/**
* @brief Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
*
* @return time Time between start and end in ms.
*/
-extern float hl_event_elapsed_time(hl_event_t start,
- hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
/**
* @brief Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block();
/**
* @brief Returns the last error string from a cuda runtime call.
*/
-extern const char* hl_get_device_error_string();
+extern const char *hl_get_device_error_string();
/**
* @brief Returns the last error string from a cuda runtime call.
@@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string();
*
* @see hl_get_device_last_error()
*/
-extern const char* hl_get_device_error_string(size_t err);
+extern const char *hl_get_device_error_string(size_t err);
/**
* @brief Returns the last error number.
@@ -335,4 +332,14 @@ extern bool hl_cuda_event_is_ready(hl_event_t event);
*/
extern void hl_device_synchronize();
+/**
+ * @brief gpu profiler start
+ */
+extern void hl_profiler_start();
+
+/**
+ * @brief gpu profiler stop
+ */
+extern void hl_profiler_end();
+
#endif // HL_CUDA_H_
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h
index d757317eb4a97559feef22d4fd8edf7c10ca6745..db8c03c2c01c67788622d37b5330e22c31e03f34 100644
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_H_
#define HL_CUDA_CUBLAS_H_
@@ -29,12 +28,8 @@ limitations under the License. */
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
@@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
/*
* @brief Matrix inverse
@@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] ldc the first dimension of C_d
*
*/
-extern void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d,
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta);
#endif /* HL_CUDA_CUBLAS_H_ */
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index f256cb54dfe69e8df7cc7fcc0ed0a58f3574acd3..3a2f916210277145efa8f6d7663a2698ea546b0b 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_H_
#define HL_CUDA_CUDNN_H_
@@ -22,7 +21,7 @@ limitations under the License. */
* hppl pooling mode
*/
typedef enum {
- HL_POOLING_MAX = 0,
+ HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
@@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdFilterAlgo backward filter algorithm.
*/
-extern void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo);
+extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo);
/**
* @brief convolution backward data(calculate input image grad data).
@@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter(
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdDataAlgo backward data algorithm.
*/
-extern void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo);
+extern void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo);
/**
* @brief convolution backward bias(calculate bias grad data).
@@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_forward(real *input,
- real *output,
+extern void hl_softmax_forward(real* input,
+ real* output,
int height,
int width);
@@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_backward(real *output_value,
- real *output_grad,
+extern void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
int width);
@@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value,
*
*/
extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar);
+ real* savedMean,
+ real* savedVar);
/**
* @brief cudnn batch norm forward.
@@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon);
/**
@@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
* @param[in] inGradDesc input tensor descriptor desc.
* @param[in] inGrad input data.
* @param[in] dBnParamDesc tensor descriptor desc.
- * bnScale, bnBias, running mean/var, save_mean/var.
+ * bnScale, bnBias, running mean/var,
+ * save_mean/var.
* @param[in] scale batch normalization scale parameter (in original
* paper scale is referred to as gamma).
* @param[in] scaleGrad batch normalization scale parameter (in original
@@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar);
+ real* savedMean,
+ real* savedInvVar);
#endif // HL_CUDA_CUDNN_H_
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index f36c724e2da3dce11696fcda7daf98f5cda36dd6..1eb9f9ca888d3a93f04621e10346b5f9ff34cdca 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_DSO_LOADER_H_
#define HL_DSO_LOADER_H_
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h
index 65f366461ced0f9ee31ff9075f6dfaeb6c9b72a2..91ce9a0678463597df88c548aeac322ee19d95de 100644
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
@@ -21,30 +20,30 @@ limitations under the License. */
/**
* sigmoid threshold maximum
*/
-#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold minimum
*/
-#define SIGMOID_THRESHOLD_MAX 13.0
+#define SIGMOID_THRESHOLD_MAX 13.0
#ifndef __NVCC__
namespace hppl {
- /*
- * forward activation
- */
- real relu(const real a);
- real sigmoid(const real a);
- real tanh(const real a);
- real linear(const real a);
-
- /*
- * backward activation
- */
- real relu(const real a, const real b);
- real sigmoid(const real a, const real b);
- real tanh(const real a, const real b);
- real linear(const real a, const real b);
+/*
+ * forward activation
+ */
+real relu(const real a);
+real sigmoid(const real a);
+real tanh(const real a);
+real linear(const real a);
+
+/*
+ * backward activation
+ */
+real relu(const real a, const real b);
+real sigmoid(const real a, const real b);
+real tanh(const real a, const real b);
+real linear(const real a, const real b);
} // namespace hppl
#ifdef __AVX__
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index 05039663b6e9f5e4a72f15ab822d723635f9b282..3be0df3b93b69811fb9c36dae223cbd927b02559 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_GPU_H_
#define HL_GPU_H_
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h
index 1f95e318a1fe06050bbd31c2e276974f4a8bdc1e..7e527a79025969320f1aca75d313fd9d0194efd1 100644
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_H_
#define HL_LSTM_H_
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index 6195e30b9974d3ad092b4cf604e6b74fa481835c..96648661e345d8fa5d50cb2aae3a56ee53921f90 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_H_
#define HL_MATRIX_H_
@@ -30,13 +29,8 @@ limitations under the License. */
* @param[in] beta scalar used for addition.
*
*/
-extern void hl_matrix_add(real* A_d,
- real* B_d,
- real* C_d,
- int dimM,
- int dimN,
- real alpha,
- real beta);
+extern void hl_matrix_add(
+ real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
/**
* @brief Matrix Softmax.
*
@@ -46,7 +40,7 @@ extern void hl_matrix_add(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
+extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix softmax derivative.
@@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN);
+extern void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
/**
* @brief Sequence softmax.
@@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d,
* @param[in] numSequence sequence number.
*
*/
-extern void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+extern void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence);
@@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy.
@@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy back propagation.
@@ -120,11 +105,8 @@ extern void hl_matrix_cross_entropy(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy
@@ -135,11 +117,8 @@ extern void hl_matrix_cross_entropy_bp(real* grad_d,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy backprop
@@ -150,11 +129,8 @@ extern void hl_matrix_multi_binary_cross_entropy(real* output,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix zero memory.
@@ -176,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num);
* @param[in] partial_sum
*/
-extern void hl_param_relu_forward(real* output,
- real* input,
- real* w,
- int width,
- int height,
- int partial_sum);
+extern void hl_param_relu_forward(
+ real* output, real* input, real* w, int width, int height, int partial_sum);
/**
* @brief parameter relu backward w
*
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 46d86b2982f065802eec83ca7554f787d1d02f3a..bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_H_
#define HL_SEQUENCE_H_
@@ -32,7 +31,7 @@ limitations under the License. */
extern void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim);
@@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input,
* @param[in] dim input dimension.
*
*/
-extern void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim);
+extern void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
@@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad,
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
* @param[in] isPadding trainable padding.
*
*/
@@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad,
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
@@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+extern void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
@@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+extern void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index 9acdebdebf37761e1485e3441963586ead9f3c85..c4e0be23e2031cbcb124b532216a23d8a344668d 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_H_
#define HL_SPARSE_H_
@@ -31,7 +30,7 @@ limitations under the License. */
*/
extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
*/
extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
@@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d,
* @note transb is not support HPPL_OP_T.
*
*/
-extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
* @note transa is not support HPPL_OP_T.
*
*/
-extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+extern void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief Memcpy csc_matrix to host.
@@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream);
-
/**
* @brief A_d[j] += B_d[i,j] for i in range(height)
*
@@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
* @param[in] scale scale of B_d
*
*/
-extern void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matirx_column_sum
*/
-extern void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief A_d[i,j] += B_d[j]
@@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d,
*
*/
extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
*/
extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
@@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
*
*/
extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
*/
extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
* @return return rows pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
/**
* @brief get cols pionter of GpuSparseMatrix
@@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
* @return return cols pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
/**
* @brief get value pionter of GpuSparseMatrix
@@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
* @return return value pointer, which is gpu address
*
*/
-extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
+extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
#endif /* HL_SPARSE_H_ */
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h
index 3c9428e9253d5ed563e4e9f62d8842667496b83c..b4ac83a66af13c2a843872faba2ebd972008a738 100644
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TABLE_APPLY_H_
#define HL_TABLE_APPLY_H_
@@ -31,8 +30,10 @@ limitations under the License. */
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_select_rows(real* output, int ldo,
- real* table, int ldt,
+extern void hl_matrix_select_rows(real* output,
+ int ldo,
+ real* table,
+ int ldt,
int* ids,
int numSamples,
int tableSize,
@@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo,
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_add_to_rows(real* table, int ldt,
- real* input, int ldi,
+extern void hl_matrix_add_to_rows(real* table,
+ int ldt,
+ real* input,
+ int ldi,
int* ids,
int numSamples,
int tableSize,
@@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt,
*
*/
template
-extern void hl_vector_select_from(T* dst, int sized,
- const T* src, int sizes,
- const int* ids, int sizei);
+extern void hl_vector_select_from(
+ T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
-#endif /* HL_TABLE_APPLY_H_ */
+#endif /* HL_TABLE_APPLY_H_ */
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index 4414b0b2d2ed4ab6a48294ffaed3a43a639e5950..b0a88c66a12fcfec6ea96b877423f907dac8dfa1 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TIME_H_
#define HL_TIME_H_
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index a38d4cf862278a060f72b970d723895dc3735d0a..e8cfebbf6a3bd27c10a71d7817238bc304681fa4 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TOP_K_H_
#define HL_TOP_K_H_
@@ -31,9 +30,11 @@ limitations under the License. */
* @param[in] numSamples height of input value.
*
*/
-extern void hl_matrix_top_k(real* topVal, int ldv,
- int * topIds,
- real* src, int lds,
+extern void hl_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
+ real* src,
+ int lds,
int dim,
int beamSize,
int numSamples);
@@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv,
*
* @note Only support HL_SPARSE_CSR format.
*/
-extern void hl_sparse_matrix_top_k(real* topVal, int ldv,
- int * topIds,
+extern void hl_sparse_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples);
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h
index 4c0c68f3c98fe95f01060b82c3a1b9822d2a3715..bb53fc581e09905aa7a9b2d8dfe44b04c677c40a 100644
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
@@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_STUB_H_
#define HL_AGGREGATE_STUB_H_
#include "hl_aggregate.h"
-inline void hl_matrix_row_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c6f32ad337705ff938b7b370a4785dc7f4393041..2f73b9671edd3609996aebff2913f5262805f869 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -12,84 +12,134 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_STUB_H_
#define HL_CNN_STUB_H_
#include "hl_cnn.h"
-inline void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha, real beta) {}
-
-inline void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol) {}
-
-inline void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride) {}
-
-inline void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride) {}
-
-inline void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
-
-inline void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
+inline void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha,
+ real beta) {}
+
+inline void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol) {}
+
+inline void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride) {}
+
+inline void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride) {}
+
+inline void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
+
+inline void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
inline void hl_bilinear_forward(const real* inData,
const size_t inImgH,
@@ -106,25 +156,33 @@ inline void hl_bilinear_forward(const real* inData,
const real ratioW) {}
inline void hl_bilinear_backward(real* inGrad,
- const size_t inImgH,
- const size_t inImgW,
- const size_t inputH,
- const size_t inputW,
- const real* outGrad,
- const size_t outImgH,
- const size_t outImgW,
- const size_t outputH,
- const size_t outputW,
- const size_t numChannels,
- const real ratioH,
- const real ratioW) {}
-
-inline void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
-
-inline void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
+ const size_t inImgH,
+ const size_t inImgW,
+ const size_t inputH,
+ const size_t inputW,
+ const real* outGrad,
+ const size_t outImgH,
+ const size_t outImgW,
+ const size_t outputH,
+ const size_t outputW,
+ const size_t numChannels,
+ const real ratioH,
+ const real ratioW) {}
+
+inline void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
+
+inline void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
#endif // HL_CNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
index 903dcbe8355d6f593d96bc1f9e686d54035a9366..85f7c390c47397127487b16fdc933f0afe2fb880 100644
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
@@ -12,41 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_STUB_H_
#define HL_CUDA_CUBLAS_STUB_H_
#include "hl_cuda_cublas.h"
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
- real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {}
+inline void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_matrix_inverse(
+ real *A_d, real *C_d, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
+ real *C_d,
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
#endif // HL_CUDA_CUBLAS_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index b96804afd86ba5e8c7b7eed7eb768295b4e23096..3beb0e5b5170261a6c453936b8b0347f3e97dbff 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_STUB_H_
#define HL_CUDA_CUDNN_STUB_H_
#include "hl_cuda_cudnn.h"
-inline int hl_get_cudnn_lib_version() {
- return 0;
-}
+inline int hl_get_cudnn_lib_version() { return 0; }
inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
@@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input,
hl_pooling_descriptor pooling) {}
inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
- int input_feature_maps,
- int output_feature_maps,
- int height,
- int width) {}
+ int input_feature_maps,
+ int output_feature_maps,
+ int height,
+ int width) {}
inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
inline void hl_conv_workspace(hl_tensor_descriptor input,
- hl_tensor_descriptor output,
- hl_filter_descriptor filter,
- hl_convolution_descriptor conv,
- int* convFwdAlgo,
- size_t* fwdLimitBytes,
- int* convBwdDataAlgo,
- size_t* bwdDataLimitBytes,
- int* convBwdFilterAlgo,
- size_t* bwdFilterLimitBytes) {}
+ hl_tensor_descriptor output,
+ hl_filter_descriptor filter,
+ hl_convolution_descriptor conv,
+ int* convFwdAlgo,
+ size_t* fwdLimitBytes,
+ int* convBwdDataAlgo,
+ size_t* bwdDataLimitBytes,
+ int* convBwdFilterAlgo,
+ size_t* bwdFilterLimitBytes) {}
inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data,
@@ -116,86 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input,
int convFwdAlgo) {}
inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
- real* bias_data,
- hl_tensor_descriptor output,
- real* output_data) {}
-
-inline void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo) {}
+ real* bias_data,
+ hl_tensor_descriptor output,
+ real* output_data) {}
+
+inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo) {}
+
+inline void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo) {}
inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
- real* bias_grad_data,
- hl_tensor_descriptor output,
- real* output_grad_data) {}
+ real* bias_grad_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data) {}
-inline void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width) {}
-
-inline void hl_softmax_backward(real *output_value,
- real *output_grad,
+inline void hl_softmax_forward(real* input,
+ real* output,
int height,
int width) {}
+inline void hl_softmax_backward(real* output_value,
+ real* output_grad,
+ int height,
+ int width) {}
+
inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {}
+ real* savedMean,
+ real* savedVar) {}
inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon) {}
inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {}
+ real* savedMean,
+ real* savedInvVar) {}
#endif // HL_CUDA_CUDNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index 675ac03b0e188e9b26038dd4e40264099618e17a..24923a0d4a0cdd49214305c2f7716eeef575c7ee 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_STUB_H_
#define HL_CUDA_STUB_H_
@@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {}
inline void hl_init(int device) {}
-inline int hl_get_cuda_lib_version(int device) {
- return 0;
-}
+inline int hl_get_cuda_lib_version(int device) { return 0; }
inline void hl_fini() {}
inline void hl_set_sync_flag(bool flag) {}
-inline bool hl_get_sync_flag() {
- return false;
-}
+inline bool hl_get_sync_flag() { return false; }
-inline int hl_get_device_count() { return 0; }
+inline int hl_get_device_count() { return 0; }
inline void hl_set_device(int device) {}
-inline int hl_get_device() { return 0; }
+inline int hl_get_device() { return 0; }
-inline void* hl_malloc_device(size_t size) { return NULL; }
+inline void *hl_malloc_device(size_t size) { return NULL; }
inline void hl_free_mem_device(void *dest_d) {}
-inline void* hl_malloc_host(size_t size) { return NULL; }
+inline void *hl_malloc_host(size_t size) { return NULL; }
inline void hl_free_mem_host(void *dest_h) {}
@@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {}
inline void hl_srand(unsigned int seed) {}
-inline void hl_memcpy_async(void *dst, void *src, size_t size,
+inline void hl_memcpy_async(void *dst,
+ void *src,
+ size_t size,
hl_stream_t stream) {}
inline void hl_stream_synchronize(hl_stream_t stream) {}
@@ -83,14 +80,18 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
inline void hl_event_synchronize(hl_event_t event) {}
-inline int hl_get_device_last_error() { return 0; }
+inline int hl_get_device_last_error() { return 0; }
-inline const char* hl_get_device_error_string() { return NULL; }
+inline const char *hl_get_device_error_string() { return NULL; }
-inline const char* hl_get_device_error_string(size_t err) { return NULL; }
+inline const char *hl_get_device_error_string(size_t err) { return NULL; }
inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
inline void hl_device_synchronize() {}
+inline void hl_profiler_start() {}
+
+inline void hl_profiler_end() {}
+
#endif // HL_CUDA_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h
index 2700bef02a5e1e40ee7603ccab7fec754196f8cd..7ccda032d26f2fbbe99136e8481416daea557a78 100644
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_STUB_H_
#define HL_LSTM_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 76cac2e57769301fee2e5979e2685976daf35441..1bd78d23fbaf46e6265ba0db25ea399a204bd96f 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_STUB_H_
#define HL_MATRIX_STUB_H_
@@ -26,48 +25,30 @@ inline void hl_matrix_add(real* A_d,
real alpha,
real beta) {}
-inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {}
+inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
-inline void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+inline void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence) {}
-inline void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
+inline void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
+
+inline void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
inline void hl_matrix_zero_mem(real* data, int num) {}
@@ -101,7 +82,6 @@ inline void hl_cossim(real* output,
int input2_height,
real scale) {}
-
inline void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index aabd956c37f7dce48a379b995ab88a53aa65c760..381f0a6f26c5669465f029e972c6ca8b0e6e1776 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_STUB_H_
#define HL_SEQUENCE_STUB_H_
@@ -21,15 +20,12 @@ limitations under the License. */
inline void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim) {}
-inline void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim) {}
+inline void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
@@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
int contextStart,
int beginPad) {}
-inline void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+inline void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
-inline void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+inline void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h
index 346a1900dda5825e9a4311a2c51e8a50e6e7df0b..d47bdd2c47d097c4c68b7b7e88ef888bc18270c2 100644
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_STUB_H_
#define HL_SPARSE_STUB_H_
@@ -20,7 +19,7 @@ limitations under the License. */
inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_matrix_dense_mul_csc(real *A_d,
hl_trans_op_t transa,
@@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d,
real alpha,
real beta) {}
-inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
-inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+inline void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_memcpy_from_csc_matrix(real *csc_val,
size_t val_size,
@@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream) {}
-inline void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-inline void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
-inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
-inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
-inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
return NULL;
}
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h
index 2922d4dc2937662d66fb2433f4883448ba21fa3f..2412ed5abc13b2a83521a75524f581e106788b60 100644
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
@@ -32,32 +32,35 @@
#include <immintrin.h>
/* yes I know, the top of this file is quite ugly */
-# define ALIGN32_BEG
-# define ALIGN32_END __attribute__((aligned(32)))
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
/* __m128 is ugly to write */
-typedef __m256 v8sf; // vector of 8 float (avx)
-typedef __m256i v8si; // vector of 8 int (avx)
-typedef __m128i v4si; // vector of 8 int (avx)
+typedef __m256 v8sf; // vector of 8 float (avx)
+typedef __m256i v8si; // vector of 8 int (avx)
+typedef __m128i v4si; // vector of 8 int (avx)
-#define _PI32AVX_CONST(Name, Val) \
- static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+#define _PI32AVX_CONST(Name, Val) \
+ static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
+ Val, Val, Val, Val}
_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
-
/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val) \
- static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PI32_CONST256(Name, Val) \
- static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PS256_CONST_TYPE(Name, Type, Val) \
- static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-
-_PS256_CONST(1 , 1.0f);
+#define _PS256_CONST(Name, Val) \
+ static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val) \
+ static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val) \
+ static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
@@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);
@@ -94,50 +97,51 @@ typedef union imm_xmm_union {
v4si xmm[2];
} imm_xmm_union;
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
- imm_xmm_union u __attribute__((aligned(32))); \
- u.imm = imm_; \
- xmm0_ = u.xmm[0]; \
- xmm1_ = u.xmm[1]; \
-}
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
+ { \
imm_xmm_union u __attribute__((aligned(32))); \
- u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
+ u.imm = imm_; \
+ xmm0_ = u.xmm[0]; \
+ xmm1_ = u.xmm[1]; \
}
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
+ { \
+ imm_xmm_union u __attribute__((aligned(32))); \
+ u.xmm[0] = xmm0_; \
+ u.xmm[1] = xmm1_; \
+ imm_ = u.imm; \
+ }
-#define AVX2_BITOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, int a) \
-{ \
- /* use SSE2 instruction to perform the bitop AVX2 */ \
- v4si x1, x2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- x1 = _mm_##fn(x1,a); \
- x2 = _mm_##fn(x2,a); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_BITOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, int a) { \
+ /* use SSE2 instruction to perform the bitop AVX2 */ \
+ v4si x1, x2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ x1 = _mm_##fn(x1, a); \
+ x2 = _mm_##fn(x2, a); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
-#define AVX2_INTOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, v8si y) \
-{ \
- /* use SSE2 instructions to perform the AVX2 integer operation */ \
- v4si x1, x2; \
- v4si y1, y2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- COPY_IMM_TO_XMM(y, y1, y2); \
- x1 = _mm_##fn(x1,y1); \
- x2 = _mm_##fn(x2,y2); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_INTOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \
+ /* use SSE2 instructions to perform the AVX2 integer operation */ \
+ v4si x1, x2; \
+ v4si y1, y2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ COPY_IMM_TO_XMM(y, y1, y2); \
+ x1 = _mm_##fn(x1, y1); \
+ x2 = _mm_##fn(x2, y2); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
@@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32)
#define avx2_mm256_add_epi32 _mm256_add_epi32
#endif /* __AVX2__ */
-
-/* natural logarithm computed for 8 simultaneous float
+/* natural logarithm computed for 8 simultaneous float
return NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+ // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */
+ x = _mm256_max_ps(
+ x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
- x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+ x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
// this is again another AVX2 instruction
- imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
- /* part2:
+ /* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
- //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
- v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+ v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
x = _mm256_add_ps(x, tmp);
- v8sf z = _mm256_mul_ps(x,x);
+ v8sf z = _mm256_mul_ps(x, x);
- v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+ v8sf y = *(v8sf *)_ps256_cephes_log_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
-
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
- y = _mm256_add_ps(y, tmp);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+ y = _mm256_add_ps(y, tmp);
- tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+ tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
- x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+ x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
@@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+ x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+ x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
- fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
- fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+ fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+ fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
/* how to perform a floorf with SSE: just below */
- //imm0 = _mm256_cvttps_epi32(fx);
- //tmp = _mm256_cvtepi32_ps(imm0);
-
+ // imm0 = _mm256_cvttps_epi32(fx);
+ // tmp = _mm256_cvtepi32_ps(imm0);
+
tmp = _mm256_floor_ps(fx);
/* if greater, substract 1 */
- //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
- v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+ // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+ v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
- v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+ tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+ v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
- z = _mm256_mul_ps(x,x);
-
- v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+ z = _mm256_mul_ps(x, x);
+
+ v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
@@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
- imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
@@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at onces using AVX intrisics
@@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
surprising but correct result.
*/
-v8sf sin256_ps(v8sf x) { // any x
+v8sf sin256_ps(v8sf x) { // any x
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
v8si imm0, imm2;
@@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit = x;
/* take the absolute value */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* extract the sign bit (upper one) */
- sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
-
+ sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
/* scale by 4/Pi */
- y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+ y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
- /*
- Here we start a series of integer operations, which are in the
- realm of AVX2.
- If we don't have AVX, let's perform them using SSE2 directives
- */
+/*
+ Here we start a series of integer operations, which are in the
+ realm of AVX2.
+ If we don't have AVX, let's perform them using SSE2 directives
+*/
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
- imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
- imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+ imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+ imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
- imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+ imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
- /* get the polynom selection mask
+ /* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4
#include "hl_functions.h"
namespace hppl {
- extern __m256 exp(__m256 a);
+extern __m256 exp(__m256 a);
- __m256 relu(const __m256 a) {
- __m256 tmp = _mm256_set1_ps(0.0f);
- return _mm256_max_ps(a, tmp);
- }
+__m256 relu(const __m256 a) {
+ __m256 tmp = _mm256_set1_ps(0.0f);
+ return _mm256_max_ps(a, tmp);
+}
- __m256 sigmoid(const __m256 a) {
- __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
- __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
- __m256 tmp = _mm256_max_ps(a, min);
- tmp = _mm256_min_ps(tmp, max);
- tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
- tmp = exp(tmp);
- tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
- tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
- return tmp;
- }
+__m256 sigmoid(const __m256 a) {
+ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+ __m256 tmp = _mm256_max_ps(a, min);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
+ tmp = exp(tmp);
+ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
+ tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
+ return tmp;
+}
- __m256 tanh(const __m256 a) {
- __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
- __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
- tmp = _mm256_min_ps(tmp, max);
- tmp = exp(tmp);
- return _mm256_sub_ps(
- _mm256_div_ps(_mm256_set1_ps(2.0f),
- _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f));
- }
+__m256 tanh(const __m256 a) {
+ __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
+ __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = exp(tmp);
+ return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
+ _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
+ _mm256_set1_ps(1.0f));
+}
- __m256 linear(const __m256 a) {
- return a;
- }
+__m256 linear(const __m256 a) { return a; }
- __m256 relu(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
+__m256 relu(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a,
_mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
- _mm256_set1_ps(1.0f)));
- }
+ _mm256_set1_ps(1.0f)));
+}
- __m256 sigmoid(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(_mm256_mul_ps(a, b),
- _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
- }
+__m256 sigmoid(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(_mm256_mul_ps(a, b),
+ _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
+}
- __m256 tanh(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
- _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
- }
+__m256 tanh(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
+}
- __m256 linear(const __m256 a, const __m256 b) {
- return a;
- }
+__m256 linear(const __m256 a, const __m256 b) { return a; }
} // namespace hppl
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe..af00f352e536bf342e15315d1f6804225b87eb0b 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -12,46 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include "hl_functions.h"
namespace hppl {
- real relu(const real a) {
- return a > 0.0f ? a : 0.0f;
- }
-
- real sigmoid(const real a) {
- const real min = SIGMOID_THRESHOLD_MIN;
- const real max = SIGMOID_THRESHOLD_MAX;
- real tmp = (a < min) ? min : ((a > max) ? max : a);
- return 1.0 / (1.0 + exp(-tmp));
- }
-
- real tanh(const real a) {
- real tmp = -2.0 * a;
- tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
- return (2.0 / (1.0 + exp(tmp))) - 1.0;
- }
-
- real linear(const real a) {
- return a;
- }
-
- real relu(const real a, const real b) {
- return a * (b > 0.0f ? 1.0f : 0.0f);
- }
-
- real sigmoid(const real a, const real b) {
- return a * b * (1 - b);
- }
-
- real tanh(const real a, const real b) {
- return a * (1.0f - b * b);
- }
-
- real linear(const real a, const real b) {
- return a;
- }
+real relu(const real a) { return a > 0.0f ? a : 0.0f; }
+
+real sigmoid(const real a) {
+ const real min = SIGMOID_THRESHOLD_MIN;
+ const real max = SIGMOID_THRESHOLD_MAX;
+ real tmp = (a < min) ? min : ((a > max) ? max : a);
+ return 1.0 / (1.0 + exp(-tmp));
+}
+
+real tanh(const real a) {
+ real tmp = -2.0 * a;
+ tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+ return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+real linear(const real a) { return a; }
+
+real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
+
+real sigmoid(const real a, const real b) { return a * b * (1 - b); }
+
+real tanh(const real a, const real b) { return a * (1.0f - b * b); }
+
+real linear(const real a, const real b) { return a; }
} // namespace hppl
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index f16376ec937d3a397d9e7117de528c304f8403ee..e8ba232d44b3f66254d4749d4abbcfbe46d1fd0e 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include "hl_cuda.h"
@@ -24,7 +23,7 @@ limitations under the License. */
namespace dynload {
std::once_flag cublas_dso_flag;
-void* cublas_dso_handle = nullptr;
+void *cublas_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -34,31 +33,30 @@ void* cublas_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- typedef cublasStatus_t (*cublasFunc)(Args...); \
- std::call_once(cublas_dso_flag, GetCublasDsoHandle, \
- &cublas_dso_handle); \
- void* p_##__name = dlsym(cublas_dso_handle, #__name); \
- return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ typedef cublasStatus_t (*cublasFunc)(Args...); \
+ std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
+ void *p_##__name = dlsym(cublas_dso_handle, #__name); \
+ return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
+ } \
} __name; // struct DynLoad__##__name
#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
} __name; // struct DynLoad__##__name
#endif
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
- DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
// include all needed cublas functions in HPPL
+// clang-format off
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSgemv) \
__macro(cublasDgemv) \
@@ -88,41 +86,41 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */
-
+// clang-format on
#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
+#define CUBLAS_GEAM dynload::cublasSgeam
+#define CUBLAS_GEMV dynload::cublasSgemv
+#define CUBLAS_GEMM dynload::cublasSgemm
+#define CUBLAS_GETRF dynload::cublasSgetrfBatched
+#define CUBLAS_GETRI dynload::cublasSgetriBatched
#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
+#define CUBLAS_GEAM dynload::cublasDgeam
+#define CUBLAS_GEMV dynload::cublasDgemv
+#define CUBLAS_GEMM dynload::cublasDgemm
+#define CUBLAS_GETRF dynload::cublasDgetrfBatched
+#define CUBLAS_GETRI dynload::cublasDgetriBatched
#endif
-const char* hl_cublas_get_error_string(cublasStatus_t status) {
- switch(status) {
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "[cublas status]: not initialized";
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "[cublas status]: allocate failed";
- case CUBLAS_STATUS_INVALID_VALUE:
- return "[cublas status]: invalid value";
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "[cublas status]: arch mismatch";
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "[cublas status]: mapping error";
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "[cublas status]: execution failed";
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "[cublas status]: internal error";
- case CUBLAS_STATUS_SUCCESS:
- return "[cublas status]: success";
- default:
- return "[cublas status]: unknown error";
+const char *hl_cublas_get_error_string(cublasStatus_t status) {
+ switch (status) {
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "[cublas status]: not initialized";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "[cublas status]: allocate failed";
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "[cublas status]: invalid value";
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "[cublas status]: arch mismatch";
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "[cublas status]: mapping error";
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "[cublas status]: execution failed";
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "[cublas status]: internal error";
+ case CUBLAS_STATUS_SUCCESS:
+ return "[cublas status]: success";
+ default:
+ return "[cublas status]: unknown error";
}
}
@@ -131,27 +129,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) {
* support << operator for more details error info.
*/
cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func) \
- g_cublasStat = cublas_func; \
- CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
- << "Cublas Error: " \
- << hl_cublas_get_error_string(g_cublasStat) \
- << " "
+#define CHECK_CUBLAS(cublas_func) \
+ g_cublasStat = cublas_func; \
+ CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
+ << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
- << "[cublas init] Cublas create handle faild!";
+ << "[cublas init] Cublas create handle faild!";
CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
- << "[cublas init] Cublas set stream faild!";
+ << "[cublas init] Cublas set stream faild!";
}
-void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {
+void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
real alpha = 1.0;
real beta = 0.0;
@@ -159,11 +151,18 @@ void hl_matrix_transpose(real *A_d,
CHECK_NOTNULL(C_d);
CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
- CUBLAS_OP_T, CUBLAS_OP_N,
- dimM, dimN,
- &alpha, A_d, lda,
- &beta, nullptr, dimM,
- C_d, ldc));
+ CUBLAS_OP_T,
+ CUBLAS_OP_N,
+ dimM,
+ dimN,
+ &alpha,
+ A_d,
+ lda,
+ &beta,
+ nullptr,
+ dimM,
+ C_d,
+ ldc));
CHECK_SYNC("hl_matrix_transpose failed");
}
@@ -181,21 +180,20 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *));
- int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
+ int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of
small-sized matrices. There may be a better way to reconstruct
the API for better performance.
*/
- CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
- dimN, inout_d, lda, pivot_d,
- info_d, 1));
+ CHECK_CUBLAS(
+ CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
- int info_h;
+ int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
}
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
@@ -204,27 +202,40 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
- dimN, (const real **)inout_d, lda, pivot_d,
- out_d, ldc, info_d, 1));
+ dimN,
+ (const real **)inout_d,
+ lda,
+ pivot_d,
+ out_d,
+ ldc,
+ info_d,
+ 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
}
hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d);
-
+
CHECK_SYNC("hl_matrix_inverse failed");
}
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -232,8 +243,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
int m = (transa == HPPL_OP_N) ? dimM : dimK;
int n = (transa == HPPL_OP_N) ? dimK : dimM;
- hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n,
- alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul_vector(
+ A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
return;
}
@@ -241,8 +252,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
int m = (transb == HPPL_OP_N) ? dimK : dimN;
int n = (transb == HPPL_OP_N) ? dimN : dimK;
hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
- hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n,
- alpha, beta, ldb, 1, 1);
+ hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
return;
}
@@ -251,26 +261,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -278,24 +309,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_mul failed");
}
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {
int lda = (HPPL_OP_N == transa) ? dimK : dimM;
int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
int ldc = dimN;
- hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN,
- dimK, alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul(A_d,
+ transa,
+ B_d,
+ transb,
+ C_d,
+ dimM,
+ dimN,
+ dimK,
+ alpha,
+ beta,
+ lda,
+ ldb,
+ ldc);
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc) {
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -304,21 +357,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
if (HPPL_OP_N == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_T,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else if (HPPL_OP_T == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_N,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -327,10 +388,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
CHECK_SYNC("hl_matrix_mul_vector");
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta) {
- hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN,
- alpha, beta, dimN, 1, 1);
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta) {
+ hl_matrix_mul_vector(
+ A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
}
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 92b28e4345c3d4d306e6ee2a7f9f50189454f951..9d4ff08a78d641896e946e9bf04590d4ba93350f 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include "hl_cuda_cudnn.h"
@@ -22,9 +21,10 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
- "Specify cuDNN max workspace limit, in units MB, "
- "4096MB=4GB by default.");
+P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+ 4096,
+ "Specify cuDNN max workspace limit, in units MB, "
+ "4096MB=4GB by default.");
namespace dynload {
@@ -41,16 +41,15 @@ void* cudnn_dso_handle = nullptr;
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudnn_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
- &cudnn_dso_handle); \
- void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
- return reinterpret_cast(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudnn_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
+ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
+ return reinterpret_cast(p_##__name)(args...); \
+ } \
} __name; /* struct DynLoad__##__name */
#else
@@ -69,6 +68,7 @@ void* cudnn_dso_handle = nullptr;
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
+// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor) \
__macro(cudnnSetTensor4dDescriptorEx) \
@@ -141,58 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#undef CUDNN_DNN_ROUTINE_EACH
-
+// clang-format on
} /* namespace dynload */
/**
* Check build-in cudnn function using glog and it **does not**
* support << operator for more details error info.
*/
-#define CHECK_CUDNN(cudnnFunc) \
- do { \
- cudnnStatus_t cudnnStat = cudnnFunc; \
- CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
- << "Cudnn Error: " \
- << dynload::cudnnGetErrorString(cudnnStat); \
+#define CHECK_CUDNN(cudnnFunc) \
+ do { \
+ cudnnStatus_t cudnnStat = cudnnFunc; \
+ CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
+ << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
} while (0)
bool g_is_libcudnn_init = false;
int g_cudnn_lib_version = 0;
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc)
-{
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
}
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream)
-{
- size_t cudnn_dso_ver = dynload::cudnnGetVersion();
- size_t cudnn_dso_major = cudnn_dso_ver / 1000;
- size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
- // Compare cudnn header version with that of cudnn.so.
- CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
- (cudnn_cuh_major == cudnn_dso_major))
- << "[cudnn init] libcudnn v" << cudnn_dso_major <<
- " with header v" << cudnn_cuh_major << " unmatched!\n"
- << "PaddlePaddle Requirement: "
- << "(header v[2-3] with libcudnn v[2-3]) Or "
- << "(header v4 with libcudnn v4) Or "
- << "(header v5 with libcudnn v5).";
-
- CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
- << "cudnn v5 requires cuda version >= 7.5";
-
- CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
- CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
- g_is_libcudnn_init = true;
- g_cudnn_lib_version = cudnn_dso_ver;
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
+ size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+ size_t cudnn_dso_major = cudnn_dso_ver / 1000;
+ size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
+
+ // Compare cudnn header version with that of cudnn.so.
+ CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
+ (cudnn_cuh_major == cudnn_dso_major))
+ << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
+ << cudnn_cuh_major << " unmatched!\n"
+ << "PaddlePaddle Requirement: "
+ << "(header v[2-3] with libcudnn v[2-3]) Or "
+ << "(header v4 with libcudnn v4) Or "
+ << "(header v5 with libcudnn v5).";
+
+ CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+ << "cudnn v5 requires cuda version >= 7.5";
+
+ CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
+ CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
+
+ g_is_libcudnn_init = true;
+ g_cudnn_lib_version = cudnn_dso_ver;
}
-int hl_get_cudnn_lib_version() {
- return g_cudnn_lib_version;
-}
+int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
void hl_conv_workspace(hl_tensor_descriptor input,
hl_tensor_descriptor output,
@@ -206,94 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t* bwdFilterLimitBytes) {
#if CUDNN_VERSION >= 4000
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
-
- // Specify workspace limit directly
- size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
- // cudnn convolution forward configuration
- cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast(convFwdAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- static_cast(*convFwdAlgo),
- fwdLimitBytes));
-
- // cudnn convolution backward data configuration
- cudnnFilterDescriptor_t bwd_data_filter_desc =
- GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t bwd_data_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bwd_data_grad_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t bwd_data_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast(convBwdDataAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- static_cast(*convBwdDataAlgo),
- bwdDataLimitBytes));
-
- // cudnn convolution backward filter configuration
- cudnnTensorDescriptor_t bwd_filter_src_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t bwd_filter_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t bwd_filter_grad_desc =
- GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
- t_resource.cudnn_handle,
- bwd_filter_src_desc,
- bwd_filter_diff_desc,
- bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast(convBwdFilterAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
- t_resource.cudnn_handle, bwd_filter_src_desc,
- bwd_filter_diff_desc, bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- static_cast(*convBwdFilterAlgo),
- bwdFilterLimitBytes));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+
+ // Specify workspace limit directly
+ size_t memoryLimitBytes =
+ (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+
+ // cudnn convolution forward configuration
+ cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast(convFwdAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ static_cast(*convFwdAlgo),
+ fwdLimitBytes));
+
+ // cudnn convolution backward data configuration
+ cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast(convBwdDataAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ static_cast(*convBwdDataAlgo),
+ bwdDataLimitBytes));
+
+ // cudnn convolution backward filter configuration
+ cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast(convBwdFilterAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ static_cast(*convBwdFilterAlgo),
+ bwdFilterLimitBytes));
#endif
}
@@ -302,78 +294,75 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
int batch_size,
int feature_maps,
int height,
- int width)
-{
- CHECK_NOTNULL(image_desc);
+ int width) {
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- hl_desc->desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- batch_size,
- feature_maps,
- height,
- width));
-
- hl_desc->format = CUDNN_TENSOR_NCHW;
- hl_desc->data_type = data_type;
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
-
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width));
+
+ hl_desc->format = CUDNN_TENSOR_NCHW;
+ hl_desc->data_type = data_type;
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
+
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
- CHECK_NOTNULL(image_desc);
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
- hl_desc->data_type = data_type;
+ hl_desc->data_type = data_type;
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int batch_size,
int feature_maps,
int height,
- int width)
-{
- const int stride_w = 1;
- const int stride_h = width * stride_w;
- const int stride_c = height * stride_h;
- const int stride_n = feature_maps * stride_c;
- return hl_tensor_reshape(image_desc,
- batch_size,
- feature_maps,
- height,
- width,
- stride_n,
- stride_c,
- stride_h,
- stride_w);
+ int width) {
+ const int stride_w = 1;
+ const int stride_h = width * stride_w;
+ const int stride_c = height * stride_h;
+ const int stride_n = feature_maps * stride_c;
+ return hl_tensor_reshape(image_desc,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ stride_n,
+ stride_c,
+ stride_h,
+ stride_w);
}
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -384,45 +373,42 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int nStride,
int cStride,
int hStride,
- int wStride)
-{
- CHECK_NOTNULL(image_desc);
-
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
- hl_desc->data_type,
- batch_size,
- feature_maps,
- height,
- width,
- nStride,
- cStride,
- hStride,
- wStride));
-
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
+ int wStride) {
+ CHECK_NOTNULL(image_desc);
+
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
+ hl_desc->data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ nStride,
+ cStride,
+ hStride,
+ wStride));
+
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
}
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
-{
- CHECK_NOTNULL(image_desc);
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
- CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
- hl_desc->desc = NULL;
+ hl_desc->desc = NULL;
- free(image_desc);
+ free(image_desc);
}
-
void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
hl_pooling_mode_t mode,
int height,
@@ -430,99 +416,93 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
int height_padding,
int width_padding,
int stride_height,
- int stride_width)
-{
- cudnnPoolingMode_t cudnn_mode;
- switch (mode)
- {
- case HL_POOLING_MAX:
- cudnn_mode = CUDNN_POOLING_MAX;
- break;
- case HL_POOLING_AVERAGE:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
- break;
- case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
- break;
- default:
- LOG(FATAL) << "parameter mode error";
- }
-
- CHECK_NOTNULL(pooling_desc);
-
- cudnn_pooling_descriptor hl_pooling_desc =
- (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
- CHECK_NOTNULL(hl_pooling_desc);
-
- CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(
- hl_pooling_desc->desc,
- cudnn_mode,
+ int stride_width) {
+ cudnnPoolingMode_t cudnn_mode;
+ switch (mode) {
+ case HL_POOLING_MAX:
+ cudnn_mode = CUDNN_POOLING_MAX;
+ break;
+ case HL_POOLING_AVERAGE:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+ break;
+ case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+ break;
+ default:
+ LOG(FATAL) << "parameter mode error";
+ }
+
+ CHECK_NOTNULL(pooling_desc);
+
+ cudnn_pooling_descriptor hl_pooling_desc =
+ (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
+ CHECK_NOTNULL(hl_pooling_desc);
+
+ CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
+ cudnn_mode,
#if CUDNN_VERSION >= 5000
- CUDNN_PROPAGATE_NAN,
+ CUDNN_PROPAGATE_NAN,
#endif
- height,
- width,
- height_padding,
- width_padding,
- stride_height,
- stride_width));
-
- hl_pooling_desc->mode = cudnn_mode;
- hl_pooling_desc->window_height = height;
- hl_pooling_desc->window_width = width;
- hl_pooling_desc->stride_height = stride_height;
- hl_pooling_desc->stride_width = stride_width;
-
- *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
+ height,
+ width,
+ height_padding,
+ width_padding,
+ stride_height,
+ stride_width));
+
+ hl_pooling_desc->mode = cudnn_mode;
+ hl_pooling_desc->window_height = height;
+ hl_pooling_desc->window_width = width;
+ hl_pooling_desc->stride_height = stride_height;
+ hl_pooling_desc->stride_width = stride_width;
+
+ *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
}
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
-{
- CHECK_NOTNULL(pooling_desc);
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
+ CHECK_NOTNULL(pooling_desc);
- cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_NOTNULL(hl_pooling->desc);
+ cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
+ CHECK_NOTNULL(hl_pooling->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
- hl_pooling->desc = NULL;
+ hl_pooling->desc = NULL;
- free(pooling_desc);
+ free(pooling_desc);
}
void hl_pooling_forward(hl_tensor_descriptor input,
real* input_image,
hl_tensor_descriptor output,
real* output_image,
- hl_pooling_descriptor pooling)
-{
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(output_image);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingForward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- input_desc,
- input_image,
- &beta,
- output_desc,
- output_image));
- CHECK_SYNC("hl_pooling_forward failed");
+ hl_pooling_descriptor pooling) {
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(output_image);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ input_desc,
+ input_image,
+ &beta,
+ output_desc,
+ output_image));
+ CHECK_SYNC("hl_pooling_forward failed");
}
void hl_pooling_backward(hl_tensor_descriptor input,
@@ -531,94 +511,87 @@ void hl_pooling_backward(hl_tensor_descriptor input,
hl_tensor_descriptor output,
real* output_image,
real* output_image_grad,
- hl_pooling_descriptor pooling)
-{
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(input_image_grad);
- CHECK_NOTNULL(output_image);
- CHECK_NOTNULL(output_image_grad);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingBackward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- output_desc,
- output_image,
- output_desc,
- output_image_grad,
- input_desc,
- input_image,
- &beta,
- input_desc,
- input_image_grad));
+ hl_pooling_descriptor pooling) {
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(input_image_grad);
+ CHECK_NOTNULL(output_image);
+ CHECK_NOTNULL(output_image_grad);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ output_desc,
+ output_image,
+ output_desc,
+ output_image_grad,
+ input_desc,
+ input_image,
+ &beta,
+ input_desc,
+ input_image_grad));
CHECK_SYNC("hl_pooling_backward failed");
}
-
void hl_create_filter_descriptor(hl_filter_descriptor* filter,
int input_feature_maps,
int output_feature_maps,
int height,
- int width)
-{
- CHECK_NOTNULL(filter);
+ int width) {
+ CHECK_NOTNULL(filter);
- cudnn_filter_descriptor hl_filter =
- (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
- CHECK_NOTNULL(hl_filter);
+ cudnn_filter_descriptor hl_filter =
+ (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
+ CHECK_NOTNULL(hl_filter);
- CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
+ CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(
- hl_filter->desc,
- data_type,
+ CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
+ data_type,
#if CUDNN_VERSION >= 5000
- CUDNN_TENSOR_NCHW,
+ CUDNN_TENSOR_NCHW,
#endif
- output_feature_maps,
- input_feature_maps,
- height,
- width));
-
- hl_filter->data_type = data_type;
- hl_filter->output_feature_maps = output_feature_maps;
- hl_filter->input_feature_maps = input_feature_maps;
- hl_filter->filter_height = height;
- hl_filter->filter_width = width;
-
- *filter = (hl_filter_descriptor)hl_filter;
+ output_feature_maps,
+ input_feature_maps,
+ height,
+ width));
+
+ hl_filter->data_type = data_type;
+ hl_filter->output_feature_maps = output_feature_maps;
+ hl_filter->input_feature_maps = input_feature_maps;
+ hl_filter->filter_height = height;
+ hl_filter->filter_width = width;
+
+ *filter = (hl_filter_descriptor)hl_filter;
}
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
+ CHECK_NOTNULL(filter);
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
-{
- CHECK_NOTNULL(filter);
+ cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
+ CHECK_NOTNULL(hl_filter->desc);
- cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
- CHECK_NOTNULL(hl_filter->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
- CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
+ hl_filter->desc = NULL;
- hl_filter->desc = NULL;
-
- free(filter);
+ free(filter);
}
void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
@@ -627,38 +600,36 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
- int stride_width)
-{
- CHECK_NOTNULL(conv);
-
- cudnn_convolution_descriptor hl_conv =
- (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
- CHECK_NOTNULL(hl_conv);
-
- CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- hl_conv->desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
-
- *conv = (hl_convolution_descriptor)hl_conv;
+ int stride_width) {
+ CHECK_NOTNULL(conv);
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+ sizeof(_cudnn_convolution_descriptor));
+
+ CHECK_NOTNULL(hl_conv);
+ CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
+
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
+
+ *conv = (hl_convolution_descriptor)hl_conv;
}
void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
@@ -667,47 +638,44 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
- int stride_width)
-{
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(image);
- CHECK_NOTNULL(filter);
-
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- conv_desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
+ int stride_width) {
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(image);
+ CHECK_NOTNULL(filter);
+
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
}
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
-{
- CHECK_NOTNULL(conv);
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
+ CHECK_NOTNULL(conv);
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- CHECK_NOTNULL(hl_conv->desc);
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ CHECK_NOTNULL(hl_conv->desc);
- CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
- hl_conv->desc = NULL;
+ CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
+ hl_conv->desc = NULL;
- free(conv);
+ free(conv);
}
void hl_convolution_forward(hl_tensor_descriptor input,
@@ -720,87 +688,83 @@ void hl_convolution_forward(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convFwdAlgo) {
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_data);
- CHECK_NOTNULL(filter_data);
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- real alpha = 1.0f;
- real beta = 1.0f;
- CHECK_CUDNN(dynload::cudnnConvolutionForward(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- filter_desc,
- filter_data,
- conv_desc,
- static_cast(convFwdAlgo),
- gpuWorkSpace,
- sizeInBytes,
- &beta,
- dest_desc,
- output_data));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_data);
+ CHECK_NOTNULL(filter_data);
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ CHECK_CUDNN(dynload::cudnnConvolutionForward(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ filter_desc,
+ filter_data,
+ conv_desc,
+ static_cast(convFwdAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
+ &beta,
+ dest_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward failed");
}
void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
real* bias_data,
hl_tensor_descriptor output,
- real* output_data)
-{
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_data);
- CHECK_NOTNULL(output_data);
-
- cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- real alpha = 1.0f;
- real beta = 1.0f;
-
- CHECK_CUDNN(dynload::cudnnAddTensor(
- t_resource.cudnn_handle,
+ real* output_data) {
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_data);
+ CHECK_NOTNULL(output_data);
+
+ cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+
+ CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
#if CUDNN_VERSION < 4000
- CUDNN_ADD_SAME_C,
+ CUDNN_ADD_SAME_C,
#endif
- &alpha,
- bias_desc,
- bias_data,
- &beta,
- output_desc,
- output_data));
+ &alpha,
+ bias_desc,
+ bias_data,
+ &beta,
+ output_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward_add_bias failed");
}
void hl_convolution_backward_bias(hl_tensor_descriptor bias,
real* bias_grad_data,
hl_tensor_descriptor output,
- real* output_grad_data)
-{
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_grad_data);
- CHECK_NOTNULL(output_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(
- t_resource.cudnn_handle,
- &alpha,
- diff_desc,
- output_grad_data,
- &beta,
- bias_desc,
- bias_grad_data));
+ real* output_grad_data) {
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_grad_data);
+ CHECK_NOTNULL(output_grad_data);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
+ &alpha,
+ diff_desc,
+ output_grad_data,
+ &beta,
+ bias_desc,
+ bias_grad_data));
CHECK_SYNC("hl_convolution_backward_bias failed");
}
@@ -814,38 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdFilterAlgo) {
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_grad_data);
+ CHECK_NOTNULL(filter_grad_data);
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_grad_data);
- CHECK_NOTNULL(filter_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast(convBwdFilterAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast(convBwdFilterAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- filter_grad_data));
+ &beta,
+ grad_desc,
+ filter_grad_data));
CHECK_SYNC("hl_convolution_backward_filter failed");
}
@@ -859,121 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdDataAlgo) {
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
- t_resource.cudnn_handle,
- &alpha,
- filter_desc,
- filter_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
+ t_resource.cudnn_handle,
+ &alpha,
+ filter_desc,
+ filter_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast(convBwdDataAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast(convBwdDataAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- input_data_grad));
+ &beta,
+ grad_desc,
+ input_data_grad));
CHECK_SYNC("hl_convolution_backward_data failed");
}
-
-void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width)
-{
+void hl_softmax_forward(real* input, real* output, int height, int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxForward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- input,
- &beta,
- t_resource.cudnn_desc,
- output));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ input,
+ &beta,
+ t_resource.cudnn_desc,
+ output));
CHECK_SYNC("hl_softmax_forward failed");
}
-void hl_softmax_backward(real *output_value,
- real *output_grad,
+void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
- int width)
-{
+ int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxBackward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- output_value,
- t_resource.cudnn_desc,
- output_grad,
- &beta,
- t_resource.cudnn_desc,
- output_grad));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ output_value,
+ t_resource.cudnn_desc,
+ output_grad,
+ &beta,
+ t_resource.cudnn_desc,
+ output_grad));
CHECK_SYNC("hl_softmax_backward failed");
}
void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {
+ real* savedMean,
+ real* savedVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != runningMean && NULL == runningInvVar) ||
(NULL == runningMean && NULL != runningInvVar)) {
LOG(FATAL) << "runningMean and runningInvVar can be NULL "
- << "but only at the same time.";
+ << "but only at the same time.";
}
if ((NULL != savedMean && NULL == savedVar) ||
(NULL == savedMean && NULL != savedVar)) {
@@ -987,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias, factor,
- runningMean, runningInvVar, epsilon, savedMean, savedVar));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ factor,
+ runningMean,
+ runningInvVar,
+ epsilon,
+ savedMean,
+ savedVar));
CHECK_SYNC("hl_batch_norm_forward_training failed");
#else
@@ -1000,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
- hl_tensor_descriptor outputDesc,
- real *output,
- hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedInvVar,
- double epsilon) {
+ real* input,
+ hl_tensor_descriptor outputDesc,
+ real* output,
+ hl_tensor_descriptor bnParamDesc,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedInvVar,
+ double epsilon) {
#if CUDNN_VERSION >= 4007
cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
@@ -1016,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias,
- estimatedMean, estimatedInvVar, epsilon));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ estimatedMean,
+ estimatedInvVar,
+ epsilon));
CHECK_SYNC("hl_batch_norm_forward_inference failed");
#else
@@ -1029,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {
+ real* savedMean,
+ real* savedInvVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != savedMean && NULL == savedInvVar) ||
(NULL == savedMean && NULL != savedInvVar)) {
@@ -1055,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
- t_resource.cudnn_handle, mode, &alpha, &beta,
- &alpha, &beta,
- xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
- bnDesc, scale, scaleGrad, biasGrad, epsilon,
- savedMean, savedInvVar));
+ CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ dyDesc,
+ outGrad,
+ dxDesc,
+ inGrad,
+ bnDesc,
+ scale,
+ scaleGrad,
+ biasGrad,
+ epsilon,
+ savedMean,
+ savedInvVar));
CHECK_SYNC("hl_batch_norm_backward failed");
#else
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index 3ea2c91bd5a41e0cd6ece0605a25e645676faa40..6b71a538485a09cf40a53eddf1ee2f3e2c768b2c 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-#include
+#include
#include
-#include
#include
+#include
+#include
#include
#include "hl_cuda.h"
#include "hl_cuda.ph"
-#include "hl_thread.ph"
#include "hl_dso_loader.h"
+#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
namespace dynload {
std::once_flag curand_dso_flag;
-void* curand_dso_handle = nullptr;
+void *curand_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -37,34 +37,35 @@ void* curand_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- curandStatus_t operator()(Args... args) { \
- typedef curandStatus_t (*curandFunc)(Args...); \
- std::call_once(curand_dso_flag, GetCurandDsoHandle, \
- &curand_dso_handle); \
- void* p_##__name = dlsym(curand_dso_handle, #__name); \
- return reinterpret_cast(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template \
+ curandStatus_t operator()(Args... args) { \
+ typedef curandStatus_t (*curandFunc)(Args...); \
+ std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+ void *p_##__name = dlsym(curand_dso_handle, #__name); \
+ return reinterpret_cast(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- curandStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template \
+ curandStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed curand functions in HPPL */
+// clang-format off
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator) \
__macro(curandSetStream) \
__macro(curandSetPseudoRandomGeneratorSeed)\
__macro(curandGenerateUniform) \
__macro(curandGenerateUniformDouble)
+// clang-format on
CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
@@ -72,7 +73,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef DYNAMIC_LOAD_CURAND_WRAP
std::once_flag cudart_dso_flag;
-void* cudart_dso_handle = nullptr;
+void *cudart_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -82,28 +83,28 @@ void* cudart_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudart_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudart_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
__macro(cudaHostAlloc) \
@@ -133,58 +134,60 @@ void* cudart_dso_handle = nullptr;
__macro(cudaGetLastError) \
__macro(cudaFuncSetCacheConfig) \
__macro(cudaRuntimeGetVersion) \
- __macro(cudaGetErrorString)
+ __macro(cudaGetErrorString) \
+ __macro(cudaProfilerStart) \
+ __macro(cudaProfilerStop)
+// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#undef CUDA_ROUNTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
-} /* namespace dynload */
+} /* namespace dynload */
/**
* @brief global resource.
*/
-int g_system_device_num = 0; /* system device number */
-int device_num = 0; /* use device number */
-hl_device_prop *g_device; /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
+int g_system_device_num = 0; /* system device number */
+int device_num = 0; /* use device number */
+hl_device_prop *g_device; /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
int g_cuda_lib_version = 0;
/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256*4)
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
/**
* Check build-in cuda function using glog and it **does not**
* support << operator for more details error info.
*/
-#define CHECK_CUDA(cudaFunc) \
- do { \
- cudaError_t cudaStat = cudaFunc; \
- CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
- << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc) \
+ do { \
+ cudaError_t cudaStat = cudaFunc; \
+ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
+ << dynload::cudaGetErrorString(cudaStat); \
} while (0)
/**
* @brief thread resource.
*/
-__thread _hl_thread_resource t_resource = {
- {0}, /* stream */
- 0, /* handle */
- 0, /* gen */
- 0, /* cudnn_handle */
- 0, /* cudnn_desc */
- NULL, /* gen_mutex */
- NULL, /* gpu_mem */
- NULL, /* cpu_mem */
- 0, /* event */
- -1, /* device */
- 0, /* major */
- false}; /* is_init */
+__thread _hl_thread_resource t_resource = {{0}, /* stream */
+ 0, /* handle */
+ 0, /* gen */
+ 0, /* cudnn_handle */
+ 0, /* cudnn_desc */
+ NULL, /* gen_mutex */
+ NULL, /* gpu_mem */
+ NULL, /* cpu_mem */
+ 0, /* event */
+ -1, /* device */
+ 0, /* major */
+ false}; /* is_init */
__thread cudaStream_t default_stream = 0;
__thread bool g_sync_flag = true;
@@ -198,18 +201,17 @@ inline pid_t gettid() {
uint64_t tid;
pthread_threadid_np(NULL, &tid);
#else
- #ifndef __NR_gettid
- #define __NR_gettid 224
- #endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
pid_t tid = syscall(__NR_gettid);
#endif
- CHECK_NE(tid, -1);
- return tid;
+ CHECK_NE((int)tid, -1);
+ return tid;
}
void hl_init(int device) {
- CHECK(hl_start_flag)
- << "[Init failed] hl_start() did not succeed.";
+ CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
/* thread has been initialized */
if (true == t_resource.is_init) {
@@ -220,16 +222,16 @@ void hl_init(int device) {
/* create thread devcie resources */
char *tmp;
thread_device_resources device_res;
- tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) +
- device_num*sizeof(_thread_device_resources));
+ tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+ device_num * sizeof(_thread_device_resources));
CHECK_NOTNULL(tmp);
- t_device = (thread_device_resources*)tmp;
- device_res = (thread_device_resources)((char*)tmp +
- g_system_device_num*sizeof(thread_device_resources*));
- memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*));
+ t_device = (thread_device_resources *)tmp;
+ device_res = (thread_device_resources)(
+ (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+ memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
- char *tmp_stream = (char *)
- malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
int num = 0;
@@ -239,8 +241,9 @@ void hl_init(int device) {
}
t_device[dev] = &device_res[num];
- t_device[dev]->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ t_device[dev]->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
hl_create_thread_resources(dev, t_device[dev]);
num++;
@@ -266,14 +269,14 @@ void hl_fini() {
t_resource.stream[i] = 0;
}
- char* tmp = (char*)t_device;
- char* tmp_stream = NULL;
+ char *tmp = (char *)t_device;
+ char *tmp_stream = NULL;
for (int dev = 0; dev < g_system_device_num; dev++) {
if (!t_device[dev]) {
continue;
}
if (!tmp_stream) {
- tmp_stream = (char*)t_device[dev]->stream;
+ tmp_stream = (char *)t_device[dev]->stream;
}
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
@@ -290,9 +293,7 @@ void hl_fini() {
t_resource.is_init = false;
}
-int hl_get_device_count() {
- return device_num;
-}
+int hl_get_device_count() { return device_num; }
void hl_set_device(int device) {
if (device == t_resource.device) {
@@ -300,7 +301,7 @@ void hl_set_device(int device) {
}
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device: " << device << " is not specified in startup.";
+ << "Device: " << device << " is not specified in startup.";
CHECK_CUDA(dynload::cudaSetDevice(device));
@@ -312,11 +313,11 @@ void hl_set_device(int device) {
if (true == t_resource.is_init) {
for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
t_resource.stream[i] =
- t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
+ t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
}
t_resource.gpu_mem = t_device[device]->gpu_mem;
t_resource.cpu_mem = t_device[device]->cpu_mem;
- t_resource.event = t_device[device]->mem_event;
+ t_resource.event = t_device[device]->mem_event;
}
t_resource.handle = g_device[device]->device_resources->handle;
@@ -334,11 +335,11 @@ int hl_get_device() {
return device;
}
-void* hl_malloc_device(size_t size) {
+void *hl_malloc_device(size_t size) {
void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size));
+ CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
return dest_d;
}
@@ -348,14 +349,15 @@ void hl_free_mem_device(void *dest_d) {
cudaError_t err = dynload::cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ << hl_get_device_error_string();
}
-void* hl_malloc_host(size_t size) {
+void *hl_malloc_host(size_t size) {
void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
+ CHECK_CUDA(
+ dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h;
}
@@ -364,8 +366,8 @@ void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h);
- CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
+ << hl_get_device_error_string();
}
void hl_memcpy(void *dst, void *src, size_t size) {
@@ -387,8 +389,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
}
CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size,
- cudaMemcpyHostToDevice));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
}
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -397,8 +398,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size,
- cudaMemcpyDeviceToHost));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
}
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -407,8 +407,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size,
- cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(
+ dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
}
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -422,8 +422,8 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault,
- cu_stream));
+ CHECK_CUDA(
+ dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
}
void hl_start() {
@@ -434,8 +434,8 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer;
- CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device,
- peerDevice));
+ CHECK_CUDA(
+ dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) {
return true;
@@ -477,32 +477,32 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create curand gen */
CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
- CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand init failed.";
+ CURAND_RNG_PSEUDO_DEFAULT),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand init failed.";
- CHECK_EQ(dynload::curandSetStream(device_res->gen,
- device_res->stream[0]), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand set stream failed!";
+ CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand set stream failed!";
/* create cudnn handle */
hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
int seed = gettid();
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- device_res->gen, seed+device), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
+ seed + device),
+ CURAND_STATUS_SUCCESS);
- device_res->gen_mutex =
- (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
+ device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL);
CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
}
-int hl_get_cuda_version() {
- return g_cuda_lib_version;
-}
+int hl_get_cuda_version() { return g_cuda_lib_version; }
-void hl_create_thread_resources(int device, thread_device_resources device_res) {
+void hl_create_thread_resources(int device,
+ thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device));
/* create thread stream */
@@ -511,15 +511,15 @@ void hl_create_thread_resources(int device, thread_device_resources device_res)
}
/* allocation device memory */
- device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
+ device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
/* allocation host memory */
- device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
+ device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
}
-void hl_specify_devices_start(int* device, int number) {
+void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return;
/* 1. get the number of devices */
@@ -531,20 +531,19 @@ void hl_specify_devices_start(int* device, int number) {
/* 2. check device & create device property table */
CHECK_LE(number, g_system_device_num)
- << "[Start failed] System does not have enough device. "
- << "Device number: " << g_system_device_num
- << "Input number: " << number;
+ << "[Start failed] System does not have enough device. "
+ << "Device number: " << g_system_device_num << "Input number: " << number;
char *tmp;
hl_device_prop device_prop;
- tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) +
- number*sizeof(_hl_device_prop));
+ tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
+ number * sizeof(_hl_device_prop));
CHECK(tmp) << "[Start failed] System memory is not enough.";
- g_device = (hl_device_prop*)tmp;
- device_prop = (hl_device_prop)((char*)tmp +
- g_system_device_num*sizeof(hl_device_prop*));
- memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*));
+ g_device = (hl_device_prop *)tmp;
+ device_prop = (hl_device_prop)(
+ (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+ memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
int num = 0;
for (int i = 0; i < number; i++) {
int dev;
@@ -555,13 +554,13 @@ void hl_specify_devices_start(int* device, int number) {
}
CHECK_LT(dev, g_system_device_num)
- << "[Start failed] The specified device number is "
- << "out of range. Max device number: " << g_system_device_num - 1
- << " Specified devcie number: "<< dev;
+ << "[Start failed] The specified device number is "
+ << "out of range. Max device number: " << g_system_device_num - 1
+ << " Specified devcie number: " << dev;
if (g_device[dev]) {
/* Warning */
- LOG(WARNING) <<"[Warning] Repeat specify device: " << dev;
+ LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
continue;
}
@@ -572,11 +571,11 @@ void hl_specify_devices_start(int* device, int number) {
device_num = num;
/* 3. create global device resources */
- char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources));
+ char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
CHECK_NOTNULL(tmp_res);
- char *tmp_stream =
- (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
num = 0;
@@ -585,10 +584,11 @@ void hl_specify_devices_start(int* device, int number) {
continue;
}
- g_device[i]->device_resources = (global_device_resources)(tmp_res +
- num*sizeof(_global_device_resources));
- g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ g_device[i]->device_resources = (global_device_resources)(
+ tmp_res + num * sizeof(_global_device_resources));
+ g_device[i]->device_resources->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
hl_create_global_resources(g_device[i]);
num++;
@@ -598,9 +598,9 @@ void hl_specify_devices_start(int* device, int number) {
hl_start_flag = true;
/* set default device */
if (device == NULL) {
- hl_set_device(0);
+ hl_set_device(0);
} else {
- hl_set_device(device[0]);
+ hl_set_device(device[0]);
}
}
@@ -608,35 +608,31 @@ void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ(
#ifndef PADDLE_TYPE_DOUBLE
- dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else
- dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
#endif
- CURAND_STATUS_SUCCESS);
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
CHECK_SYNC("hl_rand failed");
}
void hl_srand(unsigned int seed) {
pthread_mutex_lock(t_resource.gen_mutex);
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- t_resource.gen, seed), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
}
-void hl_set_sync_flag(bool flag) {
- g_sync_flag = flag;
-}
+void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-bool hl_get_sync_flag() {
- return g_sync_flag;
-}
+bool hl_get_sync_flag() { return g_sync_flag; }
void hl_stream_synchronize(hl_stream_t stream) {
cudaStream_t cu_stream;
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
@@ -645,8 +641,8 @@ void hl_stream_synchronize(hl_stream_t stream) {
void hl_create_event(hl_event_t *event) {
CHECK_NOTNULL(event);
- struct _hl_event_st* st_event =
- (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st));
+ struct _hl_event_st *st_event =
+ (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
@@ -658,8 +654,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start);
CHECK_NOTNULL(end);
- CHECK_CUDA(dynload::cudaEventElapsedTime(&time,
- start->cu_event, end->cu_event));
+ CHECK_CUDA(
+ dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time;
}
@@ -667,24 +663,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaEventRecord(
- event->cu_event, cu_stream));
+ CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
}
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaStreamWaitEvent(
- cu_stream, event->cu_event, 0));
+ CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
}
void hl_destroy_event(hl_event_t event) {
@@ -703,15 +697,15 @@ void hl_event_synchronize(hl_event_t event) {
void hl_get_device_name(char *name, int len, int device) {
CHECK_NOTNULL(name);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
- strncpy(name, g_device[device]->device_name , len);
+ strncpy(name, g_device[device]->device_name, len);
}
void hl_get_device_memory(size_t *mem_size, int device) {
CHECK_NOTNULL(mem_size);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*mem_size = g_device[device]->device_mem;
}
@@ -720,31 +714,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
CHECK_NOTNULL(major);
CHECK_NOTNULL(minor);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device << ") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*major = g_device[device]->major;
*minor = g_device[device]->minor;
}
-int hl_get_device_last_error() {
- return (int)dynload::cudaGetLastError();
-}
+int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
-const char* hl_get_device_error_string() {
+const char *hl_get_device_error_string() {
cudaError_t err = dynload::cudaGetLastError();
return dynload::cudaGetErrorString(err);
}
-const char* hl_get_device_error_string(size_t err) {
+const char *hl_get_device_error_string(size_t err) {
return dynload::cudaGetErrorString((cudaError_t)err);
}
-void hl_device_synchronize() {
- CHECK_CUDA(dynload::cudaDeviceSynchronize());
-}
+void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
void hl_set_device_flags_block() {
- CHECK_CUDA(dynload::cudaSetDeviceFlags(
- cudaDeviceScheduleBlockingSync));
+ CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
}
bool hl_cuda_event_is_ready(hl_event_t event) {
@@ -756,3 +745,7 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
}
return true;
}
+
+void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+
+void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index 27bbd03bc328293d978867c6badddc13a754ece2..ff6b830b7addc5c87af0d55070260c279a046a75 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifdef PADDLE_USE_DSO
#include
@@ -29,26 +28,26 @@ limitations under the License. */
namespace dynload {
extern std::once_flag cudart_dso_flag;
-extern void* cudart_dso_handle;
+extern void *cudart_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cuda routine
* via operator overloading.
**/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
- struct DynLoad__##__name { \
- template \
- __type operator()(Args... args) { \
- typedef __type (*cudartFunc)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
+ struct DynLoad__##__name { \
+ template \
+ __type operator()(Args... args) { \
+ typedef __type (*cudartFunc)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaLaunch, cudaError_t) \
__macro(cudaSetupArgument, cudaError_t) \
@@ -61,16 +60,17 @@ extern void* cudart_dso_handle;
__macro(__cudaInitModule, char) \
__macro(__cudaRegisterTexture, void) \
__macro(__cudaRegisterSurface, void)
+// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#if CUDART_VERSION >= 7000
- DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
+DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif
#undef CUDA_ROUNTINE_EACH
-} /* namespace dynload */
+} /* namespace dynload */
#if CUDART_VERSION >= 7000
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
@@ -78,131 +78,120 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
dim3 blockDim,
void **args,
size_t sharedMem,
- cudaStream_t stream)
-{
- return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
+ cudaStream_t stream) {
+ return dynload::cudaLaunchKernel(
+ func, gridDim, blockDim, args, sharedMem, stream);
}
#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
-{
+__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
return dynload::cudaLaunch(func);
}
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
size_t size,
- size_t offset)
-{
+ size_t offset) {
return dynload::cudaSetupArgument(arg, size, offset);
}
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim,
size_t sharedMem,
- cudaStream_t stream)
-{
- return dynload::cudaConfigureCall(gridDim, blockDim,
- sharedMem, stream);
+ cudaStream_t stream) {
+ return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
}
extern "C" {
-void** CUDARTAPI __cudaRegisterFatBinary(
- void *fatCubin
-)
-{
+void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
return dynload::__cudaRegisterFatBinary(fatCubin);
-
}
-void CUDARTAPI __cudaUnregisterFatBinary(
- void **fatCubinHandle
-)
-{
+void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterFunction(
- void **fatCubinHandle,
- const char *hostFun,
- char *deviceFun,
- const char *deviceName,
- int thread_limit,
- uint3 *tid,
- uint3 *bid,
- dim3 *bDim,
- dim3 *gDim,
- int *wSize
-) {
- return dynload::__cudaRegisterFunction(
- fatCubinHandle, hostFun, deviceFun, deviceName,
- thread_limit, tid, bid, bDim, gDim, wSize);
+void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
+ const char *hostFun,
+ char *deviceFun,
+ const char *deviceName,
+ int thread_limit,
+ uint3 *tid,
+ uint3 *bid,
+ dim3 *bDim,
+ dim3 *gDim,
+ int *wSize) {
+ return dynload::__cudaRegisterFunction(fatCubinHandle,
+ hostFun,
+ deviceFun,
+ deviceName,
+ thread_limit,
+ tid,
+ bid,
+ bDim,
+ gDim,
+ wSize);
}
-void CUDARTAPI __cudaRegisterVar(
- void **fatCubinHandle,
- char *hostVar,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterVar(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, ext, size, constant, global);
+void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
+ char *hostVar,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterVar(fatCubinHandle,
+ hostVar,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-
-
-extern void CUDARTAPI __cudaRegisterManagedVar(
- void **fatCubinHandle,
- void **hostVarPtrAddress,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterManagedVar(
- fatCubinHandle, hostVarPtrAddress, deviceAddress,
- deviceName, ext, size, constant, global);
+extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
+ void **hostVarPtrAddress,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterManagedVar(fatCubinHandle,
+ hostVarPtrAddress,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-char CUDARTAPI __cudaInitModule(
- void **fatCubinHandle
-) {
+char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
return dynload::__cudaInitModule(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterTexture(
- void **fatCubinHandle,
- const struct textureReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int norm,
- int ext
-) {
+void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
+ const struct textureReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int norm,
+ int ext) {
return dynload::__cudaRegisterTexture(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, norm, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
}
-void CUDARTAPI __cudaRegisterSurface(
- void **fatCubinHandle,
- const struct surfaceReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int ext
-) {
+void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
+ const struct surfaceReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int ext) {
return dynload::__cudaRegisterSurface(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
}
} /* extern "C" */
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index b564b969033680a001577de25ceb84dae391754a..1a3ce08619fc3a5787576b30e9f4c13336990e74 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -12,27 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
-P_DEFINE_string(cudnn_dir, "",
+P_DEFINE_string(cudnn_dir,
+ "",
"Specify path for loading libcudnn.so. For instance, "
- "/usr/local/cudnn/lib64. If empty [default], dlopen will search "
- "cudnn from LD_LIBRARY_PATH");
+ "/usr/local/cudnn/lib. If empty [default], dlopen "
+ "will search cudnn from LD_LIBRARY_PATH");
-P_DEFINE_string(cuda_dir, "",
+P_DEFINE_string(cuda_dir,
+ "",
"Specify path for loading cuda library, such as libcublas, "
- "libcurand. For instance, /usr/local/cuda/lib64. "
- "(Note: libcudart can not be specified by cuda_dir, since some "
+ "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+ "libcudart can not be specified by cuda_dir, since some "
"build-in function in cudart already ran before main entry). "
- "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
+ "If default, dlopen will search cuda from LD_LIBRARY_PATH");
-static inline std::string join(const std::string& part1, const std::string& part2) {
+static inline std::string join(const std::string& part1,
+ const std::string& part2) {
// directory separator
const char sep = '/';
-
if (!part2.empty() && part2.front() == sep) {
return part2;
}
@@ -46,100 +47,115 @@ static inline std::string join(const std::string& part1, const std::string& part
return ret;
}
-static inline void GetDsoHandleFromDefaultPath(
- std::string& dso_path, void** dso_handle, int dynload_flags) {
- VLOG(3) << "Try to find cuda library: " << dso_path
- << " from default system path.";
- // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+ void** dso_handle,
+ int dynload_flags) {
+ VLOG(3) << "Try to find cuda library: " << dso_path
+ << " from default system path.";
+ // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+ *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 due to
+// System Integrity Protection (SIP); if dso_handle is
+// null, search from the default package path in Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+ if (nullptr == *dso_handle) {
+ dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
- // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
- // bring System Integrity Projection (SIP), if dso_handle
- // is null, search from default package path in Mac OS.
- #if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
- dso_path = join("/usr/local/cuda/lib/", dso_path);
- *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
- if (nullptr == *dso_handle) {
- if (dso_path == "libcudnn.dylib") {
- LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
- << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
- << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
- << "/usr/local/cuda/lib/libcudnn*";
- }
- }
- }
- #endif
+ if (dso_path == "libcudnn.dylib") {
+ LOG(FATAL)
+ << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
+ << "For instance, sudo tar -xzf "
+ "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
+ << "/usr/local \n sudo chmod a+r "
+ "/usr/local/cuda/include/cudnn.h " // NOLINT
+ << "/usr/local/cuda/lib/libcudnn*";
+ }
+ }
+ }
+#endif
}
-static inline void GetDsoHandleFromSearchPath(
- const std::string& search_root,
- const std::string& dso_name,
- void** dso_handle) {
- int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
- *dso_handle = nullptr;
-
- std::string dlPath = dso_name;
- if (search_root.empty()) {
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- } else {
- // search xxx.so from custom path
- dlPath = join(search_root, dso_name);
- *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
- // if not found, search from default path
- if (nullptr == dso_handle) {
- LOG(WARNING) << "Failed to find cuda library: " << dlPath;
- dlPath = dso_name;
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- }
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+ const std::string& dso_name,
+ void** dso_handle) {
+ int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+ *dso_handle = nullptr;
+
+ std::string dlPath = dso_name;
+ if (search_root.empty()) {
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+ } else {
+ // search xxx.so from custom path
+ dlPath = join(search_root, dso_name);
+ *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+ // if not found, search from default path
+ if (nullptr == *dso_handle) {
+ LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+ dlPath = dso_name;
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
+ }
- CHECK(nullptr != *dso_handle)
- << "Failed to find cuda library: " << dlPath << std::endl
- << "Please specify its path correctly using one of the following ideas: \n"
-
- << "Idea 1. set cuda and cudnn lib path at runtime. "
- << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
- << "For instance, issue command: paddle train --use_gpu=1 "
- << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
-
- << "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
- << "DYLD_LIBRARY_PATH on Mac OS. \n"
- << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
-
- << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
- << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
- << "always work well.";
+ CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
+ << std::endl
+ << "Please specify its path correctly using "
+ "one of the following ways: \n" // NOLINT
+
+ << "Method 1. set cuda and cudnn lib path at "
+ "runtime. "
+ << "http://www.paddlepaddle.org/doc/ui/"
+ "cmd_argument/"
+ "argument_outline.html \n" // NOLINT
+ << "For instance, issue command: paddle train "
+ "--use_gpu=1 "
+ << "--cuda_dir=/usr/local/cuda/lib64 "
+ "--cudnn_dir=/usr/local/cudnn/lib "
+ "...\n" // NOLINT
+
+ << "Method 2. set environment variable "
+ "LD_LIBRARY_PATH on Linux or "
+ << "DYLD_LIBRARY_PATH on Mac OS. \n"
+ << "For instance, issue command: export "
+ "LD_LIBRARY_PATH=... \n"
+
+ << "Note: After Mac OS 10.11, using the "
+ "DYLD_LIBRARY_PATH is impossible "
+ << "unless System Integrity Protection (SIP) "
+ "is disabled. However, "
+ "method 1 " // NOLINT
+ << "always work well.";
}
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc
index 76d48c4a9b94d402cf84c57bd240e03a1a83b1a0..f4bf888bab4e92dd940714ef1b7aeee9242eb817 100644
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
@@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "avx_mathfun.h"
namespace hppl {
-__m256 exp(__m256 a) {
- return exp256_ps(a);
-}
+__m256 exp(__m256 a) { return exp256_ps(a); }
-__m256 log(__m256 a) {
- return log256_ps(a);
-}
+__m256 log(__m256 a) { return log256_ps(a); }
-__m256 sin(__m256 a) {
- return sin256_ps(a);
-}
+__m256 sin(__m256 a) { return sin256_ps(a); }
-__m256 cos(__m256 a) {
- return cos256_ps(a);
-}
+__m256 cos(__m256 a) { return cos256_ps(a); }
} // namespace hppl
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index adc88d60dd8d547cedcae5fd088b2fa581d8e5be..d52b2a1df07374f632def12eb52e10e10ca86028 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -21,8 +20,7 @@ limitations under the License. */
using std::chrono::high_resolution_clock;
int64_t getCurrentTimeStick() {
- high_resolution_clock::time_point tp = high_resolution_clock::now();
- high_resolution_clock::duration dtn = tp.time_since_epoch();
- return dtn.count();
+ high_resolution_clock::time_point tp = high_resolution_clock::now();
+ high_resolution_clock::duration dtn = tp.time_since_epoch();
+ return dtn.count();
}
-
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 27eed75d4d76c351e381a3b71dc44a3254fb1a4d..f1bb94216c44b3e915f87a3ae49bdfd3ef812916 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -51,12 +51,14 @@ static ClassRegistrar gActivationRegistrar;
* @brief Macro for registering a derived activation class
*/
#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \
- }; \
+ } \
+ ; \
const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
#ACTIVATION_NAME; \
static InitFunction __reg_activation__##ACTIVATION_NAME([] { \
- gActivationRegistrar.registerClass< \
- ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \
+ gActivationRegistrar \
+ .registerClass( \
+ #ACTIVATION_NAME); \
});
/**
@@ -111,14 +113,22 @@ void backward(Argument& act) {
outputG->softmaxBackward(*outputV);
} else {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(),
+ Matrix::resizeOrCreate(sftMaxDot_,
+ outputG->getHeight(),
outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
- Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1,
- /* trans */ false, useGpu(act.deviceId));
+ /* trans */ false,
+ useGpu(act.deviceId));
+ Matrix::resizeOrCreate(sftMaxSum_,
+ outputG->getHeight(),
+ 1,
+ /* trans */ false,
+ useGpu(act.deviceId));
if (!one_ || one_->getWidth() != outputG->getWidth()) {
- Matrix::resizeOrCreate(one_, 1, outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(one_,
+ 1,
+ outputG->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
one_->one();
}
@@ -130,7 +140,6 @@ void backward(Argument& act) {
}
END_DEFINE_ACTIVATION(softmax)
-
/**
* @brief Sequence_softmax Activation
* @note Softmax on all frames of one sequence.
@@ -146,10 +155,16 @@ void forward(Argument& act) {
CHECK_EQ(act.value->getWidth(), 1UL);
if (!argument_.value) {
- argument_.value = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
- argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
+ argument_.value = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
+ argument_.grad = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
}
auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
@@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu)
BEGIN_DEFINE_ACTIVATION(abs)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->abs(*act.value);
@@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs)
BEGIN_DEFINE_ACTIVATION(square)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->square(*act.value);
@@ -317,8 +338,11 @@ END_DEFINE_ACTIVATION(exponential)
BEGIN_DEFINE_ACTIVATION(log)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
@@ -333,11 +357,9 @@ ActivationFunction* ActivationFunction::create(const std::string& type) {
std::vector ActivationFunction::getAllRegisteredTypes() {
std::vector types;
- gActivationRegistrar.forEachType([&](const std::string& type) {
- types.push_back(type);
- });
+ gActivationRegistrar.forEachType(
+ [&](const std::string& type) { types.push_back(type); });
return types;
}
-
} // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index c483372256c035e39bfdbcaa4193a1a2e7fd80b8..e9ed5c619ab5e4dd9c52c0dac24478c2a57aa1bf 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
#include
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 2cfb5a3a18c8a63d69bf0598eeee2807376340bc..e6cc4a246a8494d287f8638674f4ae213f38f657 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "DataProvider.h"
#include "paddle/utils/Util.h"
@@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
}
}
-DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
bool useGpu,
int64_t batchSize) {
batchSize_ = batchSize;
@@ -155,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() {
}
ClassRegistrar
-DataProvider::registrar_;
+ DataProvider::registrar_;
DataProvider* DataProvider::create(const DataConfig& config,
const ModelConfig& modelConfig,
@@ -182,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
for (int i = 0; i < config_.constant_slots_size(); ++i) {
MemoryHandlePtr handle =
constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
- Matrix::resizeOrCreate(constantSlots[i], batchSize,
+ Matrix::resizeOrCreate(constantSlots[i],
+ batchSize,
1, // = width
false, // = trans
useGpu_); // = useGpu
@@ -216,7 +216,8 @@ void DataProvider::initAsyncLoader() {
}
SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
- bool useGpu, bool withInfo)
+ bool useGpu,
+ bool withInfo)
: DataProvider(config, useGpu) {
/* initialize the size of a sample, and the buffer */
sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
@@ -337,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() {
sampleNumInBuf_ =
n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
hInputLabelBuf_->getData() + n,
- hInputInfoBuf_->getData() + n, bufferCapacity_ - n);
+ hInputInfoBuf_->getData() + n,
+ bufferCapacity_ - n);
/* for stachastic gradient training */
if (!skipShuffle_) {
@@ -357,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
SimpleDataProvider::~SimpleDataProvider() {}
-int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info,
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) {
(void)info;
int64_t n = std::min(labels_.size() - currentSampleIndex_, size);
- memcpy(data, &data_[currentSampleIndex_ * sampleDim_],
+ memcpy(data,
+ &data_[currentSampleIndex_ * sampleDim_],
n * sampleDim_ * sizeof(real));
memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
currentSampleIndex_ += n;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 112e45de1cb232097ed63b120d5ac631b37952e9..8b7fb27f821a47d830413eced79b3352a6969c90 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -44,15 +43,15 @@ namespace paddle {
* @brief Macro for registering a data provider. The class type should contain
* a consturctor with parameter (DataConfig, bool).
*/
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
- static InitFunction __reg_type_##__type_name([]() {\
- DataProvider::registrar_.registerClass(\
- #__type_name, \
- [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
- DataProvider* dp = new __class_name (conf, useGpu);\
- return dp;\
- });\
-})
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \
+ static InitFunction __reg_type_##__type_name([]() { \
+ DataProvider::registrar_.registerClass( \
+ #__type_name, \
+ [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+ DataProvider* dp = new __class_name(conf, useGpu); \
+ return dp; \
+ }); \
+ })
/**
* @def REGISTER_DATA_PROVIDER_EX
@@ -61,8 +60,8 @@ namespace paddle {
*/
#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \
static InitFunction __reg_type_##__type_name([] { \
- DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-})
+ DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+ })
class DataBatch;
class BufferBatch;
@@ -181,7 +180,8 @@ public:
* @param[in] size DataBatch.getSize()
* @param[in] dataId sub dataprovider id (in MultiDataProvider)
*/
- void appendArguments(const std::vector& argus, int size,
+ void appendArguments(const std::vector& argus,
+ int size,
int dataId) {
size_ += size;
for (const auto& argu : argus) {
@@ -259,9 +259,7 @@ typedef Queue BufferBatchQueue;
class DoubleBuffer {
public:
- DoubleBuffer(DataProvider* dataPool,
- bool useGpu,
- int64_t batchSize = 0);
+ DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch);
@@ -310,7 +308,7 @@ public:
/**
* @brief create only used for unittest.
*/
- inline static DataProvider* create(const DataConfig &config,
+ inline static DataProvider* create(const DataConfig& config,
bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu);
}
@@ -462,7 +460,9 @@ protected:
*
* label[n] is the label for the n-th sample.
*/
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) = 0;
};
@@ -475,7 +475,9 @@ public:
protected:
void loadData(const std::string& fileName);
void loadDataFile(const std::string& fileName);
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size);
protected:
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 0689f90f3e7dd3d3e1df19f3958c821d53e69700..6c178e29ee714a6bd7f58861d7cf15716fee848d 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
@@ -65,8 +64,8 @@ void DataProviderGroup::reset() {
provider_ = nullptr;
// shuffle file list
- std::shuffle(fileList_.begin(), fileList_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(
+ fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
startLoader();
DataProvider::reset();
@@ -113,8 +112,9 @@ void DataProviderGroup::startLoader() {
size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
std::vector fileVec(fileList_.begin() + startPos,
fileList_.begin() + endPos);
- loader_->addJob([this, fileVec]()
- -> ProviderPtrType { return this->loadFile(fileVec); });
+ loader_->addJob([this, fileVec]() -> ProviderPtrType {
+ return this->loadFile(fileVec);
+ });
}
loader_->stopAddJob();
}
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 8e4f53978a0451f3bb6cd5da30f017708448f9ac..51fb1f26668c55dc1c2aecd5389f327e2569a52f 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "paddle/utils/Util.h"
#include "MultiDataProvider.h"
#include "paddle/utils/Logging.h"
@@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
"MultiDataProvider";
subConfig.set_async_load_data(false);
}
- subDataProviders_[i] =
- std::unique_ptr(DataProvider::create(subConfig,
- modelConfig,
- useGpu_));
+ subDataProviders_[i] = std::unique_ptr(
+ DataProvider::create(subConfig, modelConfig, useGpu_));
}
}
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index b498ba6516c4320566b1b3cc2bd557ae016d7c39..876467c04f074cf37e48fdfa9b24f236fcfe8ba1 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 344644755f24045443b8cb3ebd08004a4b1cdcb5..0a7ff802461f2ded0e6e842c088bddf218361f79 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ProtoDataProvider.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/StringUtil.h"
@@ -23,7 +22,8 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "DataProviderGroup.h"
-P_DEFINE_double(memory_threshold_on_load_data, 1.0,
+P_DEFINE_double(memory_threshold_on_load_data,
+ 1.0,
"stop loading data when memory is not sufficient");
namespace paddle {
@@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup);
REGISTER_DATA_PROVIDER(proto_sequence_group,
DataProviderGroup);
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu,
+ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
if (loadDataAll) {
@@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
}
slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
- memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids,
+ memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
+ ids,
sizeof(*ids) * slotSize);
slot.indices.push_back(slot.indices.back() + slotSize);
if (subSlotSize) {
@@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
slot.varDenseData[oldSize].data.resize(varDim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
- std::copy(values, values + varDim,
- slot.varDenseData[oldSize].data.data());
+ std::copy(
+ values, values + varDim, slot.varDenseData[oldSize].data.data());
#else
- memcpy(slot.varDenseData[oldSize].data.data(), values,
+ memcpy(slot.varDenseData[oldSize].data.data(),
+ values,
sizeof(real) * varDim);
#endif
slot.varDenseData[oldSize].dims.resize(
@@ -374,8 +377,9 @@ void ProtoDataProvider::reset() {
}
void ProtoDataProvider::shuffle() {
- std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(shuffledSequenceIds_.begin(),
+ shuffledSequenceIds_.end(),
+ ThreadLocalRandomEngine::get());
}
/*
@@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (!iidData()) {
ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
- numSequences + 1, /* useGpu= */ false);
+ numSequences + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
int pos = 0;
int i = 0;
@@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
switch (slotType) {
case SlotDef::VECTOR_DENSE: {
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR,
- false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast(mat)) {
- std::dynamic_pointer_cast(mat)->copyFrom(
- dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1);
+ std::dynamic_pointer_cast(mat)
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseFloatValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
break;
}
case SlotDef::INDEX: {
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
@@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (oldWidth < height) {
totalDim = width * height * depth;
}
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ totalDim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
}
} else {
- memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(),
+ memcpy(buf,
+ slots_[slot].varDenseData[dataPos[0]].data.data(),
sizeof(real) * totalDim);
}
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VAR_MDIM_INDEX: {
CHECK_EQ(size, 1);
size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
- IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ totalDim,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
- memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(),
+ memcpy(buf,
+ slots_[slot].varIndices[dataPos[0]].data(),
sizeof(int) * totalDim);
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
gpuArguments[i].sequenceStartPositions =
cpuArguments[i].sequenceStartPositions;
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
@@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
sampleLoop(op, size);
// current slot: sequenceStartPositions
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1,
+ /* useGpu= */ false);
switch (slotType) {
case SlotDef::VECTOR_SPARSE_VALUE:
@@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
};
int subSize = subSampleLoop(op, size, slot);
ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].subSequenceStartPositions, subSize + 1,
- false);
+ cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
int* currPosOfArgumentSubSeqStart =
- cpuArguments[slot].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[slot].subSequenceStartPositions->getMutableData(
+ false);
int64_t* subSeqs = dataSubPos.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int allSubSequenceLength = 0;
@@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::INDEX: {
// label slot
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
// fill labels
int* buf = cpuArguments[slot].ids->getData();
@@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VECTOR_DENSE: {
// copy values
size_t dim = header_.slot_defs(slot).dim();
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < cpuArguments.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 846dd7673abe8b836be1b728bb690daa0e8acc20..ffdcc8fdc977f53e29dc9f03fa3cf7af56acb92f 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -48,7 +47,8 @@ namespace paddle {
*/
class ProtoDataProvider : public DataProvider {
public:
- ProtoDataProvider(const DataConfig& config, bool useGpu,
+ ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -161,14 +161,16 @@ protected:
};
/**
- * @brief Special use for Proto data: instances should contain sparse-non-value slots
+ * @brief Special use for Proto data: instances should contain sparse-non-value
+ * slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
- ProtoSequenceDataProvider(const DataConfig& config, bool useGpu,
+ ProtoSequenceDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 3b1eb7e9ef03c42df31c6efc9f0e0240d64e78df..b8fca3cd7f3c5efaea35dc8e09f7ca0ec250830f 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -138,7 +137,8 @@ protected:
*
* @note this code depends on protobuf 2.4.0. There is nothing like
* CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
- * bytes has the object readed so far. Therefore, we calculated bytes ourselves.
+ * bytes has the object readed so far. Therefore, we calculated bytes
+ * ourselves.
*/
int approximateReadedBytes_;
};
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 1332c0ab635b6ebec05f25fd77b9703b39227bc1..bee6ca14a2ec3995a3b432fc5a39419a5dd8a8ce 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PyDataProvider.h"
#include "paddle/utils/PythonUtil.h"
#include
#include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"
-
namespace paddle {
#ifndef PADDLE_NO_PYTHON
REGISTER_DATA_PROVIDER(py, PyDataProvider);
#endif
-PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
+PyDataProvider::PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), batchSize_(0) {
PyGuard guard;
@@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector& fileList) {
classInstance_ =
createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
CHECK(classInstance_) << "Create class instance failed.";
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("getHeader"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("getHeader"), NULL));
CHECK_PY(obj) << "Call function getHeader failed.";
std::string headerInfo =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() {
}
}
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
unsigned int dim = slot.dim;
slot.sampleNum = readT(data, dataEnd);
@@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
float* dat = reinterpret_cast(data);
std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
#else
- memcpyWithCheck(slot.denseData.data(), data,
- sizeof(real) * dim * slot.sampleNum, dataEnd);
+ memcpyWithCheck(slot.denseData.data(),
+ data,
+ sizeof(real) * dim * slot.sampleNum,
+ dataEnd);
#endif
// PyDataProvider always provide data in float
data += sizeof(float) * dim * slot.sampleNum;
}
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
length = readT(data, dataEnd);
slot.indices.push_back(length);
slot.sparseNonValueData.resize(length);
- memcpyWithCheck(slot.sparseNonValueData.data(), data,
- sizeof(unsigned int) * length, dataEnd);
+ memcpyWithCheck(slot.sparseNonValueData.data(),
+ data,
+ sizeof(unsigned int) * length,
+ dataEnd);
data += sizeof(unsigned int) * length;
}
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
}
}
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
@@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
data += sizeof(unsigned int) * slot.sampleNum;
}
-void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillStringSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
for (unsigned int i = 0; i < slot.sampleNum; ++i) {
@@ -225,9 +233,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
}
for (size_t i = 0; i < sequenceNum; ++i) {
size_t begin = slot.sequenceStartPositions[i];
- size_t end = (i < sequenceNum - 1)
- ? slot.sequenceStartPositions[i + 1]
- : slot.sampleNum;
+ size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
+ : slot.sampleNum;
for (size_t ii = begin; ii < end; ++ii) {
slot.sampleSequenceIdVec.push_back(ii);
}
@@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
void PyDataProvider::reset() {
{ // Invoke PyDataProvider Reset
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("reset"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("reset"), NULL));
CHECK_PY(obj) << "Call function reset failed.";
}
@@ -270,15 +277,18 @@ void PyDataProvider::reset() {
void PyDataProvider::shuffle() {
// py shuffle
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("shuffle"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("shuffle"), NULL));
CHECK_PY(obj) << "Call function shuffle failed.";
}
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
unsigned int dim = slot.dim;
- Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim,
+ Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
+ slot.sampleNum,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slotIndex].value->getData();
@@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
- FLOAT_VALUE, SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseFloatValueData.data(),
+ HPPL_STREAM_DEFAULT);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
}
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
- IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum,
+ IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
+ slot.sampleNum,
/*useGpu_*/ false);
int* buf = cpuArguments[slotIndex].ids->getData();
for (size_t i = 0; i < slot.sampleNum; ++i) {
@@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
}
}
-void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
if (cpuArguments[slotIndex].strs) {
cpuArguments[slotIndex].strs->resize(slot.sampleNum);
@@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
PyGuard guard;
PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
const_cast("getNextBatch"),
- const_cast("i"), size));
+ const_cast("i"),
+ size));
CHECK_PY(obj) << "Call function getNextBatch failed.";
const std::string& samples =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
if (!iidData()) {
for (size_t j = 0; j < slotNum_; ++j) {
auto& slot = slots_[j];
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].sequenceStartPositions,
- slot.sequenceNum + 1, /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
+ slot.sequenceNum + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
std::copy(slot.sequenceStartPositions.begin(),
- slot.sequenceStartPositions.end(), buf);
+ slot.sequenceStartPositions.end(),
+ buf);
buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
if (slot.subSequenceStartPositions.size()) {
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].subSequenceStartPositions,
- slot.subSequenceNum + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
+ slot.subSequenceNum + 1,
+ /* useGpu= */ false);
int* buf =
- cpuArguments[j].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[j].subSequenceStartPositions->getMutableData(false);
std::copy(slot.subSequenceStartPositions.begin(),
- slot.subSequenceStartPositions.end(), buf);
+ slot.subSequenceStartPositions.end(),
+ buf);
buf[slot.subSequenceNum] = slot.sampleNum;
// check subSequenceStartPositions and sequenceStartPositions
cpuArguments[j].checkSubset();
@@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
cpuArguments[i].subSequenceStartPositions;
}
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index 939d9cf725c2fe6e4989c17e1e768c9f8aedfc95..6bb7c831fdd451abc5241199d6a4d1b1ad814517 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -25,7 +24,8 @@ namespace paddle {
class PyDataProvider : public DataProvider {
public:
- PyDataProvider(const DataConfig& config, bool useGpu,
+ PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -48,21 +48,27 @@ protected:
void parseHeaderData(const std::string& headerData);
void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
- void fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+ void fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd);
void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillSlotsByStr(const std::string& samples);
- void handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseNonValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
void resetSlots();
void loadData(const std::vector& fileList);
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 90391a7c307d8dff7e289d445cafd27dc5008547..967fc9026a39967477d606862e060b680512901a 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -34,7 +34,7 @@ namespace paddle {
namespace unittest {
static std::unique_ptr>
- OnPoolFilled;
+ OnPoolFilled;
namespace pydp2 {
@@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function& callback) {
*OnPoolFilled = callback;
}
-void clearOnPoolFilledHook() {
- OnPoolFilled.reset();
-}
+void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
} // namespace pydp2
} // namespace unittest
-
-
/**
* Slot type
*/
@@ -65,17 +61,13 @@ enum SlotType {
/**
* Sequence type
*/
-enum SeqType {
- SQT_NONE = 0,
- SQT_SEQ,
- SQT_SUBSEQ
-};
+enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
/**
* Cache Type.
*/
enum CacheType {
- NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
+ NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2,
// then cache all data in memory. Load data from
// memory in rest passes.
@@ -87,8 +79,8 @@ struct SlotHeader { // Slot Header will parse from python object's slots field.
SeqType seqType;
};
-inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) {
- os <<"Dim = " << header.dim << " Type = " << header.slotType
+inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
+ os << "Dim = " << header.dim << " Type = " << header.slotType
<< " SeqType = " << header.seqType;
return os;
}
@@ -158,7 +150,6 @@ protected:
SlotHeader* headerPtr_;
};
-
/**
* Py Data Provider Cache Interface.
*/
@@ -209,17 +200,13 @@ public:
PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu)
- :DataProvider(config, useGpu),
- callingContextCreated_(2) {
- if (PyArray_API == NULL)
- import_array();
+ : DataProvider(config, useGpu), callingContextCreated_(2) {
+ if (PyArray_API == NULL) import_array();
auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) {
kwargs = callPythonFuncRetPyObj(
- "paddle.trainer.PyDataProvider2",
- "deserialize_args",
- {args});
+ "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
}
py::DictHelper kwargsDict(kwargs);
@@ -245,40 +232,38 @@ public:
* Dtor
* @note will stop loading thread when destructing
*/
- virtual ~PyDataProvider2() {
- resetImpl(false);
- }
+ virtual ~PyDataProvider2() { resetImpl(false); }
private:
void createPyDataObj(const std::string& model,
const std::string& className,
const std::string& fileListName,
- PyObjectPtr && kwargs) {
- LOG(INFO) << "loading dataprovider " << model <<"::" << className;
+ PyObjectPtr&& kwargs // NOLINT
+ ) {
+ LOG(INFO) << "loading dataprovider " << model << "::" << className;
PyObjectPtr module = py::import(model);
PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
- PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
- className.c_str()));
+ PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
CHECK_PY(cls) << "load class " << className.c_str() << "error";
// If there are multiple python instance share same module, the PyObjectPtr
// only for instance will make python reference-count error.
//
// So here, we increase reference count manually.
- if (gModuleClsPtrs_.find((uintptr_t) module.get())
- != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
+ gModuleClsPtrs_.end()) {
// Multi instance use same module
Py_XINCREF(module.get());
Py_XINCREF(moduleDict.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) module.get());
+ gModuleClsPtrs_.insert((uintptr_t)module.get());
}
- if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
Py_XINCREF(cls.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) cls.get());
+ gModuleClsPtrs_.insert((uintptr_t)cls.get());
}
PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
@@ -294,8 +279,8 @@ private:
py::ObjectHelper self(this->instance_);
bool ok;
- this->skipShuffle_ = !self.getBoolAttr("should_shuffle",
- &ok /*isBoolType*/);
+ this->skipShuffle_ =
+ !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
if (!ok) {
this->skipShuffle_ = testing; // shuffle when is training, skip shuffle
// when is testing.
@@ -335,12 +320,12 @@ private:
PyObjectPtr headerPtrWrap(hdPtr);
py::ObjectHelper hd(headerPtrWrap);
header.dim = hd.getIntAttrWithError("dim");
- header.seqType = (SeqType) hd.getIntAttrWithError("seq_type");
- header.slotType = (SlotType) hd.getIntAttrWithError("type");
+ header.seqType = (SeqType)hd.getIntAttrWithError("seq_type");
+ header.slotType = (SlotType)hd.getIntAttrWithError("type");
}
DBG << "Data header size " << headers_.size();
- for (auto & header : headers_) {
+ for (auto& header : headers_) {
DBG << header;
}
cache_.reset(IPyDataProviderCache::create(
@@ -351,8 +336,7 @@ private:
loadFileList(fileListName, fileLists_);
PyObject* lst = PyList_New(fileLists_.size());
for (size_t i = 0; i < fileLists_.size(); ++i) {
- PyList_SET_ITEM(lst, i,
- PyString_FromString(fileLists_[i].c_str()));
+ PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
}
return PyObjectPtr(lst);
}
@@ -414,11 +398,12 @@ private:
CHECK(ok) << "CalcBatchSize must return int or long";
}
- if (this->loadThread_){ // wait poolActualSize < poolSize;
+ if (this->loadThread_) { // wait poolActualSize < poolSize;
std::unique_lock l(mtx_);
- pushCV_.wait(l, [this, additionalBatchSize] {
- return this->poolActualSize_ < poolSize_;
- });
+ pushCV_.wait(l,
+ [this, additionalBatchSize] {
+ return this->poolActualSize_ < poolSize_;
+ });
}
{
@@ -487,14 +472,14 @@ private:
std::vector fileLists_;
std::vector headers_;
static PyObjectPtr zeroTuple_;
- static std::unordered_set gModuleClsPtrs_;
+ static std::unordered_set gModuleClsPtrs_;
class PositionRandom {
public:
- inline explicit PositionRandom(bool skipRand):
- eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
+ inline explicit PositionRandom(bool skipRand)
+ : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
- inline size_t operator() (size_t len) {
+ inline size_t operator()(size_t len) {
if (!skipRand_) {
if (!dist_ || dist_->b() != len - 1) {
dist_.reset(new std::uniform_int_distribution(0, len - 1));
@@ -525,32 +510,31 @@ public:
* Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
* select data from datapool.
*/
- void shuffle() {
- }
+ void shuffle() {}
/**
* Not limited size.
*/
- int64_t getSize() {
- return -1;
- }
+ int64_t getSize() { return -1; }
/**
* Loading a batch of data.
*/
- int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+ int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
std::lock_guard guard(mutexForReset_);
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
- size_t size = (size_t) size_;
+ size_t size = (size_t)size_;
if (loadThread_) { // loading from thread should wait for data pool ready.
// but, loading from cache, cache object should ensure
// data pool ready.
std::unique_lock l(mtx_);
- pullCV_.wait(l, [this, &size] {
- return this->poolActualSize_ >= std::max(size, this->minPoolSize_)
- || callingContexts_.empty();
- });
+ pullCV_.wait(l,
+ [this, &size] {
+ return this->poolActualSize_ >=
+ std::max(size, this->minPoolSize_) ||
+ callingContexts_.empty();
+ });
if (unittest::OnPoolFilled) {
(*unittest::OnPoolFilled)(this->poolActualSize_);
@@ -633,35 +617,35 @@ public:
cpuBatch.setSize(bsize);
auto& inArgs = cpuBatch.getStreams();
inArgs.resize(headers_.size());
- std::vector > scanners;
+ std::vector> scanners;
scanners.reserve(headers_.size());
for (auto& header : headers_) {
scanners.emplace_back(IFieldScanner::create(&header));
}
DBG << "Scanner created.";
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startPrepare(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->prepare(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishPrepare(inArgs[i]);
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startFill(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->fill(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishFill(inArgs[i]);
}
@@ -679,8 +663,8 @@ public:
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < headers_.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
} else {
@@ -690,31 +674,28 @@ public:
}
};
-std::unordered_set PyDataProvider2::gModuleClsPtrs_;
+std::unordered_set PyDataProvider2::gModuleClsPtrs_;
PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
/**
* Scanner for dense slot.
*/
-class DenseScanner: public IFieldScanner {
+class DenseScanner : public IFieldScanner {
public:
- explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {}
+ explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
/**
* Prepare.
* @param argument target argument
* @param obj each timestep of a sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++height_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim,
- false, false);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreate(
+ argument.value, height_, headerPtr_->dim, false, false);
height_ = 0;
}
@@ -723,24 +704,23 @@ public:
* @param argument
* @param obj
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim;
if (PyArray_Check(obj)) {
- auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
- if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
- real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
- auto sz = PyArray_SIZE((PyArrayObject*)obj);
- std::copy(data, data + sz, dat);
- } else {
- LOG(FATAL) << "You should yield float" << sizeof(real) * 8
- << " array";
- }
- } else {
- py::SequenceHelper s(obj);
- // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
- for (size_t i=0; i < headerPtr_->dim; ++i) {
- dat[i] = (real) s.getDouble(i);
- }
+ auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+ if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+ real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
+ auto sz = PyArray_SIZE((PyArrayObject*)obj);
+ std::copy(data, data + sz, dat);
+ } else {
+ LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
+ }
+ } else {
+ py::SequenceHelper s(obj);
+ // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+ for (size_t i = 0; i < headerPtr_->dim; ++i) {
+ dat[i] = (real)s.getDouble(i);
+ }
}
++height_;
}
@@ -752,20 +732,18 @@ private:
/**
* Scanner for index slot
*/
-class IndexScanner: public IFieldScanner {
+class IndexScanner : public IFieldScanner {
public:
- explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {}
+ explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
/**
* Prepare memory space.
*
* @note obj is a single timestep of sample
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++cnt_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
IVector::resizeOrCreate(argument.ids, cnt_, false);
cnt_ = 0;
}
@@ -773,9 +751,9 @@ public:
/**
* Fill one index to argument.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
bool ok;
- argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
+ argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
CHECK(ok) << "Cannot cast int " << py::repr(obj);
}
@@ -785,27 +763,25 @@ private:
class SparseNonValueScanner : public IFieldScanner {
public:
- explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr),
- nnz_(0),
- height_(0) {}
+ explicit SparseNonValueScanner(SlotHeader* ptr)
+ : IFieldScanner(ptr), nnz_(0), height_(0) {}
/**
* Prepare memory space
* @note obj is a timestep of one sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
++height_;
nnz_ += py::SequenceHelper(obj).size();
}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, NO_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
}
- virtual void startFill(Argument & argument) {
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ virtual void startFill(Argument& argument) {
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
smat->getRows()[0] = 0;
nnz_ = 0;
height_ = 1;
@@ -818,14 +794,14 @@ public:
virtual void fill(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
auto sz = s.size();
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
int* row = smat->getRows();
int* col = smat->getCols();
real* dat = smat->getData();
- row[height_] = row[height_-1] + (int)sz;
+ row[height_] = row[height_ - 1] + (int)sz;
for (decltype(sz) i = 0; i < sz; ++i) {
- setData(col+nnz_, dat+nnz_, s[i]);
+ setData(col + nnz_, dat + nnz_, s[i]);
++nnz_;
}
++height_;
@@ -839,7 +815,7 @@ protected:
* @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
* For sparse_value is a Tuple (int, float).
*/
- virtual void setData(int* col, real * dat, PyObject* obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
bool ok;
*col = py::castInt(obj, &ok);
CHECK(ok);
@@ -851,26 +827,25 @@ protected:
class SparseValueScanner : public SparseNonValueScanner {
public:
- explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {}
+ explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, FLOAT_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
}
protected:
- virtual void setData(int *col, real *dat, PyObject *obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
py::SequenceHelper s(obj);
SparseNonValueScanner::setData(col, dat, s[0]);
- *dat = (real) s.getDouble(1);
+ *dat = (real)s.getDouble(1);
}
};
/**
* Sequence Scanner. Scanner for sequence or sub-sequence.
*/
-class SequenceScanner: public IFieldScanner {
+class SequenceScanner : public IFieldScanner {
public:
/**
* Ctor
@@ -879,15 +854,18 @@ public:
* return a sequence start position or a sub-sequence
* start position.
*/
- SequenceScanner(std::unique_ptr&& innerScanner,
- const std::function& getSeqStartPos)
- : IFieldScanner(nullptr), inner_(std::move(innerScanner)),
- cnt_(0), getSeqStartPos_(getSeqStartPos) {}
+ SequenceScanner(
+ std::unique_ptr&& innerScanner,
+ const std::function& getSeqStartPos)
+ : IFieldScanner(nullptr),
+ inner_(std::move(innerScanner)),
+ cnt_(0),
+ getSeqStartPos_(getSeqStartPos) {}
/**
* Start prepare. Invoke inner->startPrepare too.
*/
- virtual void startPrepare(Argument &argument) {
+ virtual void startPrepare(Argument& argument) {
inner_->startPrepare(argument);
}
@@ -895,10 +873,10 @@ public:
* Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
* element of sequence obj.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->prepare(argument, s[i]);
}
}
@@ -906,7 +884,7 @@ public:
/**
* Finish prepare. invoke inner_->finishPrepare too.
*/
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
inner_->finishPrepare(argument);
}
@@ -914,7 +892,7 @@ public:
/**
* Start fill. invoke inner->startFill too.
*/
- virtual void startFill(Argument &argument) {
+ virtual void startFill(Argument& argument) {
getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
cnt_ = 1;
inner_->startFill(argument);
@@ -925,13 +903,13 @@ public:
* sequence obj. And set seqStartPos at same time. The seqStartPos will be
* calculated by getSeqStartPos callback passed in ctor.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
- getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
- (int)getSize(obj);
+ getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
+ (int)getSize(obj);
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->fill(argument, s[i]);
}
}
@@ -939,9 +917,7 @@ public:
/**
* Finish fill. will invoke inner->finishFill too.
*/
- virtual void finishFill(Argument &argument) {
- inner_->finishFill(argument);
- }
+ virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
protected:
size_t getSize(PyObject* obj) {
@@ -949,7 +925,7 @@ protected:
auto sc = dynamic_cast(inner_.get());
if (sc) {
size_t sum = 0;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
sum += sc->getSize(s[i]);
}
return sum;
@@ -964,8 +940,7 @@ private:
std::function getSeqStartPos_;
};
-
-IFieldScanner* IFieldScanner::create(SlotHeader *header) {
+IFieldScanner* IFieldScanner::create(SlotHeader* header) {
IFieldScanner* retv = nullptr;
switch (header->slotType) {
case ST_DENSE:
@@ -989,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
break;
case SQT_SUBSEQ:
retv = new SequenceScanner(std::unique_ptr(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.subSequenceStartPositions;
- });
- // fall through, not break;
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.subSequenceStartPositions;
+ });
+ // fall through, not break;
case SQT_SEQ:
retv = new SequenceScanner(std::unique_ptr(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.sequenceStartPositions;
- });
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.sequenceStartPositions;
+ });
break;
default:
LOG(FATAL) << "Not implemented";
@@ -1010,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
* No Cache Strategy. Will destruct old data immediately and load data from
* python every pass.
*/
-class NoCacheStrategy: public IPyDataProviderCache {
+class NoCacheStrategy : public IPyDataProviderCache {
public:
- virtual bool reset() {
- return true;
- }
+ virtual bool reset() { return true; }
- virtual void drop(std::deque *data) {
- data->clear();
- }
+ virtual void drop(std::deque* data) { data->clear(); }
- virtual std::deque* load() {
- return nullptr;
- }
+ virtual std::deque* load() { return nullptr; }
};
/**
@@ -1033,9 +1002,9 @@ public:
*/
class CacheOnePassInMemory : public IPyDataProviderCache {
public:
- CacheOnePassInMemory() : objPool_(new std::deque()),
- droppedPool_(new std::deque())
- {}
+ CacheOnePassInMemory()
+ : objPool_(new std::deque()),
+ droppedPool_(new std::deque()) {}
virtual bool reset() {
if (objPool_->empty() && droppedPool_->empty()) {
@@ -1048,25 +1017,22 @@ public:
}
}
- virtual void drop(std::deque *data) {
+ virtual void drop(std::deque* data) {
size_t orgSize = droppedPool_->size();
droppedPool_->resize(orgSize + data->size());
- for (size_t i=0; i < data->size(); ++i) {
+ for (size_t i = 0; i < data->size(); ++i) {
std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
}
data->clear();
}
- virtual std::deque* load() {
- return objPool_.get();
- }
+ virtual std::deque* load() { return objPool_.get(); }
private:
- std::unique_ptr