diff --git a/.travis.yml b/.travis.yml
index eecf5e81f0c952cb4cf7bd215496350d14ed7f85..0705baa1aca8b480b2a774076bd91fb9df401a53 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,9 @@
language: cpp
-cache: ccache
+cache:
+ directories:
+ - $HOME/third_party
+ - $HOME/.ccache
+ - $HOME/.cache/pip
sudo: required
dist: trusty
os:
@@ -35,6 +39,7 @@ addons:
- clang-format-3.8
- automake
- libtool
+ - ccache
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59182d299be1ccc5f57e22f325b7f684fdf97866..15e310a6ae1155796687f18f7797ae48c8a5ecbf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,16 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
+
+# CMAKE_BUILD_TYPE
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+ "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+ FORCE)
+endif()
+
+set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+ "A path setting third party libraries download & build directories.")
########################################################################################
include(external/zlib) # download, build, install zlib
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index d38b7d1ba2a74d5bb46d0c07e3abe6832d4c8af3..2a49d76eb30f592a28746f5897b14b7dd319d784 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -14,8 +14,8 @@
INCLUDE(ExternalProject)
-SET(GFLAGS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
-SET(GFLAGS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gflags)
+SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
+SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
IF(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index bec69f3ddf093b62f084f9080fa1fe4398c93e9a..71e20c85276b014c2e33735c3199c3772526c6c7 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -14,8 +14,8 @@
INCLUDE(ExternalProject)
-SET(GLOG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/glog)
-SET(GLOG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/glog)
+SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog)
+SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
IF(WIN32)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 2fcb7893fa30e7fcd84b9e860217f82cf01bf89e..11d829a9e2f239848803130505c9862695b25029 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -16,8 +16,8 @@ IF(WITH_TESTING)
ENABLE_TESTING()
INCLUDE(ExternalProject)
- SET(GTEST_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gtest)
- SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gtest)
+ SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
+ SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 66a72cd243e09ccf32b61d419f6d0ad9ec3fe9c8..0e8c29c831c823f701d8eecd954d3b120085e495 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -18,8 +18,8 @@ IF(NOT ${CBLAS_FOUND})
MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
INCLUDE(ExternalProject)
- SET(CBLAS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
- SET(CBLAS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/openblas)
+ SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
+ SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
IF(WIN32)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2f2769b4c628d8570c335d344cbf608bda84206f..c0cf2719f9a7b3ae6be5cefffa3dbd2c3f712e82 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,8 +14,8 @@
INCLUDE(ExternalProject)
-SET(PROTOBUF_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
-SET(PROTOBUF_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/protobuf)
+SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
+SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 2f86ab3901d4cfc40a294309662c986b818e64f7..b865c74a44af75f2db6f78c260fde5e27db2fa58 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -26,10 +26,14 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
find_python_module(wheel REQUIRED)
find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
+ IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+ MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
+ "please use pip to upgrade protobuf.")
+ ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
##################################### PYTHON ########################################
- SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python)
- SET(PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/python)
+ SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
+ SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
SET(_python_DIR ${PYTHON_INSTALL_DIR})
IF(UNIX)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index 40088c65ef7166ddef52956a1a7470ccab8087c9..63e8bd25462e50e2f78908899938468c989b3ac3 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -18,8 +18,8 @@ IF(NOT SWIG_FOUND)
# build swig as an external project
INCLUDE(ExternalProject)
- SET(SWIG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/swig)
- SET(SWIG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/swig)
+ SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
+ SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
SET(SWIG_TARGET_VERSION "3.0.2")
SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7386d935b8931670d4fd7aa305f74b21471a5562..f5e4b3e1eb39acbe8dbcd0023956ca7e52c1ecd8 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -14,8 +14,8 @@
INCLUDE(ExternalProject)
-SET(WARPCTC_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/warpctc)
-SET(WARPCTC_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/warpctc)
+SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
+SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 916f6816aae9938aad95ac527cf07ffbe38f7479..47fa8817fb64fb8fd718e2892ad5bae7bbe956eb 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -14,8 +14,8 @@
INCLUDE(ExternalProject)
-SET(ZLIB_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)
-SET(ZLIB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/zlib)
+SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib)
+SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0983d83b73a32d0615170155759d45001cc6ff54..0d1ef5cd8449bd31b4cfa4619f27bce7c1f55ebb 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -3,12 +3,6 @@ include(CheckCXXCompilerFlag)
include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
-if(NOT CMAKE_BUILD_TYPE)
- set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
- "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
- FORCE)
-endif()
-
function(CheckCompilerCXX11Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake
index 2eb3441428e8290b665e092f6e4b40e146ea5a52..1412b7f7f20600acf95a4a899f5e6529c3b67a35 100644
--- a/cmake/python_module.cmake
+++ b/cmake/python_module.cmake
@@ -26,5 +26,18 @@ function(find_python_module module)
if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
message(FATAL_ERROR "python module ${module} is not found")
endif()
+
+ execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+ "import sys, ${module}; sys.stdout.write(${module}.__version__)"
+ OUTPUT_VARIABLE _${module}_version
+ RESULT_VARIABLE _${module}_status
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(NOT _${module}_status)
+ set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
+ "Version of Python module ${module}")
+ endif(NOT _${module}_status)
+
set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
+ set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
endfunction(find_python_module)
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index 6fc11caf1c75192242482c2e85f8167eb9fba4ec..e45bd47ad5925c6674d628a70a7ad7c4d5d5c173 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -27,5 +27,6 @@ paddle train \
--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log
+paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
index b7bbb90ddd287e3e312a490b53924ae76fb20d2c..2ce6446d7c943ffc9bea8da43d153539f6f9f15f 100755
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -19,3 +19,4 @@ paddle train \
--save_dir=./output \
--num_passes=30 \
2>&1 |tee 'train.log'
+paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh
index da90cd749a02976633d0f0d6e4352d8a85c7cdef..ca2b1ad9eb960685b95b0f294a9b929e1a4acab1 100755
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -27,5 +27,6 @@ paddle train \
--num_passes=100 \
--save_dir=$output \
2>&1 | tee $log
+paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh
index aac9b89b14b98ac8e2db7def19e5f06c01682493..a7b1f01064b29cf6abc4cd6b706ee466a6d6da36 100755
--- a/demo/quick_start/cluster/cluster_train.sh
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log"
pushd "$home_dir"
cfg=trainer_config.lr.py
paddle train \
+ --start_pserver=false \
--config=$cfg \
--save_dir=${model_dir} \
--trainer_count=4 \
diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh
index f02e5038e92790c7f1ddcd84a09c6d9a02f84ac4..e47c2dd01fb5c919203964e298018e6dc2bd366e 100755
--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -26,5 +26,7 @@ paddle train \
--init_model_path=$model \
--config_args=is_predict=1 \
--predict_output_dir=. \
+2>&1 | tee 'predict.log'
+paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
mv rank-00000 result.txt
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index e3595fce7519297058e1eeb66487692267ddcfcc..01697fed48054be8ad98a01d4cbb5029e6a1ead0 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -31,3 +31,4 @@ paddle train \
--show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
+paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh
index e341d1cc7a3267bef9db916719b2e4b1981e31bc..22aef556082ba429e9ca7c6dd3ec72699b9dbcf4 100755
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -22,3 +22,4 @@ paddle train \
--log_period=100 \
--dot_period=1 \
--num_passes=50 2>&1 | tee 'log.txt'
+paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 11d9d6a19c1b17ad1b7540ee7a03017f85dd821e..095bbff2ea42627a13d8ebab436f5a05abc09743 100755
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -38,3 +38,4 @@ paddle train \
--config_args=is_test=1 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index 9354e72f46dc4dfc46138a04c330933d404c6cb8..eee14010d7b04a1b824f39090fa82fc532085e0d 100755
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -27,3 +27,4 @@ paddle train \
--load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 8af827c3388c8df88a872bd87d121a4f9631c3ff..85c4f3ccfc3ede23fcf701769b9701ecbf57c789 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
--trainer_count=4 \
--config_args=is_test=1 \
2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
index 5ce8bf4b997d962b9b61593cec0954d76c4874bc..14620f733bf03444e5ba3b3b792dfbed6146ecde 100755
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -27,3 +27,4 @@ paddle train --config=$config \
--show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh
index 33a42f6eff2b0414c466d5f78c89989a6a517eb9..9bb6dbdb1d4c5e35bfb31855e0331f0250a69a20 100755
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -27,3 +27,4 @@ paddle train \
--log_period=10 \
--dot_period=5 \
2>&1 | tee 'paraphrase/train.log'
+paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh
index a700ae213473dfe7c5b77156de15775b8fe9a9f0..64b78f5e9654e7b206740f92e224e0164108c9f1 100755
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -24,3 +24,4 @@ paddle train \
--test_pass=12 \
--trainer_count=1 \
2>&1 | tee 'translation/gen.log'
+paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh
index bdece693e5c407c89bc172c461bac7f9b20560d3..b0ec9854b118cbb9ed39d6bed0cdd845403926a4 100755
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -25,3 +25,4 @@ paddle train \
--log_period=10 \
--dot_period=5 \
2>&1 | tee 'translation/train.log'
+paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh
index 9a706b98d8686101ba21b513644bdd791062ec26..37e196c84200dc26ccb523076a81dbc393b1280f 100755
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
@@ -7,4 +7,6 @@ paddle train \
--dot_period=10 \
--log_period=1000 \
--test_period=0 \
- --num_passes=10
+ --num_passes=10 \
+2>&1 | tee 'train.log'
+paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh
index 597b5afea9c63a8e209b69b6a40e74556e27ac31..ad6e2d8ee7f813c69f9dd250c6f7bbb4403a0ed5 100755
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
@@ -7,3 +7,5 @@ paddle train \
--log_period=10000 \
--test_period=0 \
--num_passes=10
+2>&1 | tee 'train_linear.log'
+paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index 7d425a05d46131d84ba895d0fefc3a592a9a36e1..6d5367177da2af6276698f94f86664a5b506dca2 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
.. code-block:: bash
paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-
-12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
----------------------------------------------------
-
-目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
-:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
-
-.. code-block:: bash
-
- git clone --recursive https://github.com/PaddlePaddle/Paddle.git
-
-来获取所有源码。对于已经clone的git版本库,可以在Paddle的源码目录中执行\:
-
-.. code-block:: bash
-
- git submodule init
- git submodule update
-
-来获得所有第三方模块。
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index aaa07d49d3148266db27670a98c2b27db4dc0a8f..6954be3b2bb956755c7820bf285addfd15226874 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -11,32 +11,21 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
```bash
git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle
-git submodule update --init --recursive
```
-
-If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
-
-If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
-```
-git submodule update --remote
-```
-
## Requirements
To compile the source code, your computer must be equipped with the following dependencies.
- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
-- **CMake**: version >= 2.8
+- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS
-- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
-- **Python**: only python 2.7 is supported currently
**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
For CUDA 8.0, GCC versions later than 5.3 are not supported!
### Options
-PaddlePaddle supports some build options. To enable it, first you need to install the related libraries.
+PaddlePaddle supports some build options.
@@ -47,12 +36,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
-WITH_GPU | Compile with GPU mode. |
-WITH_DOUBLE | Compile with double precision floating-point, default: single precision. |
-WITH_TESTING | Compile with gtest for PaddlePaddle's unit testing. |
-WITH_DOC | Compile to generate PaddlePaddle's docs, default: disabled (OFF). |
-WITH_SWIG_PY | Compile with python predict API, default: disabled (OFF). |
-WITH_STYLE_CHECK | Compile with code style check, default: enabled (ON). |
+WITH_GPU | Compile PaddlePaddle with NVIDIA GPU |
+WITH_AVX | Compile PaddlePaddle with AVX intrinsics |
+WITH_DSO | Compile PaddlePaddle with dynamic linked CUDA |
+WITH_TESTING | Compile PaddlePaddle with unit testing |
+WITH_SWIG_PY | Compile PaddlePaddle with inference api |
+WITH_STYLE_CHECK | Compile PaddlePaddle with style check |
+WITH_PYTHON | Compile PaddlePaddle with python interpreter |
+WITH_DOUBLE | Compile PaddlePaddle with double precision |
+WITH_RDMA | Compile PaddlePaddle with RDMA support |
+WITH_TIMER | Compile PaddlePaddle with stats timer |
+WITH_PROFILER | Compile PaddlePaddle with GPU profiler |
+WITH_DOC | Compile PaddlePaddle with documentation |
+ON_COVERALLS | Compile PaddlePaddle with code coverage |
+COVERALLS_UPLOAD | Package code coverage data to coveralls |
+ON_TRAVIS | Exclude special unit test on Travis CI |
@@ -64,18 +62,15 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
As a simple example, consider the following:
-1. **Python Dependencies(optional)**
+1. **BLAS Dependencies(optional)**
- To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows:
+ Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
```bash
- # install swig on ubuntu
- sudo apt-get install swig
- # install swig on Mac OS X
- brew install swig
-
- # active swig in cmake
- cmake .. -DWITH_SWIG_PY=ON
+ # specify MKL
+ cmake .. -DMKL_ROOT=
+ # or specify OpenBLAS
+ cmake .. -DOPENBLAS_ROOT=
```
2. **Doc Dependencies(optional)**
@@ -104,17 +99,9 @@ As a simple example, consider the following:
```bash
# necessary
sudo apt-get update
- sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
- # optional
- sudo apt-get install libgoogle-glog-dev
- sudo apt-get install libgflags-dev
- sudo apt-get install libgtest-dev
- sudo pip install wheel
- pushd /usr/src/gtest
- cmake .
- make
- sudo cp *.a /usr/lib
- popd
+ sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+ sudo pip install wheel numpy
+ sudo pip install 'protobuf>=3.0.0'
```
- **GPU Dependencies (optional)**
@@ -149,51 +136,17 @@ As usual, the best option is to create build folder under paddle project directo
```bash
mkdir build && cd build
-cmake ..
-```
-
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
-
-As a simple example, consider the following:
+```
-- **Only CPU with swig**
-
- ```bash
- cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
- ```
-- **GPU with swig**
-
- ```bash
- cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
- ```
-
-- **GPU with doc and swig**
-
- ```bash
- cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
- ```
-
-Finally, you can build PaddlePaddle:
+Finally, you can build and install PaddlePaddle:
```bash
# you can add build option here, such as:
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= -DWITH_SWIG_PY=ON
+cmake .. -DCMAKE_INSTALL_PREFIX=
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=/bin:$PATH
-```
-
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-
-```bash
-# you can run
+# install PaddlePaddle Python modules.
sudo pip install /opt/paddle/share/wheels/*.whl
-# or just run
-sudo paddle version
```
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
index 3a52c8723bbccd70dd89e8913092d92813925f90..be0c1ffa451b2901ec06621dd4d886f800b4562e 100644
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -40,4 +40,4 @@ PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。
cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。
\ No newline at end of file
+注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 34279a29b2e4c84aa5039f2e5ab2c6ed9a06da2f..51a1a11674d98781d04137ff14bf8debe3277318 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,23 +16,13 @@ Developers can work on PaddlePaddle using Docker. This allows
developers to work on different platforms -- Linux, Mac OS X, and
Windows -- in a consistent way.
-The general development workflow with Docker and Bazel is as follows:
+The general development workflow with Docker and CMake is as follows:
1. Get the source code of Paddle:
.. code-block:: bash
- git clone --recursive https://github.com/PaddlePaddle/Paddle.git
-
-
- Here **git clone --recursive is required** as we have a submodule `warp-ctc `_.
-
- If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
- empty, please use the following command to get the submodule.
-
- .. code-block:: bash
-
- git submodule update --init --recursive
+ git clone https://github.com/PaddlePaddle/Paddle.git
2. Build a development Docker image :code:`paddle:dev` from the source
@@ -162,7 +152,6 @@ source code:
cd ~
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
- git submodule update --init --recursive
docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index e0a63f5a14c7b2e8953aa21739668ee2a9ebeff1..ee1b3213eaed3bfd94e449997dff9848b8fb4228 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -33,7 +33,6 @@ cd Paddle
git checkout -b develop # 创建 develop 分支
git remote add upstream https://github.com/PaddlePaddle/Paddle.git # 添加 upstream 到 baidu/Paddle
git pull upstream develop # 更新 upstream
-git submodule update --init --recursive
```
然后你可以通过做一个本地开发分支开始开发
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
index e578f6fce8b94180da7d5de041a0e17b1d59f6ea..9b0d3e83c0dc264650eda73e6801c60a75439b4a 100644
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -38,7 +38,6 @@ cd Paddle
git checkout -b develop # create develop branch.
git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle
git pull upstream develop # update to upstream
-git submodule update --init --recursive
```
Then you can start to develop by making a local developement branch
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
index b63b8437a0114a0165971933912da83c2dd770a6..2063b98ca8aab9c348fe2b53bb1e6d96b7750dd3 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -159,6 +159,8 @@ docker build -t your_repo/paddle:mypaddle .
docker push your_repo/paddle:mypaddle
```
+注意上述命令中`your_repo`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`your_repo/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
+
### 上传训练文件
本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容,我们将训练文件与数据放在一个job name命名的目录中,上传到MFS共享存储。完成后MFS上的文件内容大致如下:
@@ -244,6 +246,8 @@ spec:
`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量,即`--num_gradient_servers`参数
+这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+
编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。
```bash
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 0cafbd896e2d88aee4406bd0305878ce489bc18d..41beed38a87601cb57072c8966cd0fd2ea156524 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -137,6 +137,10 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
a.cpuSequenceDims = m->cast(vec->getSharedPtr());
}
+float Arguments::sumCosts() const {
+ return paddle::Argument::sumCosts(m->outputs);
+}
+
int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
return a.getBatchSize();
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 364d19f9414430709108824dce75a1007332d824..f5af8b0035b44d97832dd90ca2eeba079503715c 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -450,6 +450,8 @@ public:
IVector* vec) throw(RangeError);
void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
+ float sumCosts() const;
+
private:
static Arguments* createByPaddleArgumentVector(void* ptr);
void* getInternalArgumentsPtr() const;
@@ -546,6 +548,10 @@ public:
ParameterConfig* getConfig();
void setValueUpdated();
+ bool save(const std::string& filename) const;
+
+ bool load(const std::string& filename) const;
+
size_t getSize() const;
private:
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index ddc00d8d1af4c58d7e2233423bea916408bee92b..19f7a898d6b8d3d02c5654559dcb86728266731e 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -57,4 +57,12 @@ size_t Parameter::getID() const { return m->getPtr()->getID(); }
void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
+bool Parameter::save(const std::string& filename) const {
+ return m->getPtr()->save(filename);
+}
+
+bool Parameter::load(const std::string& filename) const {
+ return m->getPtr()->load(filename);
+}
+
size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
new file mode 100644
index 0000000000000000000000000000000000000000..82f45ba6ccec49eb190d1814a67a575f311689e8
--- /dev/null
+++ b/paddle/api/paddle_api_config.py.in
@@ -0,0 +1,17 @@
+PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
+WITH_GPU="@WITH_GPU@"
+PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
+ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
+CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
+CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
+
+
+WITH_PYTHON="@WITH_PYTHON@"
+PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
+GLOG_LIBRARIES="@GLOG_LIBRARIES@"
+GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
+GFLAGS_LOCATION="@GFLAGS_LOCATION@"
+CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
+
+CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
+WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/test/.gitignore b/paddle/api/test/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b7948824a1eab119140dd9bea20276c303fe4af1
--- /dev/null
+++ b/paddle/api/test/.gitignore
@@ -0,0 +1,2 @@
+*.w0
+*.wbias
diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py
index 8cabecd242fb4eb98c0fe468687ef179245e4535..a04a805d7a64ef906c8388f1241b9ef823e4d9e0 100644
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -22,6 +22,8 @@ class TestArguments(unittest.TestCase):
args = swig_paddle.Arguments.createArguments(1)
args.setSlotValue(0, m)
+ self.assertAlmostEqual(27.0, args.sumCosts())
+
mat = args.getSlotValue(0)
assert isinstance(mat, swig_paddle.Matrix)
np_mat = mat.toNumpyMatInplace()
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py
index b81eafa9673ca34f1b7e06401098d55bdb1b35a5..4b705f66eccd267f326fe0662a17b33a09fda982 100644
--- a/paddle/api/test/testGradientMachine.py
+++ b/paddle/api/test/testGradientMachine.py
@@ -45,6 +45,7 @@ class TestGradientMachine(unittest.TestCase):
assert isinstance(val, swig_paddle.Vector)
arr = numpy.full((len(val), ), 0.1, dtype="float32")
val.copyFromNumpyArray(arr)
+ self.assertTrue(param.save(param.getName()))
param_config = param.getConfig().toProto()
assert isinstance(param_config,
paddle.proto.ParameterConfig_pb2.ParameterConfig)
@@ -92,6 +93,9 @@ class TestGradientMachine(unittest.TestCase):
self.assertTrue(self.isCalled)
+ for param in machine.getParameters():
+ self.assertTrue(param.load(param.getName()))
+
def test_train_one_pass(self):
conf_file_path = './testTrainConfig.py'
trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index 57fb89608f4bcf3e6829fe850a61c2a626adfbdc..a28ccd6f07cfd56b7f1978f67fdcf6e7e5fe6337 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
endif()
set(CUDA_CXX_WITH_GPU_SOURCES
- src/hl_cudart_wrap.cc
src/hl_cuda_cublas.cc
src/hl_cuda_cudnn.cc
src/hl_cuda_device.cc)
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index 20c13f21e61a92b0635b686f6f724ae2b44518cc..276a07d3c735c771c851e8b4bd14c720f9ab6569 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
*/
void GetCudnnDsoHandle(void** dso_handle);
-/**
- * @brief load the DSO of CUDA Run Time
- *
- * @param **dso_handle dso handler
- *
- */
-void GetCudartDsoHandle(void** dso_handle);
-
/**
* @brief load the DSO of CURAND
*
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index a71eecba2736234dafaf6b67e5efac5358a30871..6dfb12e00b80db36ad2e53326b880c7d1ed59263 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -22,10 +22,9 @@ limitations under the License. */
#include
#include
#include
-#include "hl_cuda.h"
#include "hl_cuda.ph"
-#include "hl_dso_loader.h"
#include "hl_thread.ph"
+#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
// clang-format on
@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef CURAND_RAND_ROUTINE_EACH
#undef DYNAMIC_LOAD_CURAND_WRAP
-std::once_flag cudart_dso_flag;
-void *cudart_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudart_func = decltype(__name(args...)) (*)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
- void *p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro) \
- __macro(cudaMalloc) \
- __macro(cudaHostAlloc) \
- __macro(cudaFree) \
- __macro(cudaFreeHost) \
- __macro(cudaMemcpy) \
- __macro(cudaMemset) \
- __macro(cudaMemcpyAsync) \
- __macro(cudaSetDevice) \
- __macro(cudaGetDevice) \
- __macro(cudaGetDeviceCount) \
- __macro(cudaGetDeviceProperties) \
- __macro(cudaDeviceSynchronize) \
- __macro(cudaDeviceCanAccessPeer) \
- __macro(cudaDeviceEnablePeerAccess) \
- __macro(cudaStreamCreate) \
- __macro(cudaStreamDestroy) \
- __macro(cudaStreamSynchronize) \
- __macro(cudaStreamWaitEvent) \
- __macro(cudaEventCreate) \
- __macro(cudaEventRecord) \
- __macro(cudaEventQuery) \
- __macro(cudaEventDestroy) \
- __macro(cudaEventSynchronize) \
- __macro(cudaEventElapsedTime) \
- __macro(cudaSetDeviceFlags) \
- __macro(cudaGetLastError) \
- __macro(cudaFuncSetCacheConfig) \
- __macro(cudaRuntimeGetVersion) \
- __macro(cudaGetErrorString) \
- __macro(cudaProfilerStart) \
- __macro(cudaProfilerStop)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#undef CUDA_ROUNTINE_EACH
-#undef DYNAMIC_LOAD_CUDART_WRAP
-
} /* namespace dynload */
/**
@@ -171,11 +98,11 @@ int g_cuda_lib_version = 0;
* Check build-in cuda function using glog and it **does not**
* support << operator for more details error info.
*/
-#define CHECK_CUDA(cudaFunc) \
- do { \
- cudaError_t cudaStat = cudaFunc; \
- CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
- << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc) \
+ do { \
+ cudaError_t cudaStat = cudaFunc; \
+ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
+ << cudaGetErrorString(cudaStat); \
} while (0)
/**
@@ -284,13 +211,13 @@ void hl_fini() {
tmp_stream = (char *)t_device[dev]->stream;
}
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
- CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
+ CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
}
/* free device memory */
hl_free_mem_device(t_device[dev]->gpu_mem);
hl_free_mem_host(t_device[dev]->cpu_mem);
- CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event));
+ CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
}
free(tmp);
@@ -308,7 +235,7 @@ void hl_set_device(int device) {
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
<< "Device: " << device << " is not specified in startup.";
- CHECK_CUDA(dynload::cudaSetDevice(device));
+ CHECK_CUDA(cudaSetDevice(device));
/* switch thread stream */
for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
@@ -336,7 +263,7 @@ void hl_set_device(int device) {
int hl_get_device() {
int device;
- CHECK_CUDA(dynload::cudaGetDevice(&device));
+ CHECK_CUDA(cudaGetDevice(&device));
return device;
}
@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
+ CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
return dest_d;
}
@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
void hl_free_mem_device(void *dest_d) {
CHECK_NOTNULL(dest_d);
- cudaError_t err = dynload::cudaFree(dest_d);
+ cudaError_t err = cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string();
}
@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(
- dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
+ CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h;
}
@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h);
- cudaError_t err = dynload::cudaFreeHost(dest_h);
+ cudaError_t err = cudaFreeHost(dest_h);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string();
}
@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
}
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
- CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+ CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
}
void hl_memset_device(void *dest_d, int value, size_t size) {
- CHECK_CUDA(dynload::cudaMemset(dest_d, value, size));
+ CHECK_CUDA(cudaMemset(dest_d, value, size));
}
void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
}
CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+ CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
}
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+ CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
}
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(
- dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
}
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(
- dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
+ CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
}
void hl_start() {
@@ -439,8 +363,7 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer;
- CHECK_CUDA(
- dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
+ CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) {
return true;
@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
}
void hl_device_enable_peer_access(int peerDevice) {
- cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0);
+ cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
if (cudaErrorPeerAccessAlreadyEnabled == err) {
- dynload::cudaGetLastError();
+ cudaGetLastError();
} else {
CHECK_CUDA(err);
}
@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
int device = device_prop->device;
global_device_resources device_res = device_prop->device_resources;
- CHECK_CUDA(dynload::cudaSetDevice(device));
+ CHECK_CUDA(cudaSetDevice(device));
/* device properties */
- CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device));
+ CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
device_prop->major = cu_prop.major;
device_prop->minor = cu_prop.minor;
@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create device stream */
for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
- CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+ CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
}
/* cublas init */
@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL);
- CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
+ CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
}
int hl_get_cuda_version() { return g_cuda_lib_version; }
void hl_create_thread_resources(int device,
thread_device_resources device_res) {
- CHECK_CUDA(dynload::cudaSetDevice(device));
+ CHECK_CUDA(cudaSetDevice(device));
/* create thread stream */
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
- CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+ CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
}
/* allocation device memory */
@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
/* allocation host memory */
device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
- CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
+ CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
}
void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return;
/* 1. get the number of devices */
- CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num));
+ CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
if (device == NULL) {
number = g_system_device_num;
@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
<< ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
+ CHECK_CUDA(cudaStreamSynchronize(cu_stream));
}
void hl_create_event(hl_event_t *event) {
@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
struct _hl_event_st *st_event =
(struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
- CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
+ CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
*event = st_event;
}
@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start);
CHECK_NOTNULL(end);
- CHECK_CUDA(
- dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
+ CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time;
}
@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
<< ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
+ CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
}
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
<< ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+ CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
}
void hl_destroy_event(hl_event_t event) {
CHECK_NOTNULL(event);
- CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event));
+ CHECK_CUDA(cudaEventDestroy(event->cu_event));
free(event);
event = NULL;
@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
void hl_event_synchronize(hl_event_t event) {
CHECK_NOTNULL(event);
- CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event));
+ CHECK_CUDA(cudaEventSynchronize(event->cu_event));
}
void hl_get_device_name(char *name, int len, int device) {
@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
*minor = g_device[device]->minor;
}
-int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
const char *hl_get_device_error_string() {
- cudaError_t err = dynload::cudaGetLastError();
- return dynload::cudaGetErrorString(err);
+ cudaError_t err = cudaGetLastError();
+ return cudaGetErrorString(err);
}
const char *hl_get_device_error_string(size_t err) {
- return dynload::cudaGetErrorString((cudaError_t)err);
+ return cudaGetErrorString((cudaError_t)err);
}
-void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
void hl_set_device_flags_block() {
- CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+ CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
}
bool hl_cuda_event_is_ready(hl_event_t event) {
- cudaError_t err = dynload::cudaEventQuery(event->cu_event);
+ cudaError_t err = cudaEventQuery(event->cu_event);
CHECK(cudaSuccess == err || cudaErrorNotReady == err);
if (cudaErrorNotReady == err) {
@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
return true;
}
-void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
-void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
deleted file mode 100644
index ecc03a729dde2f2b4f8f004234a47d9272997a50..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_USE_DSO
-
-#include
-#include
-#include "hl_dso_loader.h"
-
-/**
- * cudart wrapper: for dynamic load libcudart.so.
- * When nvcc compile cuda kernels, it will insert
- * some build-in runtime routines, which must be
- * provided by us if PADDLE_USE_DSO is true. If
- * PADDLE_USE_DSO is false, all of them must be
- * ignored to avoid multiple definitions.
- */
-namespace dynload {
-
-extern std::once_flag cudart_dso_flag;
-extern void *cudart_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
- struct DynLoad__##__name { \
- template \
- __type operator()(Args... args) { \
- typedef __type (*cudartFunc)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
- void *p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro) \
- __macro(cudaLaunch, cudaError_t) \
- __macro(cudaSetupArgument, cudaError_t) \
- __macro(cudaConfigureCall, cudaError_t) \
- __macro(__cudaRegisterFatBinary, void**) \
- __macro(__cudaUnregisterFatBinary, void) \
- __macro(__cudaRegisterFunction, void) \
- __macro(__cudaRegisterVar, void) \
- __macro(__cudaRegisterManagedVar, void) \
- __macro(__cudaInitModule, char) \
- __macro(__cudaRegisterTexture, void) \
- __macro(__cudaRegisterSurface, void)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#if CUDART_VERSION >= 7000
-DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
-#endif
-
-#undef CUDA_ROUNTINE_EACH
-
-} /* namespace dynload */
-
-#if CUDART_VERSION >= 7000
-__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
- dim3 gridDim,
- dim3 blockDim,
- void **args,
- size_t sharedMem,
- cudaStream_t stream) {
- return dynload::cudaLaunchKernel(
- func, gridDim, blockDim, args, sharedMem, stream);
-}
-#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
- return dynload::cudaLaunch(func);
-}
-
-__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
- size_t size,
- size_t offset) {
- return dynload::cudaSetupArgument(arg, size, offset);
-}
-
-__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
- dim3 blockDim,
- size_t sharedMem,
- cudaStream_t stream) {
- return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
-}
-
-extern "C" {
-
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
- return dynload::__cudaRegisterFatBinary(fatCubin);
-}
-
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
- return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
- const char *hostFun,
- char *deviceFun,
- const char *deviceName,
- int thread_limit,
- uint3 *tid,
- uint3 *bid,
- dim3 *bDim,
- dim3 *gDim,
- int *wSize) {
- return dynload::__cudaRegisterFunction(fatCubinHandle,
- hostFun,
- deviceFun,
- deviceName,
- thread_limit,
- tid,
- bid,
- bDim,
- gDim,
- wSize);
-}
-
-void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
- char *hostVar,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global) {
- return dynload::__cudaRegisterVar(fatCubinHandle,
- hostVar,
- deviceAddress,
- deviceName,
- ext,
- size,
- constant,
- global);
-}
-
-extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
- void **hostVarPtrAddress,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global) {
- return dynload::__cudaRegisterManagedVar(fatCubinHandle,
- hostVarPtrAddress,
- deviceAddress,
- deviceName,
- ext,
- size,
- constant,
- global);
-}
-
-char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
- return dynload::__cudaInitModule(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
- const struct textureReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int norm,
- int ext) {
- return dynload::__cudaRegisterTexture(
- fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
-}
-
-void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
- const struct surfaceReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int ext) {
- return dynload::__cudaRegisterSurface(
- fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
-}
-
-} /* extern "C" */
-
-#endif
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index c92909de534a875028d6d4784b02f08648c85a9a..53164dd27c7c5f5254e743b6fcf1d7b6fc895e31 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
DEFINE_string(cuda_dir,
"",
"Specify path for loading cuda library, such as libcublas, "
- "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
- "libcudart can not be specified by cuda_dir, since some "
- "build-in function in cudart already ran before main entry). "
- "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+ "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+ "dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
#endif
}
-void GetCudartDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
-#else
- GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
-#endif
-}
-
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fde48a73b61c31d06225cc1763efbc6971c86f57
--- /dev/null
+++ b/paddle/function/BufferArg.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+
+#include "BufferArg.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+const SequenceArg& BufferArg::sequence() const {
+ // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+ return dynamic_cast(*this);
+}
+
+const SparseMatrixArg& BufferArg::sparse() const {
+ // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+ return dynamic_cast(*this);
+}
+
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+ : BufferArg(sparse, argType),
+ row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32),
+ col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {}
+
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+ : BufferArg(sparse, argType),
+ row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32),
+ col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {}
+
+} // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
new file mode 100644
index 0000000000000000000000000000000000000000..12352ba29e33920ba65bd66088b6f7cc53517b52
--- /dev/null
+++ b/paddle/function/BufferArg.h
@@ -0,0 +1,273 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum BufferType {
+ TENSOR_NORMAL = 0,
+ TENSOR_SEQUENCE_ID = 1,
+ TENSOR_SEQUENCE_DATA = 2,
+ TENSOR_SPARSE = 3
+};
+
+enum SparseDataType {
+ SPARSE_NO_VALUE = 0, // do not need value pointer, all values are 1
+ SPARSE_FLOAT_VALUE = 1
+};
+
+enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
+
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+typedef std::shared_ptr BufferArgPtr;
+
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
+
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+ UNSPECIFIED = 0,
+ ASSIGN_TO = 1,
+ ADD_TO = 2,
+};
+class BufferArg {
+public:
+ void setArgType(ArgType argType) { argType_ = argType; }
+
+ ArgType getArgType() const { return argType_; }
+
+public:
+ BufferArg(void* buf,
+ ValueType valueType,
+ const TensorShape& shape,
+ ArgType argType = UNSPECIFIED)
+ : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
+
+ BufferArg(void* buf, ValueType valueType)
+ : buf_(buf), valueType_(valueType) {}
+
+ BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
+ : buf_(
+ const_cast(reinterpret_cast(matrix.getData()))),
+ valueType_(DataType::value),
+ shape_(2),
+ argType_(argType) {
+ shape_.setDim(0, matrix.getHeight());
+ shape_.setDim(1, matrix.getWidth());
+ }
+
+ BufferArg(const Matrix& matrix,
+ const TensorShape& shape,
+ ArgType argType = UNSPECIFIED)
+ : buf_(
+ const_cast(reinterpret_cast(matrix.getData()))),
+ valueType_(DataType::value),
+ shape_(shape),
+ argType_(argType) {
+ CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+ }
+
+ BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
+ : buf_(
+ const_cast(reinterpret_cast(vector.getData()))),
+ valueType_(DataType::value),
+ shape_(1),
+ argType_(argType) {
+ shape_.setDim(0, vector.getSize());
+ }
+
+ BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
+ : buf_(
+ const_cast(reinterpret_cast(vector.getData()))),
+ valueType_(VALUE_TYPE_INT32),
+ shape_(1),
+ argType_(argType) {
+ shape_.setDim(0, vector.getSize());
+ }
+
+ template
+ typename Tensor::Matrix matrix() const {
+ CHECK(buf_);
+ CHECK(valueType_ == DataType::value);
+ // CHECK(deviceType_ == DType);
+ CHECK_EQ((size_t)2, shape_.ndims());
+ return typename Tensor::Matrix(
+ reinterpret_cast(buf_), shape_[0], shape_[1]);
+ }
+
+ template
+ typename Tensor::Vector vector() const {
+ CHECK(buf_);
+ CHECK(valueType_ == DataType::value);
+ // CHECK(deviceType_ == DType);
+ CHECK_EQ((size_t)1, shape_.ndims());
+ return typename Tensor::Vector(
+ shape_[0], reinterpret_cast(buf_));
+ }
+
+ virtual ~BufferArg() {}
+
+ template
+ T* data() const {
+ return reinterpret_cast(buf_);
+ }
+
+ void* data() const { return buf_; }
+ ValueType valueType() const { return valueType_; }
+ BufferType bufferType() const { return bufferType_; }
+ const TensorShape& shape() const { return shape_; }
+
+ const SequenceArg& sequence() const;
+ const SparseMatrixArg& sparse() const;
+
+protected:
+ void* buf_;
+ ValueType valueType_;
+ TensorShape shape_;
+ BufferType bufferType_;
+ ArgType argType_ = UNSPECIFIED;
+ // leading dimensions. The size is dims_.size()
+ // Dims lds_;
+};
+
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1
+// valueType_ = int32
+// if a < b then value_.buf_[a] < value_.buf_[b]
+class SequenceIdArg : public BufferArg {
+public:
+ SequenceIdArg(void* buf,
+ const TensorShape& shape,
+ ArgType argType = UNSPECIFIED)
+ : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+ CHECK_EQ(shape_.ndims(), (size_t)1);
+ numSeqs_ = shape_[0] - 1;
+ }
+
+ SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+ numSeqs_ = shape_[0] - 1;
+ }
+
+ ~SequenceIdArg() {}
+
+ size_t numSeqs() const { return numSeqs_; }
+
+private:
+ size_t numSeqs_;
+};
+
+// sequence data
+class SequenceArg : public BufferArg {
+public:
+ SequenceArg(void* buf,
+ ValueType valueType,
+ const TensorShape& shape,
+ const SequenceIdArg& startPositions,
+ ArgType argType = UNSPECIFIED)
+ : BufferArg(buf, valueType, shape, argType),
+ startPositions_(startPositions) {}
+
+ SequenceArg(const Matrix& matrix,
+ const IVector& vector,
+ ArgType argType = UNSPECIFIED)
+ : BufferArg(matrix, argType), startPositions_(vector) {}
+
+ ~SequenceArg() {}
+
+ void* getIdBuf() const { return startPositions_.data(); }
+ size_t numSeqs() const { return startPositions_.numSeqs(); }
+
+private:
+ SequenceIdArg startPositions_;
+};
+
+// sparse matrix
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+public:
+ SparseMatrixArg(void* buf,
+ ValueType valueType,
+ const TensorShape& shape,
+ const BufferArg& row,
+ const BufferArg& col,
+ size_t nnz,
+ SparseDataFormat format,
+ SparseDataType type,
+ ArgType argType = UNSPECIFIED)
+ : BufferArg(buf, valueType, shape, argType),
+ row_(row),
+ col_(col),
+ nnz_(nnz),
+ format_(format),
+ type_(type) {
+ CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+ CHECK_EQ(shape_.ndims(), (size_t)2);
+ CHECK_EQ(row_.shape().ndims(), (size_t)1);
+ CHECK_EQ(col_.shape().ndims(), (size_t)1);
+ if (format == SPARSE_CSR_FORMAT) {
+ CHECK_EQ(nnz, col.shape()[0]);
+ } else if (format == SPARSE_CSC_FORMAT) {
+ CHECK_EQ(nnz, row.shape()[0]);
+ }
+ }
+
+ SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+ SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+ ~SparseMatrixArg() {}
+
+ void* getRowBuf() const { return row_.data(); }
+
+ void* getColBuf() const { return col_.data(); }
+
+ size_t nnz() const { return nnz_; }
+
+ SparseDataFormat dataFormat() const { return format_; }
+
+ SparseDataType dataType() const { return type_; }
+
+private:
+ BufferArg row_;
+ BufferArg col_;
+ size_t nnz_;
+ SparseDataFormat format_;
+ SparseDataType type_;
+};
+
+} // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b345597435c9911ce95b596f5f7f2add47f4cd03
--- /dev/null
+++ b/paddle/function/BufferArgTest.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BufferArg.h"
+#include
+#include "Function.h"
+#include "paddle/math/MemoryHandle.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+TEST(BufferTest, BufferArg) {
+ TensorShape shape({8, 10});
+ CpuMemoryHandle memory(shape.getElements() *
+ sizeOfValuType(VALUE_TYPE_FLOAT));
+ BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
+ EXPECT_EQ(buffer.data(), memory.getBuf());
+}
+
+TEST(BufferTest, SequenceIdArg) {
+ TensorShape shape({10});
+ CpuMemoryHandle memory(shape.getElements() *
+ sizeOfValuType(VALUE_TYPE_INT32));
+ SequenceIdArg buffer(memory.getBuf(), shape);
+ EXPECT_EQ(buffer.data(), memory.getBuf());
+ EXPECT_EQ(buffer.numSeqs(), 9);
+}
+
+TEST(BufferTest, asArgument) {
+ MatrixPtr matrix = Matrix::create(100, 200);
+ VectorPtr vector = Vector::create(100, false);
+ CpuSparseMatrix sparse(200, 300, 50);
+
+ // prepare arguments
+ BufferArgs argments;
+ argments.addArg(*matrix);
+ argments.addArg(*vector);
+ argments.addArg(sparse);
+
+ // function
+ auto function = [=](const BufferArgs& inputs) {
+ EXPECT_EQ(inputs.size(), 3);
+
+ // check inputs[0]
+ EXPECT_EQ(inputs[0].shape().ndims(), 2);
+ EXPECT_EQ(inputs[0].shape()[0], 100);
+ EXPECT_EQ(inputs[0].shape()[1], 200);
+ EXPECT_EQ(inputs[0].data(), matrix->getData());
+
+ EXPECT_EQ(inputs[0].matrix().getHeight(),
+ matrix->getHeight());
+ EXPECT_EQ(inputs[0].matrix().getWidth(),
+ matrix->getWidth());
+ EXPECT_EQ(inputs[0].matrix().getData(), matrix->getData());
+
+ // check inputs[1]
+ EXPECT_EQ(inputs[1].shape().ndims(), 1);
+ EXPECT_EQ(inputs[1].shape()[0], 100);
+ EXPECT_EQ(inputs[1].data(), vector->getData());
+ CpuVector inVector = inputs[1].vector();
+ EXPECT_EQ(inVector.getSize(), vector->getSize());
+ EXPECT_EQ(inVector.getData(), vector->getData());
+
+ // check inputs[2]
+ EXPECT_EQ(inputs[2].shape().ndims(), 2);
+ EXPECT_EQ(inputs[2].shape()[0], 200);
+ EXPECT_EQ(inputs[2].shape()[1], 300);
+ EXPECT_EQ(inputs[2].data(), sparse.getData());
+ // CHECK_EQ(inputs[2].sparse().nnz(), 50);
+ // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
+ // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
+ EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
+ EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
+ };
+
+ // call function
+ function(argments);
+}
+
+} // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index de85eeca821742e1d39d5ce26f873238d4359cba..75a2acc55ec3d33687f96d2b0398e52b69e8680d 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -3,6 +3,7 @@ file(GLOB cpp_files . *Op.cpp)
list(APPEND h_files Function.h)
list(APPEND cpp_files Function.cpp)
+list(APPEND cpp_files BufferArg.cpp)
if(WITH_GPU)
file(GLOB cu_files . *OpGpu.cu)
@@ -18,8 +19,12 @@ if(WITH_TESTING)
# TODO:
# file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
- add_simple_unittest(CrossMapNormalOpTest)
- add_simple_unittest(ContextProjectionOpTest)
+ # add_simple_unittest(CrossMapNormalOpTest)
+ add_simple_unittest(TensorShapeTest)
+ add_simple_unittest(TensorTypeTest)
+ add_simple_unittest(BufferArgTest)
+ add_simple_unittest(FunctionTest)
+ # add_simple_unittest(ContextProjectionOpTest)
endif()
endif()
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 07907fc1ba7973c728c3a882e4be6b1a7ef7a97a..cb448562ebb37022f727ee65024f06f69d63e9cb 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -19,17 +19,15 @@ limitations under the License. */
namespace paddle {
template <>
-void ContextProjectionForward(CpuMatrix* out_mat,
- const CpuMatrix* input_mat,
- const CpuMatrix* weight_mat,
+void ContextProjectionForward(CpuMatrix& out_mat,
+ const CpuMatrix& input_mat,
+ const CpuMatrix& weight_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad) {
const int* starts = seq_vec.getData();
const size_t num_sequences = seq_vec.getSize() - 1;
- auto w_mat = const_cast(weight_mat);
- auto in_mat = const_cast(input_mat);
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
@@ -39,10 +37,11 @@ void ContextProjectionForward(CpuMatrix* out_mat,
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
- MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
- if (w_mat) {
- MatrixPtr sub = w_mat->subMatrix(j, pad_size);
- mat->addAtOffset(*sub, j * in_mat->getWidth());
+ MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
+ if (weight_mat) {
+ MatrixPtr sub =
+ const_cast(weight_mat).subMatrix(j, pad_size);
+ mat->addAtOffset(*sub, j * input_mat.getWidth());
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
@@ -50,19 +49,22 @@ void ContextProjectionForward(CpuMatrix* out_mat,
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
- MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
- if (w_mat) {
- MatrixPtr sub = w_mat->subMatrix(
- begin_pad + context_start + j - pad_size, pad_size);
- mat->addAtOffset(*sub, j * in_mat->getWidth());
+ MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+ if (weight_mat) {
+ MatrixPtr sub =
+ const_cast(weight_mat)
+ .subMatrix(begin_pad + context_start + j - pad_size,
+ pad_size);
+ mat->addAtOffset(*sub, j * input_mat.getWidth());
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
- MatrixPtr src = in_mat->subMatrix(begin, end - begin);
- MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
- dst->addAtOffset(*src, j * in_mat->getWidth());
+ MatrixPtr src =
+ const_cast(input_mat).subMatrix(begin, end - begin);
+ MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+ dst->addAtOffset(*src, j * input_mat.getWidth());
}
}
}
@@ -82,40 +84,32 @@ public:
begin_pad_ = config.get("begin_pad");
}
- void calc(const Arguments& inputs,
- const Arguments& outputs,
- const Arguments& inouts) override {
- CHECK_EQ(3, static_cast(inputs.size()));
- CHECK_EQ(1, static_cast(outputs.size()));
- CHECK_EQ(0, static_cast(inouts.size()));
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ((size_t)3, inputs.size());
+ CHECK_EQ((size_t)1, outputs.size());
- CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
- CHECK_EQ(static_cast(outputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast(inputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast(inputs[1].dims_.size()), 2);
- CHECK_EQ(static_cast(inputs[2].dims_.size()), 1);
+ CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
+ CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+ CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+ CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+ CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
/// dim of output = dim of input * context_length
- CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+ CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
/// dim of input == dim of weight
- CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+ CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
/// input and output has the same batch_size
- CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
-
- auto out_mat = std::make_shared::type>(
- outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
- const auto in_mat = std::make_shared::type>(
- inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
- const auto w_mat =
- !inputs[1].getData()
- ? nullptr
- : std::make_shared::type>(
- inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
- typename SequenceT::type seq_vec(
- inputs[2].dims_[0], reinterpret_cast(inputs[2].getData()));
-
- ContextProjectionForward(out_mat.get(),
- in_mat.get(),
- w_mat.get(),
+ CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+ auto out_mat = outputs[0].matrix();
+ auto in_mat = inputs[0].matrix();
+ auto w_mat = !inputs[1].data()
+ ? typename Tensor::Matrix(nullptr, 0, 0)
+ : inputs[1].matrix();
+ auto seq_vec = inputs[2].vector();
+ ContextProjectionForward(out_mat,
+ in_mat,
+ w_mat,
seq_vec,
context_length_,
context_start_,
@@ -129,18 +123,17 @@ private:
};
template <>
-void ContextProjectionBackward(CpuMatrix* out_grad_mat,
- CpuMatrix* in_grad_mat,
- CpuMatrix* w_grad_mat,
+void ContextProjectionBackward(CpuMatrix& out_grad_mat,
+ CpuMatrix& in_grad_mat,
+ CpuMatrix& w_grad_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
- CHECK(out_grad_mat);
- size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
- : w_grad_mat ? w_grad_mat->getWidth() : 0;
+ size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
+ : w_grad_mat ? w_grad_mat.getWidth() : 0;
const int* starts = seq_vec.getData();
size_t num_sequences = seq_vec.getSize() - 1;
for (size_t i = 0; i < num_sequences; ++i) {
@@ -153,8 +146,8 @@ void ContextProjectionBackward(CpuMatrix* out_grad_mat,
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
- MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
- MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+ MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+ MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_begin = starts[i] + pad_size;
@@ -165,8 +158,8 @@ void ContextProjectionBackward(CpuMatrix* out_grad_mat,
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat =
- out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
- MatrixPtr sub = w_grad_mat->subMatrix(
+ out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+ MatrixPtr sub = w_grad_mat.subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
@@ -175,8 +168,8 @@ void ContextProjectionBackward(CpuMatrix* out_grad_mat,
}
if (end <= begin) continue;
if (!in_grad_mat) continue;
- MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
- MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+ MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
+ MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
src->addAtOffset(*dst, j * input_dim);
}
}
@@ -199,44 +192,36 @@ public:
total_pad_ = config.get("total_pad");
}
- void calc(const Arguments& inputs,
- const Arguments& outputs,
- const Arguments& inouts) override {
- CHECK_EQ(3, static_cast(inputs.size()));
- CHECK_EQ(1, static_cast(outputs.size()));
- CHECK_EQ(0, static_cast(inouts.size()));
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ((size_t)3, inputs.size());
+ CHECK_EQ((size_t)1, outputs.size());
- CHECK(outputs[0].getData() && inputs[2].getData());
- CHECK_EQ(static_cast(outputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast(inputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast(inputs[1].dims_.size()), 2);
- CHECK_EQ(static_cast(inputs[2].dims_.size()), 1);
+ CHECK(outputs[0].data() && inputs[2].data());
+ CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+ CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+ CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+ CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
/// dim of input == dim of weight
- CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+ CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
/// input and output has the same batch_size
- CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+ CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
/// dim of output = dim of input * context_length
- CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+ CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
- auto out_grad_mat = std::make_shared::type>(
- outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
- auto in_grad_mat =
- !inputs[0].getData()
- ? nullptr
- : std::make_shared::type>(
- inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
- auto w_grad_mat =
- !inputs[1].getData()
- ? nullptr
- : std::make_shared::type>(
- inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
- typename SequenceT::type seq_vec(
- inputs[2].dims_[0], reinterpret_cast(inputs[2].getData()));
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
- ContextProjectionBackward(out_grad_mat.get(),
- in_grad_mat ? in_grad_mat.get() : nullptr,
- w_grad_mat ? w_grad_mat.get() : nullptr,
+ auto out_grad_mat = outputs[0].matrix();
+ auto in_grad_mat =
+ !inputs[0].data() ? typename Tensor::Matrix(nullptr, 0, 0)
+ : inputs[0].matrix();
+ auto w_grad_mat = !inputs[1].data()
+ ? typename Tensor::Matrix(nullptr, 0, 0)
+ : inputs[1].matrix();
+ auto seq_vec = inputs[2].vector();
+ ContextProjectionBackward(out_grad_mat,
+ in_grad_mat,
+ w_grad_mat,
seq_vec,
context_length_,
context_start_,
@@ -253,6 +238,7 @@ private:
size_t total_pad_;
};
+#if 0
/**
* \param inputs[0] input grad.
* \param inputs[1] input sequence.
@@ -349,6 +335,7 @@ private:
size_t begin_pad_;
size_t total_pad_;
};
+#endif
REGISTER_TYPED_FUNC(ContextProjectionForward,
CPU,
@@ -363,6 +350,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
REGISTER_TYPED_FUNC(ContextProjectionBackward,
GPU,
ContextProjectionBackwardFunc);
+#if 0
REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
GPU,
ContextProjectionBackwardDataFunc);
@@ -370,4 +358,5 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
GPU,
ContextProjectionBackwardWeightFunc);
#endif
+#endif
} // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index 93eb050fde35f474750f3c2efa72b7471f654b75..a558df5e072f2f4dcc5c45afa385b3cf88872d26 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -31,14 +31,15 @@ namespace paddle {
* \param[in] is_padding whether padding 0 or not.
*
*/
-template
-void ContextProjectionForward(typename MatrixT::type* output,
- const typename MatrixT::type* input,
- const typename MatrixT::type* weight,
- const typename SequenceT::type& sequence,
- size_t context_length,
- int context_start,
- size_t begin_pad);
+template
+void ContextProjectionForward(
+ typename Tensor::Matrix& output,
+ const typename Tensor::Matrix& input,
+ const typename Tensor::Matrix& weight,
+ const typename Tensor::Vector& sequence,
+ size_t context_length,
+ int context_start,
+ size_t begin_pad);
/**
* \brief Context Projection Backward.
@@ -53,30 +54,31 @@ void ContextProjectionForward(typename MatrixT::type* output,
* \param[in] is_padding whether padding 0 or not.
*
*/
-template
-void ContextProjectionBackward(typename MatrixT::type* out_grad,
- typename MatrixT::type* in_grad,
- typename MatrixT::type* w_grad,
- const typename SequenceT::type& seq_vec,
- size_t context_length,
- int context_start,
- size_t begin_pad,
- bool is_padding,
- size_t total_pad);
+template
+void ContextProjectionBackward(
+ typename Tensor::Matrix& out_grad,
+ typename Tensor::Matrix& in_grad,
+ typename Tensor::Matrix& w_grad,
+ const typename Tensor::Vector& seq_vec,
+ size_t context_length,
+ int context_start,
+ size_t begin_pad,
+ bool is_padding,
+ size_t total_pad);
-template
+template
void ContextProjectionBackwardData(
- typename MatrixT::type* out_grad,
- typename MatrixT::type* in_grad,
- const typename SequenceT::type& sequence,
+ typename Tensor::Matrix& out_grad,
+ typename Tensor::Matrix& in_grad,
+ const typename Tensor::Vector& sequence,
size_t context_length,
int context_start);
-template
+template
void ContextProjectionBackwardWeight(
- typename MatrixT::type* out_grad,
- typename MatrixT::type* w_grad,
- const typename SequenceT::type& seq_vec,
+ typename Tensor::Matrix& out_grad,
+ typename Tensor::Matrix& w_grad,
+ const typename Tensor::Vector& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 1ec7058f96c8200728e5add051d5fa6a77a97e36..6a4a01a6510416fc1f945305203f55ece7a28f11 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -120,20 +120,19 @@ void hl_context_projection_forward(const real* input,
}
template <>
-void ContextProjectionForward(GpuMatrix* output,
- const GpuMatrix* input,
- const GpuMatrix* weight,
+void ContextProjectionForward(GpuMatrix& output,
+ const GpuMatrix& input,
+ const GpuMatrix& weight,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad) {
- CHECK(input && output);
- hl_context_projection_forward(input->getData(),
+ hl_context_projection_forward(input.getData(),
sequence.getData(),
- weight ? weight->getData() : nullptr,
- output->getData(),
+ weight ? weight.getData() : nullptr,
+ output.getData(),
sequence.getSize() - 1,
- input->getWidth(),
+ input.getWidth(),
context_length,
context_start,
begin_pad);
@@ -217,17 +216,16 @@ void hl_context_projection_backward_data(real* out_grad,
}
template <>
-void ContextProjectionBackwardData(GpuMatrix* out_grad,
- GpuMatrix* in_grad,
+void ContextProjectionBackwardData(GpuMatrix& out_grad,
+ GpuMatrix& in_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start) {
- CHECK(in_grad && out_grad);
- hl_context_projection_backward_data(out_grad->getData(),
+ hl_context_projection_backward_data(out_grad.getData(),
sequence.getData(),
- in_grad->getData(),
+ in_grad.getData(),
sequence.getSize() - 1,
- in_grad->getWidth(),
+ in_grad.getWidth(),
context_length,
context_start);
}
@@ -348,19 +346,18 @@ void hl_context_projection_backward_weight(real* out_grad,
template <>
void ContextProjectionBackwardWeight(
- GpuMatrix* out_grad,
- GpuMatrix* w_grad,
+ GpuMatrix& out_grad,
+ GpuMatrix& w_grad,
const GpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad) {
- CHECK(out_grad && w_grad);
- hl_context_projection_backward_weight(out_grad->getData(),
+ hl_context_projection_backward_weight(out_grad.getData(),
seq_vec.getData(),
- w_grad->getData(),
+ w_grad.getData(),
seq_vec.getSize() - 1,
- w_grad->getWidth(),
+ w_grad.getWidth(),
total_pad,
context_length,
context_start,
@@ -368,16 +365,15 @@ void ContextProjectionBackwardWeight(
}
template <>
-void ContextProjectionBackward(GpuMatrix* out_grad,
- GpuMatrix* in_grad,
- GpuMatrix* w_grad,
+void ContextProjectionBackward(GpuMatrix& out_grad,
+ GpuMatrix& in_grad,
+ GpuMatrix& w_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
- CHECK(out_grad);
if (in_grad) {
ContextProjectionBackwardData(
out_grad,
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 96a7a30eebbf0f01fa89ea91110ddb826fd2f64b..92980c503fdaaaa9ac600070197dba6ba4bfb7a4 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,6 +112,8 @@ void CrossMapNormalGrad(real* inputsGrad,
}
/**
+ * \brief {o_0, o_1} = calc(i_0)
+ *
* \param inputs[0] input value.
* \param outputs[0] output value.
* \param outputs[1] denoms.
@@ -125,27 +127,24 @@ public:
pow_ = config.get("pow");
}
- void calc(const Arguments& inputs,
- const Arguments& outputs,
- const Arguments& inouts) override {
- CHECK_EQ(1, static_cast(inputs.size()));
- CHECK_EQ(2, static_cast(outputs.size()));
- CHECK_EQ(0, static_cast(inouts.size()));
-
- CHECK_EQ(static_cast(inputs[0].dims_.size()), 4);
- for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
- CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
- CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
- }
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ((size_t)1, inputs.size());
+ CHECK_EQ((size_t)2, outputs.size());
+
+ CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+ CHECK(inputs[0].shape() == outputs[0].shape());
+ CHECK(inputs[0].shape() == outputs[1].shape());
- size_t samples = inputs[0].dims_[0];
- size_t channels = inputs[0].dims_[1];
- size_t height = inputs[0].dims_[2];
- size_t width = inputs[0].dims_[3];
+ CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+ CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
+ size_t samples = inputs[0].shape()[0];
+ size_t channels = inputs[0].shape()[1];
+ size_t height = inputs[0].shape()[2];
+ size_t width = inputs[0].shape()[3];
- CrossMapNormal(outputs[0].getData(),
- outputs[1].getData(),
- inputs[0].getData(),
+ CrossMapNormal(outputs[0].data(),
+ outputs[1].data(),
+ inputs[0].data(),
samples,
channels,
height,
@@ -162,6 +161,8 @@ private:
};
/**
+ * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ *
* \param inputs[0] input value.
* \param inputs[1] output value.
* \param inputs[2] output grad.
@@ -177,31 +178,29 @@ public:
pow_ = config.get("pow");
}
- void calc(const Arguments& inputs,
- const Arguments& outputs,
- const Arguments& inouts) override {
- CHECK_EQ(4, static_cast(inputs.size()));
- CHECK_EQ(1, static_cast(outputs.size()));
- CHECK_EQ(0, static_cast(inouts.size()));
-
- CHECK_EQ(static_cast(inputs[0].dims_.size()), 4);
- for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
- CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
- CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
- CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]);
- CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
- }
-
- size_t samples = inputs[0].dims_[0];
- size_t channels = inputs[0].dims_[1];
- size_t height = inputs[0].dims_[2];
- size_t width = inputs[0].dims_[3];
-
- CrossMapNormalGrad(outputs[0].getData(),
- inputs[0].getData(),
- inputs[1].getData(),
- inputs[2].getData(),
- inputs[3].getData(),
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ((size_t)4, inputs.size());
+ CHECK_EQ((size_t)1, outputs.size());
+
+ CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+ CHECK(inputs[0].shape() == inputs[1].shape());
+ CHECK(inputs[0].shape() == inputs[2].shape());
+ CHECK(inputs[0].shape() == inputs[3].shape());
+ CHECK(inputs[0].shape() == outputs[0].shape());
+
+ // TODO(hedaoyuan): need support ASSIGN_TO mode.
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+ size_t samples = inputs[0].shape()[0];
+ size_t channels = inputs[0].shape()[1];
+ size_t height = inputs[0].shape()[2];
+ size_t width = inputs[0].shape()[3];
+
+ CrossMapNormalGrad(outputs[0].data(),
+ inputs[0].data(),
+ inputs[1].data(),
+ inputs[2].data(),
+ inputs[3].data(),
samples,
channels,
height,
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 614e76b8ac0c9a9145a27f5b532ea63bef7f90f0..dbe3a4e9f608df6333a5637f2d962a555b04d7c3 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -76,6 +76,20 @@ FuncConfig& FuncConfig::set(const std::string& key, bool v) {
return *this;
}
+void BufferArgs::addArg(const Matrix& arg,
+ const TensorShape& shape,
+ ArgType argType) {
+ args_.push_back(std::make_shared(arg, shape, argType));
+}
+
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+ args_.push_back(std::make_shared(arg, argType));
+}
+
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+ args_.push_back(std::make_shared(arg, argType));
+}
+
ClassRegistrar FunctionBase::funcRegistrar_;
} // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 9e8cbb8e48c30e80c5057fc53c050b67d3957188..249f8f9cfad58bf596e8cdce9188409b5690f969 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -16,57 +16,17 @@ limitations under the License. */
#include