diff --git a/.travis.yml b/.travis.yml index d73fd39aa7a2ee87c0e31436ffc14df2213134c9..387367a2305e7bf582e29538ab9e51571b9ae75b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,7 @@ before_install: - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker + - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: diff --git a/CMakeLists.txt b/CMakeLists.txt index fc85f83b94f22459002b17d66cb6ac98cbff9bd0..884afa962bbaff1defe610a9cd5b4a6e5d46c7c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,7 @@ include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any +include(generic) # simplify cmake module include(package) # set paddle packages include(cpplint) # set paddle c++ style include(ccache) # set ccache for compilation diff --git a/RELEASE.cn.md b/RELEASE.cn.md old mode 100755 new mode 100644 diff --git a/cmake/generic.cmake b/cmake/generic.cmake new file mode 100644 index 0000000000000000000000000000000000000000..22a26d7c5b04ba1f45de5ec9f3387c539ade730b --- /dev/null +++ b/cmake/generic.cmake @@ -0,0 +1,129 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# To simplify the build process of PaddlePaddle, we defined couple of +# fundamental abstractions, e.g., how to build library, binary and +# test in C++, CUDA and Go. +# +# ------------------------------------------- +# C++ CUDA C++ Go +# ------------------------------------------- +# cc_library nv_library go_library +# cc_binary nv_binary go_binary +# cc_test nv_test go_test +# ------------------------------------------- +# +# cmake_parse_arguments can help us to achieve this goal. +# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html + +# cc_library parses tensor.cc and figures out that target also depend on tensor.h. +# cc_library(tensor +# SRCS +# tensor.cc +# DEPS +# variant) +function(cc_library TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if (${cc_library_OPTIONAL} STREQUAL "SHARED") + add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + endif() + add_dependencies(${TARGET_NAME} ${cc_library_DEPS} ${external_project_dependencies}) +endfunction(cc_library) + +# cc_binary parses tensor.cc and figures out that target also depend on tensor.h. +# cc_binary(tensor +# SRCS +# tensor.cc) +function(cc_binary TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_binary_SRCS}) + add_dependencies(${TARGET_NAME} ${cc_binary_DEPS} ${external_project_dependencies}) + target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) +endfunction(cc_binary) + +# The dependency to target tensor implies that if any of +# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built. +# cc_test(tensor_test +# SRCS +# tensor_test.cc +# DEPS +# tensor) +function(cc_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_SRCS}) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} ${external_project_dependencies}) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + add_test(${TARGET_NAME} ${TARGET_NAME}) +endfunction(cc_test) + +# Suppose that ops.cu includes global functions that take Tensor as +# their parameters, so ops depend on tensor. This implies that if +# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built. +# nv_library(ops +# SRCS +# ops.cu +# DEPS +# tensor) +function(nv_library TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if (${nv_library_OPTIONAL} STREQUAL "SHARED") + cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) + else() + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + endif() + add_dependencies(${TARGET_NAME} ${nv_library_DEPS} ${external_project_dependencies}) +endfunction(nv_library) + +function(nv_binary TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + add_dependencies(${TARGET_NAME} ${nv_binary_DEPS} ${external_project_dependencies}) + target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) +endfunction(nv_binary) + +# The dependency to target tensor implies that if any of +# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built. +# nv_test(ops_test +# SRCS +# ops_test.cu +# DEPS +# ops) +function(nv_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} ${external_project_dependencies}) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + add_test(${TARGET_NAME} ${TARGET_NAME}) +endfunction(nv_test) diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index 036cad4b0a32357bb42580ef577a1eba558be8fe..3af636aef5879b43641d55bd7c9b0b8a1242ff8b 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -1,26 +1,33 @@ -import sys import math import numpy as np -import paddle.v2 as paddle +import gzip +import logging import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.evaluator as evaluator +import paddle.v2 as paddle +logger = logging.getLogger('paddle') -def db_lstm(): - word_dict, verb_dict, label_dict = conll05.get_dict() - word_dict_len = len(word_dict) - label_dict_len = len(label_dict) - pred_len = len(verb_dict) +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) - mark_dict_len = 2 - word_dim = 32 - mark_dim = 5 - hidden_dim = 512 - depth = 8 +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +default_std = 1 / math.sqrt(hidden_dim) / 3.0 +mix_hidden_lr = 1e-3 - #8 features - def d_type(size): - return paddle.data_type.integer_value_sequence(size) +def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + +def db_lstm(): + #8 features word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) @@ -31,11 +38,7 @@ def db_lstm(): ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - - emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) std_0 = paddle.attr.Param(initial_std=0.) std_default = paddle.attr.Param(initial_std=default_std) @@ -63,7 +66,6 @@ def db_lstm(): input=emb, param_attr=std_default) for emb in emb_layers ]) - mix_hidden_lr = 1e-3 lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) hidden_para_attr = paddle.attr.Param( initial_std=default_std, learning_rate=mix_hidden_lr) @@ -111,6 +113,21 @@ def db_lstm(): input=input_tmp[1], param_attr=lstm_para_attr) ], ) + return feature_out + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def train(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + feature_out = db_lstm() + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) crf_cost = paddle.layer.crf(size=label_dict_len, input=feature_out, label=target, @@ -120,29 +137,15 @@ def db_lstm(): learning_rate=mix_hidden_lr)) crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', size=label_dict_len, input=feature_out, label=target, param_attr=paddle.attr.Param(name='crfw')) - - return crf_cost, crf_dec - - -def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) # skip header. - return np.fromfile(f, dtype=np.float32).reshape(h, w) - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - - # define network topology - crf_cost, crf_dec = db_lstm() + evaluator.sum(input=crf_dec) # create parameters - parameters = paddle.parameters.create([crf_cost, crf_dec]) + parameters = paddle.parameters.create(crf_cost) + parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) # create optimizer optimizer = paddle.optimizer.Momentum( @@ -152,18 +155,12 @@ def main(): model_average=paddle.optimizer.ModelAverage( average_window=0.5, max_average_window=10000), ) - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - trainer = paddle.trainer.SGD(cost=crf_cost, parameters=parameters, - update_equation=optimizer) - parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) + update_equation=optimizer, + extra_layers=crf_dec) - trn_reader = paddle.batch( + reader = paddle.batch( paddle.reader.shuffle( conll05.test(), buf_size=8192), batch_size=10) @@ -179,12 +176,102 @@ def main(): 'target': 8 } + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + logger.info("Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics)) + if event.batch_id and event.batch_id % 1000 == 0: + result = trainer.test(reader=reader, feeding=feeding) + logger.info("\nTest with Pass %d, Batch %d, %s" % + (event.pass_id, event.batch_id, result.metrics)) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + parameters.to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + logger.info("\nTest with Pass %d, %s" % + (event.pass_id, result.metrics)) + trainer.train( - reader=trn_reader, + reader=reader, event_handler=event_handler, - num_passes=10000, + num_passes=10, feeding=feeding) +def infer_a_batch(inferer, test_data, word_dict, pred_dict, label_dict): + probs = inferer.infer(input=test_data, field='id') + assert len(probs) == sum(len(x[0]) for x in test_data) + + for idx, test_sample in enumerate(test_data): + start_id = 0 + pred_str = "%s\t" % (pred_dict[test_sample[6][0]]) + + for w, tag in zip(test_sample[0], + probs[start_id:start_id + len(test_sample[0])]): + pred_str += "%s[%s] " % (word_dict[w], label_dict[tag]) + print(pred_str.strip()) + start_id += len(test_sample[0]) + + +def infer(): + label_dict_reverse = dict((value, key) + for key, value in label_dict.iteritems()) + word_dict_reverse = dict((value, key) + for key, value in word_dict.iteritems()) + pred_dict_reverse = dict((value, key) + for key, value in verb_dict.iteritems()) + + test_creator = paddle.dataset.conll05.test() + + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + feature_out = db_lstm() + predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) + + test_pass = 0 + with gzip.open('params_pass_%d.tar.gz' % (test_pass)) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + inferer = paddle.inference.Inference( + output_layer=predict, parameters=parameters) + + # prepare test data + test_data = [] + test_batch_size = 50 + + for idx, item in enumerate(test_creator()): + test_data.append(item[0:8]) + + if idx and (not idx % test_batch_size): + infer_a_batch( + inferer, + test_data, + word_dict_reverse, + pred_dict_reverse, + label_dict_reverse, ) + test_data = [] + infer_a_batch( + inferer, + test_data, + word_dict_reverse, + pred_dict_reverse, + label_dict_reverse, ) + test_data = [] + + +def main(is_inferring=False): + if is_inferring: + infer() + else: + train() + + if __name__ == '__main__': - main() + main(is_inferring=False) diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..90dc84718c9ce1374cda6022de177afeeb60279d --- /dev/null +++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md @@ -0,0 +1,75 @@ +# 构建Android平台上的PaddlePaddle库 + +用户可通过交叉编译的方式,在用户熟悉的开发平台(Linux,Mac OS X和Windows)上编译Android平台上适用的PaddlePaddle库。 +本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 + +## 准备交叉编译环境 + +从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取: + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。 +比如: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain +``` + +此命令将在your/path/to/my_standalone_toolchain目录生成一套编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,使用的编译器为arm-linux-androideabi-gcc (GCC) 4.9。 + +注意:**PaddlePaddle要求使用的编译工具链所支持的Andoid API级别不小于21**。 + +## 配置交叉编译参数 + +CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 + +交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 + +Android平台可选配置参数: + +- `ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。 +- `ANDROID_ABI`,目标架构ABI。目前只支持`armeabi-v7a`,默认值为`armeabi-v7a`。 +- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。 +- `ANROID_ARM_MODE`,是否使用ARM模式。可设置`ON/OFF`,默认值为`ON`。 +- `ANDROID_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 + +其他配置参数: + +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 + +一种常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Android版本的库。自此,PaddlePaddle的已经安装完成,用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中,调用方法见C-API文档。 diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..085b5dda1615a9af918b59870db460fcc5acdcca --- /dev/null +++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md @@ -0,0 +1,65 @@ +# 构建Raspberry Pi平台上的PaddlePaddle库 + +对于Rasspberry Pi系统,用户可通过ssh等方式登录到Raspberry Pi系统上,按照[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档所述,直接编译Raspberry Pi平台上适用的PaddlePaddle库。 + +用户也可以在自己熟悉的开发平台上,通过交叉编译的方式来编译。这篇文档将以Linux x86-64平台为例,介绍交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 + +## 准备交叉编译环境 + +从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。用户可自行前往[github](https://github.com/raspberrypi/tools)下载Raspberry Pi平台使用的C/C++交叉编译工具链,也可通过以下命令获取: + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +该github仓库中包含若干个预编译好的、针对不同平台的编译工具。宿主机是Linux x86-64环境,则需选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具,所使用的编译器为arm-linux-gnueabihf-gcc 4.8.3。 + +注意,该编译工具链需要系统glibc支持2.14以上。 + +## 配置交叉编译参数 + +CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake),以提供一些默认的编译器和编译参数相关配置。 + +交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 + +Raspberry Pi平台可选配置参数: + +- `RPI_TOOLCHAIN`,编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 +- `RPI_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 + +其他配置参数: + +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 + +cmake参数如下; + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,由于上一步cmake配置中`WITH_C_API`设置为`ON`,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 + +更多的编译配置见[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档。 diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 1cec77c0cae6ffbf7a1ca22092e8e41a6f9f0fc5..c9a285c90b0674e175c592c40fa26a2222ed0f51 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -Wall") IF(WITH_COVERAGE) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") ENDIF(WITH_COVERAGE) diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt index 4b1438d570ae6dda95c72d2582df833d2d4c4d93..d4bce38906e9326992f6a44ac5cf25309063806a 100644 --- a/paddle/majel/CMakeLists.txt +++ b/paddle/majel/CMakeLists.txt @@ -1,36 +1,4 @@ -cmake_minimum_required(VERSION 3.0) - -if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR}) - # find #include - get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) - include_directories(${PARENT_DIR}) - - # find cmake directory modules - get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake") - - # enable c++11 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - - # enable gtest - set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third_party) - set(WITH_TESTING ON) - include(external/gtest) -else() - message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})") -endif() - -########################### Build Majel ############################# -set(MAJEL_CXX_FILES place.cc) -set(MAJEL_CUDA_FILES "") - -if(CUDA_FOUND) - cuda_add_library(majel ${MAJEL_CUDA_FILES} ${MAJEL_CXX_FILES}) -else() - add_library(majel ${MAJEL_CXX_FILES}) -endif() -add_dependencies(majel ${external_project_dependencies}) -##################################################################### +cc_library(majel SRCS place.cc) if(WITH_TESTING) add_subdirectory(test) diff --git a/paddle/majel/README.md b/paddle/majel/README.md index 5539853056797284ca1fa5ef5ab16fa0059907f0..2573738b66b2bf514d06358262ef941e833daf0f 100644 --- a/paddle/majel/README.md +++ b/paddle/majel/README.md @@ -93,6 +93,19 @@ typedef boost::variant< Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle. +`DDim` plays two kinds of roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` by following way: + + ```c++ + DArray arr = make_darray(make_ddim({2,3}), 0.0f); + ``` + It means that `arr` will be a two-dimension tensor, or a matrix. The size of its first dimension is 2 and the second is 3. All the element value of `arr` will be initialized as 0.0 . + + The second meaning of `DDim` is tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do like this: + + ```c++ + arr[make_ddim({0, 1})] = 1.0; + ``` + ## implement Tensor in Paddle Before writing code, please make sure you already look through Majel Source Code and grabbed the design philosophy of `DArray` in Majel. @@ -113,7 +126,7 @@ To assign subtasks to our colleagues, we have to discuss how to divide it to ind - [ ] 3. Re-implement `Dim`. - `Dim` is an excellent implementation in Majel. + `Dim` is an excellent implementation in Majel. > ??? diff --git a/paddle/majel/test/CMakeLists.txt b/paddle/majel/test/CMakeLists.txt index 46da6ff89b4a1d68fe4229b4f0f051000ab390c7..68f9059874aed8843da1fc598c7d2e57e9b8bbfe 100644 --- a/paddle/majel/test/CMakeLists.txt +++ b/paddle/majel/test/CMakeLists.txt @@ -1,11 +1,7 @@ -file(GLOB_RECURSE ALL_TEST_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +cc_test(place_test + SRCS place_test.cc + DEPS majel) -add_executable(majel_tests ${ALL_TEST_FILES}) -add_dependencies(majel_tests majel) -target_link_libraries(majel_tests - ${Boost_LIBRARIES} - ${GTEST_LIBRARIES} - ${GTEST_MAIN_LIBRARIES} - majel - ) -add_test(majel_tests majel_tests) +if(WITH_GPU) + nv_test(cuda_test SRCS cuda_test.cu) +endif() diff --git a/paddle/majel/test/cuda_test.cu b/paddle/majel/test/cuda_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..4067dda2f19f7661722d8a14a27c7b32ed6afc92 --- /dev/null +++ b/paddle/majel/test/cuda_test.cu @@ -0,0 +1,59 @@ +#include +#include +#include "gtest/gtest.h" + +#define CHECK_ERR(x) \ + if (x != cudaSuccess) { \ + fprintf(stderr, \ + "%s in %s at line %d\n", \ + cudaGetErrorString(err), \ + __FILE__, \ + __LINE__); \ + exit(-1); \ + } + +__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < n) { + d_C[i] = d_A[i] + d_B[i]; + } +} + +TEST(Cuda, Equality) { + int n = 10; + // Memory allocation for h_A, h_B and h_C (in the host) + float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; + float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; + float h_C[10]; + float *d_A, *d_B, *d_C; + cudaError_t err; + // Memory allocation for d_A, d_B and d_C (in the device) + err = cudaMalloc((void **)&d_A, sizeof(float) * n); + CHECK_ERR(err); + + err = cudaMalloc((void **)&d_B, sizeof(float) * n); + CHECK_ERR(err); + + err = cudaMalloc((void **)&d_C, sizeof(float) * n); + CHECK_ERR(err); + + // Copying memory to device + err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); + CHECK_ERR(err); + + err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); + CHECK_ERR(err); + + // Calling the kernel + vecAdd<<>>(d_A, d_B, d_C, n); + + // Copying results back to host + err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); + CHECK_ERR(err); + + EXPECT_EQ(h_C[0], 1.0); + for (int i = 1; i < n - 1; ++i) { + EXPECT_EQ(h_C[i], 11.0); + } + EXPECT_EQ(h_C[9], 1.0); +} diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh index 67b89adb4ddb7bb93cb776d64711078cb11a2784..c784293695bf134b5e990639778b6e84ba45d00d 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/docs.sh @@ -60,7 +60,6 @@ function deploy_docs() { deploy_docs "master" "." deploy_docs "develop" "./develop/" -deploy_docs "release/0.10.0" "./release/0.10.0/" # Check is there anything changed. set +e diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp index edd33c454122d95078e0fde2a2e9d68903951ee8..5abeeecae8d37dd0f9660ef009da2902f36d1804 100644 --- a/paddle/utils/CpuId.cpp +++ b/paddle/utils/CpuId.cpp @@ -19,19 +19,22 @@ limitations under the License. */ /// for MSVC #define CPUID(info, x) __cpuidex(info, x, 0) -#elif !defined(__ANDROID__) +#else +#if !defined(__arm__) #include - /// for GCC/Clang #define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3]) +#endif #endif namespace paddle { SIMDFlags::SIMDFlags() { -#if !defined(__ANDROID__) +#if defined(__arm__) + simd_flags_ = SIMD_NEON; +#else unsigned int cpuInfo[4]; // CPUID: https://en.wikipedia.org/wiki/CPUID // clang-format off @@ -52,8 +55,6 @@ SIMDFlags::SIMDFlags() { CPUID(cpuInfo, 0x80000001); simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE; // clang-fotmat on -#else - simd_flags_ = SIMD_NEON; #endif } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 57d30b088b873a94a11483aea536a9e4f6493129..9135f38719a44e3070f42e478d0fc6b0004227b5 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2320,6 +2320,9 @@ def Memory(name, memory_name = name + "+delay1" agent_name = memory_name if is_sequence: + config_assert( + boot_layer is not None, + "there must be boot_layer in network when is_sequence = True") agent_layer = SequenceAgentLayer(agent_name, size) else: agent_layer = AgentLayer(agent_name, size)