diff --git a/.travis.yml b/.travis.yml index d73fd39aa7a2ee87c0e31436ffc14df2213134c9..387367a2305e7bf582e29538ab9e51571b9ae75b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,7 @@ before_install: - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker + - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3df10d801e568834729f902aace483d033340e2d --- /dev/null +++ b/doc/design/file_manager/README.md @@ -0,0 +1,87 @@ +# FileManager设计文档 +## 目标 +在本文档中,我们设计并说明了名为FileManager的系统,方便用户上传自己的训练数据以进行分布式训练。 + +主要功能包括: + +- 提供常用的命令行命令来管理文件和目录 +- 支持大文件的断点上传、下载 + +## 名词解释 +- PFS:是`Paddlepaddle cloud File System`的缩写,是对用户文件存储空间的抽象,与之相对的是local filesystem。目前我们用CephFS来搭建。 +- [CephFS](http://docs.ceph.com/docs/master/cephfs/):一个POSIX兼容的文件系统。 +- Chunk:逻辑上文件分块的单位。 + +## 模块 +### 架构图 + + +### PFSClient +- 功能: 详细设计[link](./pfs/pfsclient.md) + - 提供用户管理文件的命令 + - 需要可以跨平台执行 + +- 双向验证 + PFSClient需要和Ingress之间做双向验证[tls](#tls),所以用户需要首先在`cloud.paddlepaddle.org`上注册一下,申请用户空间,并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地,然后才能使用PFSClient。 + +### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) +- 功能: + 提供七层协议的反向代理、基于粘性会话的负载均衡功能。 + +- 透传用户身份的办法 + Ingress需要把PFSClient的身份信息传给PFSServer,配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3) + +### PFSServer +PFSServer提供RESTful 
API接口,接收处理PFSClient端的文件管理请求,并且把结果返回PFSClient端。 + +RESTful API + +- /api/v1/files + - `GET /api/v1/files`: Get metadata of files or directories. + - `POST /api/v1/files`: Create files or directories. + - `PATCH /api/v1/files`: Update files or directories. + - `DELETE /api/v1/files`: Delete files or directories. + +- /api/v1/file/chunks + - `GET /api/v1/file/chunks`: Get chunks' metadata of a file. + +- /api/v1/storage/files + - `GET /api/v1/storage/files`: Download files or directories. + - `POST /api/v1/storage/files`: Upload files or directories. + +- /api/v1/storage/file/chunks + - `GET /api/v1/storage/file/chunks`: Download chunks' data. + - `POST /api/v1/storage/file/chunks`: Upload chunks' data. + +## 文件传输优化 + +### 分块文件传输 +用户文件可能是比较大的,上传到Cloud或者下载到本地的时间可能比较长,而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题,我们提出了Chunk的概念,一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小(默认256K),完成一个传输动作的时间也比较短,不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。 + +一个典型的Chunk如下所示: + +``` +type Chunk struct { + fileOffset int64 + checksum uint32 + len uint32 + data []byte +} +``` + +### 生成sparse文件 +当destination文件不存在或者大小和source文件不一致时,可以用[Fallocate](https://golang.org/pkg/syscall/#Fallocate)生成sparse文件,然后就可以并发写入多个Chunk。 + +### 覆盖不一致的部分 +文件传输的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致,不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。 + +## 用户使用流程 +参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md) + +## 框架生成 +用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分,以便我们可以把更多的精力放到逻辑本身上。 + +## 参考文档 +- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md) +- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/) +- [linux man document](https://linux.die.net/man/) diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md new file mode 100644 index 
0000000000000000000000000000000000000000..56bc70c54bbc92b78d66e04fb495b1300cf8ebe0 --- /dev/null +++ b/doc/design/file_manager/pfs/pfsclient.md @@ -0,0 +1,129 @@ +# PFSClient + +## Description +The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud + +## Synopsis +``` +paddle [options] pfs [parameters] +``` + +## Options +``` +--profile (string) + Use a specific profile from your credential file. + +--help (string) + Display more information about command + +--version + Output version information and exit + +--debug + Show detailed debugging log + +--only-show-errors (boolean) + Only errors and warnings are displayed. All other output is suppressed. +``` + +## Path Arguments +When using a command, we need to specify path arguments. There are two types of path arguments: `localpath` and `pfspath`. + +A `pfspath` begins with `/pfs`, e.g. `/pfs/$DATACENTER/home/$USER/folder`. + +[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to configure datacenters. + +## Order of Path Arguments +Commonly, if there are two path arguments, the first is the source, and the second is the destination. + +## Subcommands +- rm - remove files or directories + +``` +Synopsis: + rm [-r] [-v] ... + +Options: + -r + Remove directories and their contents recursively + -v + Cause rm to be verbose, showing files after they are removed. + +Examples: + paddle pfs rm /pfs/$DATACENTER/home/$USER/file + paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder +``` +- mv - move (rename) files + +``` +Synopsis: + mv [-f | -n] [-v] + mv [-f | -n] [-v] ... + mv [-f | -n] [-v] + mv [-f | -n] [-v] ... + mv [-f | -n] [-v] + mv [-f | -n] [-v] ... + +Options: + -f + Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.) + -n + Do not overwrite an existing file. (The -n option overrides previous -f options.) 
+ -v + Cause mv to be verbose, showing files after they are moved. + +Examples: + paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt +``` +- cp - copy files or directories + +``` +Synopsis: + cp [-r] [-f | -n] [-v] [--preserve--links] + cp [-r] [-f | -n] [-v] [--preserve--links] ... + cp [-r] [-f | -n] [-v] [--preserve--links] + cp [-r] [-f | -n] [-v] [--preserve--links] ... + cp [-r] [-f | -n] [-v] [--preserve--links] + cp [-r] [-f | -n] [-v] [--preserve--links] ... + +Options: + -r + Copy directories recursively + -f + Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.) + -n + Do not overwrite an existing file. (The -n option overrides previous -f options.) + -v + Cause cp to be verbose, showing files after they are copied. + --preserve--links + Preserve links when copying links + +Examples: + paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file + paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file +``` +- ls - list files + +``` +Synopsis: + ls [-R] ... + +Options: + -R + List directory(ies) recursively + +Examples: + paddle pfs ls /pfs/$DATACENTER/home/$USER/file + paddle pfs ls /pfs/$DATACENTER/home/$USER/folder +``` + +- mkdir - make directory(ies) +Create intermediate directory(ies) as required. + +``` +Synopsis: + mkdir ... 
+ +Examples: + paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder +``` diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle new file mode 100644 index 0000000000000000000000000000000000000000..7861a33072bc1908f69d12b37c20491dd8663103 Binary files /dev/null and b/doc/design/file_manager/src/filemanager.graffle differ diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png new file mode 100644 index 0000000000000000000000000000000000000000..8139a19f5722f56d3c211f3ab0d3982f751134b9 Binary files /dev/null and b/doc/design/file_manager/src/filemanager.png differ diff --git a/doc/howto/raspberry/build_for_raspberry.md b/doc/howto/raspberry/build_for_raspberry.md new file mode 100644 index 0000000000000000000000000000000000000000..4a98aba8f2a88a3de838b415131ed1d7c205d5e6 --- /dev/null +++ b/doc/howto/raspberry/build_for_raspberry.md @@ -0,0 +1,47 @@ +# 如何构建Raspberry pi下运行的PaddlePaddle + +这里考虑的是交叉编译方式,即在Linux-x86环境下构建Raspberry pi下可运行的PaddlePaddle。 + +## 下载交叉编译环境 +``` +git clone https://github.com/raspberrypi/tools +``` +如果host是x86-64环境,选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具。注意,需要系统glibc支持2.14以上。 + + +## 编译第三方库 +cmake编译PaddlePaddle时候会自动下载编译依赖的第三方库,不过openblas和protobuf最好还是在编译PaddlePaddle之前先编译好,这样可以保证编译PaddlePaddle的时候更加顺畅。 + +### 编译OpenBLAS +``` +git clone https://github.com/xianyi/OpenBLAS.git +make TARGET=ARMV7 HOSTCC=gcc CC=arm-linux-gnueabihf-gcc NOFORTRAN=1 USE_THREAD=0 +``` + +### 编译protobuf +``` +git clone https://github.com/google/protobuf.git +git checkout 9f75c5aa851cd877fb0d93ccc31b8567a6706546 +cmake ../protobuf/cmake \ +-Dprotobuf_BUILD_TESTS=OFF \ +-DCMAKE_CXX_COMPILER=arm-linux-gnueabihf-g++ \ +-DCMAKE_C_COMPILER=arm-linux-gnueabihf-gcc \ +-DCMAKE_POSITION_INDEPENDENT_CODE=ON \ +-DCMAKE_BUILD_TYPE=Release \ +-DCMAKE_INSTALL_LIBDIR=lib +``` +注意:这样编译出来的`libprotobuf.a`和`protoc`都是ARM版本的,而我们需要的是一个x86-64版本的`protoc`,所以需要用host 
gcc再编译一遍protobuf然后使用其中的`protoc`。 + + +## 编译PaddlePaddle +cmake参数如下;其中`WITH_C_API`设置为ON,编译输出的output目录中会包含`include`和`lib`目录,其中`include`中包含CAPI的头文件,`lib`中包含一个ARM版本的库。另外,`CMAKE_BUILD_TYPE`设置为`MinSizeRel`可以减小编译的库的大小。 +``` +cmake .. -DWITH_GPU=OFF -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF \ +-DCMAKE_CXX_COMPILER:FILEPATH=arm-linux-gnueabihf-g++ \ +-DCMAKE_C_COMPILER:FILEPATH=arm-linux-gnueabihf-gcc \ +-DCMAKE_C_FLAGS="-mfpu=neon" \ +-DCMAKE_CXX_FLAGS="-mfpu=neon" \ +-DOPENBLAS_ROOT=openblas \ +-DCMAKE_PREFIX_PATH=protobuf \ +-DCMAKE_BUILD_TYPE=MinSizeRel +``` diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c6fd9cc54ae3a671c5bdcf54cbaa873c59280694..769955490976401ea93ed61987064026829a9f41 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -9,7 +9,7 @@ add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) -find_package(boost QUIET) +find_package(Boost QUIET) if(Boost_FOUND) include_directories(${Boost_INCLUDE_DIRS}) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 1cec77c0cae6ffbf7a1ca22092e8e41a6f9f0fc5..c9a285c90b0674e175c592c40fa26a2222ed0f51 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -Wall") IF(WITH_COVERAGE) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") ENDIF(WITH_COVERAGE) diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt index baa3bb9e914b3053a18dc638146325ffe3d28a12..d4977df1185b4c13b7c67e24a80fa479e23d46d4 100644 --- a/paddle/majel/CMakeLists.txt +++ b/paddle/majel/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.0) -if(GTEST_INCLUDE_DIR AND GTEST_LIBRARIES) - message("-- Found 
gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})") -else() +if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR}) # find #include get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) include_directories(${PARENT_DIR}) @@ -11,6 +9,13 @@ else() get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake") + # enable boost + find_package(Boost REQUIRED) + if(NOT Boost_FOUND) + message(FATAL "Cannot find Boost library.") + endif() + include_directories(${Boost_INCLUDE_DIRS}) + # enable c++11 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -18,6 +23,8 @@ else() set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third_party) set(WITH_TESTING ON) include(external/gtest) +else() + message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})") endif() ########################### Build Majel ############################# @@ -29,6 +36,9 @@ if(CUDA_FOUND) else() add_library(majel ${MAJEL_CXX_FILES}) endif() +add_dependencies(majel ${external_project_dependencies}) ##################################################################### -add_subdirectory(test) +if(WITH_TESTING) + add_subdirectory(test) +endif() diff --git a/paddle/majel/README.md b/paddle/majel/README.md index 5539853056797284ca1fa5ef5ab16fa0059907f0..2573738b66b2bf514d06358262ef941e833daf0f 100644 --- a/paddle/majel/README.md +++ b/paddle/majel/README.md @@ -93,6 +93,19 @@ typedef boost::variant< Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle. +`DDim` plays two kinds of roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` by following way: + + ```c++ + DArray arr = make_darray(make_ddim({2,3}), 0.0f); + ``` + It means that `arr` will be a two-dimension tensor, or a matrix. The size of its first dimension is 2 and the second is 3. 
All the element values of `arr` will be initialized as 0.0. + + The second role of `DDim` is as a tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do it like this: + + ```c++ + arr[make_ddim({0, 1})] = 1.0; + ``` + ## implement Tensor in Paddle Before writing code, please make sure you already look through Majel Source Code and grabbed the design philosophy of `DArray` in Majel. @@ -113,7 +126,7 @@ To assign subtasks to our colleagues, we have to discuss how to divide it to ind - [ ] 3. Re-implement `Dim`. - `Dim` is an excellent implementation in Majel. + `Dim` is an excellent implementation in Majel. > ??? diff --git a/paddle/majel/test/CMakeLists.txt b/paddle/majel/test/CMakeLists.txt index 46da6ff89b4a1d68fe4229b4f0f051000ab390c7..76327fdd70c3f1763abda1f38c137dbaf27fba30 100644 --- a/paddle/majel/test/CMakeLists.txt +++ b/paddle/majel/test/CMakeLists.txt @@ -3,7 +3,6 @@ file(GLOB_RECURSE ALL_TEST_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") add_executable(majel_tests ${ALL_TEST_FILES}) add_dependencies(majel_tests majel) target_link_libraries(majel_tests - ${Boost_LIBRARIES} ${GTEST_LIBRARIES} ${GTEST_MAIN_LIBRARIES} majel diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 101b44e6c62ecf0b84d65ee7b6e90e64bd7b3272..2dfa712427d81d2be502f1dbbe880c81b6d9a3f4 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -131,8 +131,6 @@ cat > /paddle/build/Dockerfile < ENV HOME /root -ENV LANG en_US.UTF-8 -# Use Fix locales to en_US.UTF-8 EOF if [[ -n ${APT_MIRROR} ]]; then @@ -153,6 +151,7 @@ RUN apt-get update &&\ paddle version ${DOCKERFILE_CUDNN_DSO} ${DOCKERFILE_GPU_ENV} + # default command shows the paddle version and exit CMD ["paddle", "version"] EOF diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp index edd33c454122d95078e0fde2a2e9d68903951ee8..5abeeecae8d37dd0f9660ef009da2902f36d1804 100644 --- 
a/paddle/utils/CpuId.cpp +++ b/paddle/utils/CpuId.cpp @@ -19,19 +19,22 @@ limitations under the License. */ /// for MSVC #define CPUID(info, x) __cpuidex(info, x, 0) -#elif !defined(__ANDROID__) +#else +#if !defined(__arm__) #include - /// for GCC/Clang #define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3]) +#endif #endif namespace paddle { SIMDFlags::SIMDFlags() { -#if !defined(__ANDROID__) +#if defined(__arm__) + simd_flags_ = SIMD_NEON; +#else unsigned int cpuInfo[4]; // CPUID: https://en.wikipedia.org/wiki/CPUID // clang-format off @@ -52,8 +55,6 @@ SIMDFlags::SIMDFlags() { CPUID(cpuInfo, 0x80000001); simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE; // clang-fotmat on -#else - simd_flags_ = SIMD_NEON; #endif } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 57d30b088b873a94a11483aea536a9e4f6493129..9135f38719a44e3070f42e478d0fc6b0004227b5 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2320,6 +2320,9 @@ def Memory(name, memory_name = name + "+delay1" agent_name = memory_name if is_sequence: + config_assert( + boot_layer is not None, + "there must be boot_layer in network when is_sequence = True") agent_layer = SequenceAgentLayer(agent_name, size) else: agent_layer = AgentLayer(agent_name, size)