Commit 2920b6bc authored by gangliao, committed by GitHub

Merge pull request #138 from gangliao/master

Add Mac OS X port
*.DS_Store
build/
*.user
.vscode
.idea
\ No newline at end of file
@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS
      ${OPENBLAS_ROOT}/include
      /usr/include
-     /usr/include/openblas)
+     /usr/include/openblas
+     /usr/local/opt/openblas/include)
set(OPENBLAS_LIB_SEARCH_PATHS
      ${OPENBLAS_ROOT}/lib
      /usr/lib
      /usr/lib/blas/openblas
-     /usr/lib/openblas)
+     /usr/lib/openblas
+     /usr/local/opt/openblas/lib)
find_path(OPENBLAS_INC_DIR NAMES cblas.h
  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
......
@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
    $ENV{CUDNN_ROOT}/lib64
    $ENV{CUDNN_ROOT}/lib
    /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
+find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
    PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
          NO_DEFAULT_PATH
    DOC "Path to cuDNN library.")
......
# Some common routine for paddle compile.

# target_circle_link_libraries
#     Link libraries to target which has circle dependencies.
#
#     First Argument: target name want to be linked with libraries
#     Rest Arguments: libraries which link together.
function(target_circle_link_libraries TARGET_NAME)
-    target_link_libraries(${TARGET_NAME}
-        -Wl,--start-group
-        ${ARGN}
-        -Wl,--end-group)
+    if(APPLE)
+        set(LIBS)
+        set(inArchive OFF)
+        set(libsInArgn)
+
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                set(inArchive ON)
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                set(inArchive OFF)
+            else()
+                if(inArchive)
+                    list(APPEND LIBS "-Wl,-force_load")
+                endif()
+                list(APPEND LIBS ${arg})
+                list(APPEND libsInArgn ${arg})
+            endif()
+        endforeach()
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+            list(APPEND LIBS "-undefined dynamic_lookup")
+        endif()
+        list(REVERSE libsInArgn)
+        target_link_libraries(${TARGET_NAME}
+            ${LIBS}
+            ${libsInArgn})
+
+    else()  # LINUX
+        set(LIBS)
+
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                list(APPEND LIBS "-Wl,--whole-archive")
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                list(APPEND LIBS "-Wl,--no-whole-archive")
+            else()
+                list(APPEND LIBS ${arg})
+            endif()
+        endforeach()
+
+        target_link_libraries(${TARGET_NAME}
+            "-Wl,--start-group"
+            ${LIBS}
+            "-Wl,--end-group")
+    endif()
endfunction()

# compile_cu_as_cpp
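For reference, a hedged sketch (not part of this commit) of how a target might invoke the function above; the target and library names are illustrative:

```cmake
# Hypothetical target: archives listed between the markers are linked in full
# (--whole-archive on Linux, -Wl,-force_load per archive on Mac OS X).
add_executable(demo_trainer main.cpp)
target_circle_link_libraries(demo_trainer
    ARCHIVE_START
    paddle_gserver
    ARCHIVE_END
    paddle_pserver
    paddle_utils)
```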
@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME)
    if(PADDLE_WITH_INTERNAL)
        set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
        target_circle_link_libraries(${TARGET_NAME}
-            -Wl,--whole-archive
+            ARCHIVE_START
            paddle_internal_gserver
            paddle_internal_owlqn
-            -Wl,--no-whole-archive
+            ARCHIVE_END
            paddle_internal_parameter)
    else()
        set(INTERAL_LIBS "")
    endif()

    target_circle_link_libraries(${TARGET_NAME}
-        -Wl,--whole-archive
+        ARCHIVE_START
        paddle_gserver
        ${METRIC_LIBS}
-        -Wl,--no-whole-archive
+        ARCHIVE_END
        paddle_pserver
        paddle_trainer_lib
        paddle_network
......
Installing from Sources
=================

* [1. Download and Setup](#download)
* [2. Requirements](#requirements)
* [3. Build on Ubuntu](#ubuntu)
* [4. Build on Mac OS X](#mac)

## <span id="download">Download and Setup</span>

You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).

```bash
git clone https://github.com/baidu/Paddle paddle
```

## <span id="requirements">Requirements</span>

To compile the source code, your computer must be equipped with GCC >= 4.6 or the Clang compiler.

### Dependencies

- **CMake**: version >= 2.8
- **BLAS**: MKL, OpenBlas or ATLAS
- **protobuf**: version >= 2.4, **Note: 3.x is not supported**
- **python**: only python 2.7 is supported currently

### Options

PaddlePaddle supports some build options. To enable them, first install the related libraries.

Optional | Description
------------ | :-----------
**WITH_GPU** | Compile with GPU mode.
**WITH_DOUBLE** | Compile with double precision floating-point. Default: single precision.
**WITH_GLOG** | Compile with glog. If not found, an internal log implementation is used.
**WITH_GFLAGS** | Compile with gflags. If not found, an internal flag implementation is used.
**WITH_TESTING** | Compile with gtest for PaddlePaddle's unit testing.
**WITH_DOC** | Compile to generate PaddlePaddle's docs. Default: disabled (OFF).
**WITH_SWIG_PY** | Compile with the python predict API. Default: disabled (OFF).
**WITH_STYLE_CHECK** | Compile with code style check. Default: enabled (ON).

**Note:**

- The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5.
- Other versions, such as Cuda Toolkit 6.5, 7.0, 8.0 and cuDNN v2, v3, v4, are also supported.
- **To utilize cuDNN v5, Cuda Toolkit 7.5 is a prerequisite, and vice versa.**

As a simple example, consider the following:

1. **Python Dependencies (optional)**

    To compile PaddlePaddle with the python predict API, make sure swig is installed and set `-DWITH_SWIG_PY=ON` as follows:

    ```bash
    # install swig on Ubuntu
    sudo apt-get install swig
    # install swig on Mac OS X
    brew install swig

    # activate swig in cmake
    cmake .. -DWITH_SWIG_PY=ON
    ```

2. **Doc Dependencies (optional)**

    To generate PaddlePaddle's documentation, install the dependencies and set `-DWITH_DOC=ON` as follows:

    ```bash
    pip install 'sphinx>=1.4.0'
    pip install sphinx_rtd_theme breathe recommonmark

    # install doxygen on Ubuntu
    sudo apt-get install doxygen
    # install doxygen on Mac OS X
    brew install doxygen

    # activate docs in cmake
    cmake .. -DWITH_DOC=ON
    ```

## <span id="ubuntu">Build on Ubuntu 14.04</span>

### Install Dependencies

- **CPU Dependencies**

    ```bash
    # necessary
    sudo apt-get update
    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
    # optional
    sudo apt-get install libgoogle-glog-dev
    sudo apt-get install libgflags-dev
    sudo apt-get install libgtest-dev
    sudo pip install wheel
    pushd /usr/src/gtest
    cmake .
    make
    sudo cp *.a /usr/lib
    popd
    ```

- **GPU Dependencies (optional)**

    To build the GPU version, you will need the following installed:

    1. a CUDA-capable GPU
    2. a supported version of Linux with a gcc compiler and toolchain
    3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
    4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)

    The CUDA development environment relies on tight integration with the host development environment,
    including the host compiler and C runtime libraries, and is therefore only supported on
    distribution versions that have been qualified for this CUDA Toolkit release.

    After downloading the cuDNN library, issue the following commands:

    ```bash
    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
    ```

    Then you need to set the LD_LIBRARY_PATH, CUDA_HOME and PATH environment variables in ~/.bashrc.

    ```bash
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
    export CUDA_HOME=/usr/local/cuda
    export PATH=/usr/local/cuda/bin:$PATH
    ```

### Build and Install

As usual, the best option is to create a build folder under the paddle project directory.

```bash
mkdir build && cd build
cmake ..
```

CMake first checks PaddlePaddle's dependencies in the system default paths. After installing some optional
libraries, the corresponding build options will be set automatically (for instance, glog, gtest and gflags).
If a dependency is still not found, you can set it manually based on the CMake error information shown on your screen.

As a simple example, consider the following:

- **Only CPU**

    ```bash
    cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
    ```

- **GPU**

    ```bash
    cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
    ```

- **GPU with doc and swig**

    ```bash
    cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
    ```

Finally, you can build PaddlePaddle:

```bash
# you can add build options here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install if you want
# to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH
```

**Note:**

If you set `WITH_SWIG_PY=ON`, the related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install the python dependencies
the first time a user runs a paddle command, such as `paddle version` or `paddle train`.
This may require sudo privileges:

```bash
# you can run
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
```

## <span id="mac">Building on Mac OS X</span>

### Prerequisites

This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up-to-date version of OS X,
you will already have Python 2.7.10 and Numpy 1.8 installed.

The best option is to use the package manager homebrew to handle installations and upgrades for you.
To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command:

```bash
# install brew
/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
# install pip
easy_install pip
```

### Install Dependencies

- **CPU Dependencies**

    ```bash
    # Install fundamental dependencies
    brew install glog gflags cmake protobuf openblas

    # Install google test on Mac OS X
    # Download gtest 1.7.0
    wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
    tar -xvf release-1.7.0.tar.gz && cd googletest-release-1.7.0
    # Build gtest
    mkdir build && cd build && cmake ..
    make
    # Install gtest library
    sudo cp -r ../include/gtest /usr/local/include/
    sudo cp lib*.a /usr/local/lib
    ```

- **GPU Dependencies (optional)**

    To build the GPU version, you will need the following installed:

    1. a CUDA-capable GPU
    2. Mac OS X 10.11 or later
    3. the Clang compiler and toolchain installed using Xcode
    4. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
    5. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)

    The CUDA development environment relies on tight integration with the host development environment,
    including the host compiler and C runtime libraries, and is therefore only supported on
    distribution versions that have been qualified for this CUDA Toolkit release.

    1. After downloading the cuDNN library, issue the following commands:

        ```bash
        sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
        sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
        ```

    2. Then you need to set the DYLD_LIBRARY_PATH, CUDA_HOME and PATH environment variables in ~/.bashrc.

        ```bash
        export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
        export PATH=/usr/local/cuda/bin:$PATH
        ```

### Build and Install

As usual, the best option is to create a build folder under the paddle project directory.

```bash
mkdir build && cd build
cmake ..
```

CMake first checks PaddlePaddle's dependencies in the system default paths. After installing some optional
libraries, the corresponding build options will be set automatically (for instance, glog, gtest and gflags).
If a dependency is still not found, you can set it manually based on the CMake error information shown on your screen.

As a simple example, consider the following:

- **Only CPU**

    ```bash
    cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
    ```

- **GPU**

    ```bash
    cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
    ```

- **GPU with doc and swig**

    ```bash
    cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
    ```

Finally, you can build PaddlePaddle:

```bash
# you can add build options here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<installation path>
# please use sudo make install if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<installation path>/bin:$PATH
```

**Note:**

If you set `WITH_SWIG_PY=ON`, the related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install the python dependencies
the first time a user runs a paddle command, such as `paddle version` or `paddle train`.
This may require sudo privileges:

```bash
# you can run
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
```
\ No newline at end of file
@@ -20,6 +20,7 @@ limitations under the License. */
#include <string>
#include <vector>

#include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/TypeDefs.h"

/// Import PaddlePaddle's enumeration into global namespace.
using namespace paddle::enumeration_wrapper;  // NOLINT
......
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/utils/Util.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Flags.h"
+#include "paddle/utils/Excepts.h"
#include "paddle/parameter/Parameter.h"

#include <fenv.h>
......
@@ -15,6 +15,19 @@
try:
    from paddle_api_config import *
    import os.path
+   import platform
+
+   system = platform.system().lower()
+   is_osx = (system == 'darwin')
+   is_win = (system == 'windows')
+   is_lin = (system == 'linux')
+
+   if is_lin:
+       whole_start = "-Wl,--whole-archive"
+       whole_end = "-Wl,--no-whole-archive"
+   elif is_osx:
+       whole_start = ""
+       whole_end = ""
+
    LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
    PARENT_LIB_DIRS = ['proto']
@@ -56,9 +69,9 @@ try:
        def libs_str(self):
            libs = [
-               "-Wl,--whole-archive",
+               whole_start,
                "-lpaddle_gserver",
-               "-Wl,--no-whole-archive",
+               whole_end,
                "-lpaddle_pserver",
                "-lpaddle_trainer_lib",
                "-lpaddle_network",
......
@@ -16,28 +16,37 @@ limitations under the License. */
#ifndef HL_DEVICE_FUNCTIONS_CUH_
#define HL_DEVICE_FUNCTIONS_CUH_

-namespace hppl {
-
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-}  // namespace hppl
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
-#endif
+namespace paddle {
+
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
+
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
+
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+#endif
+}
+
+}  // namespace paddle

#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
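As context for the call-site changes below, a minimal kernel sketch (not from the commit; the kernel name and the header path, assumed from the include guard, are illustrative):

```cuda
#include "hl_device_functions.cuh"

// Each thread adds its input element into a single accumulator. The wrapper
// resolves to the hardware atomicAdd for float (and for double on sm_60+),
// and to the atomicCAS loop above for double on older architectures.
__global__ void sumInto(const double* in, double* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    paddle::paddleAtomicAdd(out, in[i]);
  }
}
```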
@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
  if (isBatch) {
    if (value.prevStateValue) {
-     if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-     if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+     if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+     if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
    }
-   if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+   if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
  } else {
    if (value.prevStateValue) {
      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
......
@@ -27,6 +27,8 @@ typedef float4 vecType;
typedef double2 vecType;
#endif
#else
+#include <mmintrin.h>
+#include <xmmintrin.h>
#include <emmintrin.h>
#ifndef HPPL_TYPE_DOUBLE
typedef __m128 vecType;
......
@@ -25,6 +25,9 @@ limitations under the License. */
#define VECTOR_LEN          4
#define VECTOR_SET          _mm_set_ps1
#else
+#if defined(__APPLE__) || defined(__OSX__)
+#define _mm_set_pd1         _mm_set1_pd
+#endif
/* number of double in vector */
#define VECTOR_LEN          2
#define VECTOR_SET          _mm_set_pd1
......
@@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0;
__thread bool g_sync_flag = true;
bool hl_start_flag = false;

-#define gettid() syscall(SYS_gettid)
+inline pid_t gettid() {
+#if defined(__APPLE__) || defined(__OSX__)
+  pid_t tid = syscall(SYS_thread_selfid);
+#else
+  #ifndef __NR_gettid
+  #define __NR_gettid 224
+  #endif
+  pid_t tid = syscall(__NR_gettid);
+#endif
+  CHECK_NE(tid, -1);
+  return tid;
+}

void hl_init(int device) {
    CHECK(hl_start_flag)
......
@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
    /* TODO: Temporary save & merger in another kernel */
    if (frameIdy == 1) {
-     if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+     if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
    } else if (frameIdy == 2) {
-     if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+     if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
    } else if (frameIdy == 3) {
-     if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+     if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
    }
  }
......
@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
        prevGradY[index] +=
          scale * grad[ty] * prevOutX[index] * reciprocal;
      } else {
-       atomicAdd(prevGradY + index,
+       paddle::paddleAtomicAdd(prevGradY + index,
          scale * grad[ty] * prevOutX[index] * reciprocal);
      }
    }
@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
          (prevOutX[index] * reciprocalXY -
           prevOutY[index] * reciprocalSquareSumY);
      } else {
-       atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+       paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
          (prevOutX[index] * reciprocalXY -
           prevOutY[index] * reciprocalSquareSumY));
      }
......
@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
      if (AddRow == 0) {
        outputData[i] += tableData[i];
      } else {
-       atomicAdd(&tableData[i], outputData[i]);
+       paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
      }
    }
  }
......
@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
    if (index_n_t < dimN) {
      real tmp;
      tmp = alpha*a_r*b_r[n];
-     atomicAdd(C_d_r, tmp);
+     paddle::paddleAtomicAdd(C_d_r, tmp);
      C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
      index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
    }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
    if (index_n_t < dimN) {
      real tmp;
      tmp = alpha*a_r*b_r[n];
-     atomicAdd(C_d_r, tmp);
+     paddle::paddleAtomicAdd(C_d_r, tmp);
      C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
      index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
    }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
    for (int n=0; n < CU_DM_CSR_N; n++) {
      if (index_m_t++ < dimM) {
        tmp = alpha * b_r * a_r[n];
-       atomicAdd(C_d_r, tmp);
+       paddle::paddleAtomicAdd(C_d_r, tmp);
        C_d_r += dimN;
      }
    }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
    for (int n=0; n < CU_DM_CSR_N; n++) {
      if (index_m_t++ < dimM) {
        tmp = alpha * b_r * a_r[n];
-       atomicAdd(C_d_r, tmp);
+       paddle::paddleAtomicAdd(C_d_r, tmp);
        C_d_r += dimN;
      }
    }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
  for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
    int colIdx = csr_col[idx];
    real val = csr_val[idx];
-   atomicAdd(a_val + colIdx, val);
+   paddle::paddleAtomicAdd(a_val + colIdx, val);
  }
}
......
@@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath(
  CHECK(nullptr != *dso_handle)
    << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
-   << dlPath.c_str() << " Please make sure you already specify its path."
-   << "Note: for training data on Cpu using Gpu version of PaddlePaddle,"
-   << "you must specify libcudart.so via LD_LIBRARY_PATH.";
+   << dlPath.c_str() << ". Please make sure you already specify its path. "
+   << "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
+   << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
+   << "export DYLD_LIBRARY_PATH for MAC OS.";
}

void GetCublasDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+#else
  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+#endif
}

void GetCudnnDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+#else
  GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+#endif
}

void GetCudartDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
+#else
  GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
+#endif
}

void GetCurandDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+#else
  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+#endif
}
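GetDsoHandleWithSearchPath itself is not shown in this hunk; a hedged sketch of the dlopen pattern it presumably wraps (the helper name, flags and library choice are illustrative, not the commit's actual implementation):

```cpp
#include <dlfcn.h>
#include <string>

// Pick the platform's shared-library suffix and load the library dynamically.
void* loadCublas(const std::string& dir) {
#if defined(__APPLE__) || defined(__OSX__)
  std::string path = dir + "/libcublas.dylib";
#else
  std::string path = dir + "/libcublas.so";
#endif
  return dlopen(path.c_str(), RTLD_LAZY | RTLD_GLOBAL);  // nullptr on failure
}
```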
@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  real *tab = table + tableId * ldt;
  for (int i = idx; i < dim; i += blockDimX) {
    if (AddRow) {
-     atomicAdd(&tab[i], out[i]);
+     paddle::paddleAtomicAdd(&tab[i], out[i]);
    } else {
      out[i] += tab[i];
    }
......
@@ -65,7 +65,8 @@ void DataProviderGroup<T>::reset() {
  provider_ = nullptr;

  // shuffle file list
- std::random_shuffle(fileList_.begin(), fileList_.end());
+ std::shuffle(fileList_.begin(), fileList_.end(),
+              ThreadLocalRandomEngine::get());

  startLoader();
  DataProvider::reset();
......
@@ -374,7 +374,8 @@ void ProtoDataProvider::reset() {
}

void ProtoDataProvider::shuffle() {
- std::random_shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end());
+ std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
+              ThreadLocalRandomEngine::get());
}

/*
......
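std::random_shuffle is deprecated in C++14 (removed in C++17) and its randomness source is unspecified, which is why these call sites switch to std::shuffle with an explicit engine. A standalone sketch of the same pattern, using std::mt19937 in place of Paddle's ThreadLocalRandomEngine:

```cpp
#include <algorithm>
#include <random>
#include <vector>

void shuffleIds(std::vector<int>& ids) {
  // One engine per thread, seeded once; std::shuffle requires the engine
  // to be passed explicitly instead of relying on rand().
  static thread_local std::mt19937 engine{std::random_device{}()};
  std::shuffle(ids.begin(), ids.end(), engine);
}
```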
@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/utils/PythonUtil.h"
#include <fenv.h>
#include "paddle/utils/Util.h"
+#include "paddle/utils/Excepts.h"

namespace paddle {
@@ -44,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
}

void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
- int feFlag = fegetexcept();
  VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
  classInstance_ =
      createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
@@ -55,7 +56,7 @@ void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
  std::string headerInfo =
      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
  parseHeaderData(headerInfo);
- feenableexcept(feFlag);
+ feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
}

void PyDataProvider::parseHeaderData(const std::string& headerData) {
......
@@ -385,17 +385,17 @@ void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
  }
}

-extern NeuralNetwork* newCustomNeuralNetwork(
+extern NeuralNetwork* newCustomNerualNetwork(
    const std::string& name, NeuralNetwork* network) __attribute__((weak));

NeuralNetwork* NeuralNetwork::newNeuralNetwork(
    const std::string& name,
    NeuralNetwork* rootNetwork) {
- if (newCustomNeuralNetwork) {
-   return newCustomNeuralNetwork(name, rootNetwork);
+ if (newCustomNerualNetwork) {
+   return newCustomNerualNetwork(name, rootNetwork);
  } else {
    return new NeuralNetwork(name, rootNetwork);
  }
}

}  // namespace paddle
@@ -16,9 +16,9 @@
from paddle.trainer_config_helpers import *

-settings(batch_size=1000)
+settings(batch_size=300)

-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)

# emb1 is equal to emb2, note that bias_attr=false
# and act=LinearActivation() in default.
......
@@ -16,9 +16,9 @@
from paddle.trainer_config_helpers import *

-settings(batch_size=1000)
+settings(batch_size=300)

-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)

proj1 = table_projection(input=data, size=128)
......
@@ -50,7 +50,7 @@ TEST(Operator, dot_mul) {
TEST(Projection, context) {
  for (auto contextStart : {-5, -3, -1, 0, 3}) {
    for (auto contextLength : {1, 2, 5, 7}) {
-     for (auto batchSize : {1, 2, 5, 20, 100}) {
+     for (auto batchSize : {1, 2, 5, 20, 50}) {
        for (auto trainablePadding : {false, true}) {
          LOG(INFO) << " contextStart=" << contextStart
                    << " contextLength=" << contextLength
......
@@ -321,7 +321,7 @@ TEST(PyDataProvider2, input_order) {
    if (!realBatchSize) {
      break;
    }
-   ASSERT_EQ(batch.getStreams().size(), 2);
+   ASSERT_EQ(batch.getStreams().size(), (size_t)2);
    for (size_t i = 0; i < realBatchSize; ++i) {
      ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
      ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
......
@@ -16,7 +16,7 @@ limitations under the License. */
#pragma once

#include <mutex>
-#include <malloc.h>
+#include <stdlib.h>
#include "hl_gpu.h"
#include "paddle/utils/Logging.h"
@@ -48,9 +48,10 @@ public:
   * @return Pointer to the allocated memory
   */
  virtual void* alloc(size_t size) {
-   void* ptr = memalign(32ul, size);
-   CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
-   return ptr;
+   void* ptr;
+   CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+   CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
+   return ptr;
  }

  /**
......
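memalign() and the <malloc.h> header are glibc-specific and not available on Mac OS X, which is why the allocator (and the tests further down) move to posix_memalign. The portable pattern in isolation, as a small sketch:

```cpp
#include <cstdlib>  // posix_memalign, free

// Allocate `size` bytes with 32-byte alignment; returns nullptr on failure.
// The alignment must be a power of two and a multiple of sizeof(void*).
void* alignedAlloc(std::size_t size) {
  void* ptr = nullptr;
  if (posix_memalign(&ptr, 32, size) != 0) {
    return nullptr;
  }
  return ptr;  // release with free(ptr)
}
```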
@@ -23,6 +23,8 @@ extern "C" {
}
#endif

+#include <cmath>
+
namespace paddle {

template<class T>
......
@@ -2514,7 +2514,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
    for (int k = 0; k < blockNum_; ++k) {
      blockSeq.push_back(k);
    }
-   std::random_shuffle(blockSeq.begin(), blockSeq.end());
+   std::shuffle(blockSeq.begin(), blockSeq.end(),
+                ThreadLocalRandomEngine::get());
  }
  std::vector<int>& localBufRows = *localBufRows_;
  int* cols = a->getCols();
......
@@ -19,6 +19,7 @@ limitations under the License. */
#include <mutex>
#include <vector>
#include <unordered_map>
+#include <map>
#include "Allocator.h"

namespace paddle {
......
@@ -25,8 +25,8 @@ namespace paddle {
// Initialization StorageEngine singleton.
// Other modules may rely on storage management,
// so StorageEngine need to be initialized before other modules.
-static InitFunction __init_storage_engine(
-    StorageEngine::singleton, std::numeric_limits<int>::max());
+static InitFunction __init_storage_engine([](){StorageEngine::singleton();},
+                                           std::numeric_limits<int>::max());

StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {
}
......
@@ -24,7 +24,7 @@ limitations under the License. */
#include <algorithm>
#include <memory>
-#include <malloc.h>
+#include <stdlib.h>
#include <time.h>

static constexpr size_t VECTOR_LEN = 3072;
@@ -37,7 +37,9 @@ static std::mt19937 RandomEngine(time(0));
inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN,
                                                 size_t align = ALIGN) {
- return std::unique_ptr<float[]>((float*)memalign(align, len * sizeof(float)));
+ float* ptr;
+ CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0);
+ return std::unique_ptr<float[]>(ptr);
}

inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN,
......
@@ -124,8 +124,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a,
        if (a->getValueType() == FLOAT_VALUE) {
          real aVal = a->getValue()[r];
          real bVal = b->getValue()[r];
-         if (fabs(aVal - bVal) > err) {
-           if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) {
+         if (std::abs(aVal - bVal) > err) {
+           if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
              LOG(INFO) << "a=" << aVal << "\t" << "b=" << bVal;
              count++;
            }
@@ -141,8 +141,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a,
        if (a->getValueType() == FLOAT_VALUE) {
          real aVal = a->getValue()[r];
          real bVal = b->getValue()[r];
-         if (fabs(aVal - bVal) > err) {
-           if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) {
+         if (std::abs(aVal - bVal) > err) {
+           if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
              count++;
            }
          }
@@ -173,8 +173,8 @@ void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) {
    for (int j = 0; j < width; j++) {
      real a = data1[i * width + j];
      real b = data2[i * width + j];
-     if (fabs(a - b) > err) {
-       if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
+     if (std::abs(a - b) > err) {
+       if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) {
          count++;
        }
      }
......
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

-#include <malloc.h>
+#include <stdlib.h>
#include <paddle/utils/Util.h>
#include <gtest/gtest.h>
@@ -124,9 +124,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer,
TEST_F(CommonTest, sgdUpdate) {
  const size_t alignHeader[] = {0, 2, 3, 5, 7, 8};
  for (auto& size : sizeVec_) {
-   real* gradientBuffer = (real*)memalign(32, sizeof(real) * size);
-   real* valueBuffer = (real*)memalign(32, sizeof(real) * size);
-   real* momentumBuffer = (real*)memalign(32, sizeof(real) * size);
+   real *gradientBuffer, *valueBuffer, *momentumBuffer;
+   CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size),
+            0);
+   CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
+   CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
+            0);
    for (size_t i = 0; i < size; i++) {
      gradientBuffer[i] = 1.0;
      valueBuffer[i] = 2.0;
......
@@ -17,6 +17,7 @@ limitations under the License. */
#include <sys/socket.h>
#include <netdb.h>
#include <netinet/in.h>
+#include <netinet/tcp.h>
#include <fcntl.h>
#include <arpa/inet.h>
@@ -24,7 +25,6 @@ limitations under the License. */
#include <net/if.h>
#include <net/if_arp.h>
#include <sstream>
-#include <linux/tcp.h>

#include "LightNetwork.h"
#include "paddle/utils/Util.h"
@@ -79,6 +79,7 @@ std::string getIpAddr(std::string &device) {
 * @note adjust some default sock option for better performance
 */
void setOption(int sockfd) {
+#if !defined(__APPLE__) && !defined(__OSX__)
  int sendSize = FLAGS_sock_send_buf_size;
  int recvSize = FLAGS_sock_recv_buf_size;
  CHECK_GE(
@@ -87,15 +88,19 @@ void setOption(int sockfd) {
  CHECK_GE(
      setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)),
      0);
+#endif

  if (FLAGS_small_messages) {
    int optval = 1;
    CHECK_GE(
        setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
        0);
+#ifdef TCP_QUICKACK
    optval = 1;
    CHECK_GE(
        setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)),
        0);
+#endif
  }

  int reuse = 1;
  CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)),
@@ -340,17 +345,27 @@ void SocketWorker::run() {
 */
void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
  struct sockaddr_in serv_addr;
- struct hostent hostinfo, *server;
- char buf[1024];  // temp for gethostbyname_r
+ struct hostent *server;
  int errRet;  // temp for gethostbyname_r

  /// Create a socket point
  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
  PCHECK(sockfd >= 0) << "ERROR opening socket";
- CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf),
-                             &server, &errRet))
-     << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
- CHECK(server) << "gethostbyname_r err";
+
+#if defined(__OSX__) || defined(__APPLE__)
+ server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
+ CHECK_NE(HOST_NOT_FOUND, errRet)
+     << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
+ CHECK(server) << "getipnodebyname error!";
+#else
+ struct hostent hostinfo;
+ char buf[1024];  // temp for gethostbyname_r
+ CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf),
+                             &server, &errRet))
+     << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
+ CHECK(server) << "gethostbyname_r error!";
+#endif

  bzero((char *)&serv_addr, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
......
@@ -27,6 +27,15 @@ limitations under the License. */

namespace paddle {

+/**
+ * UIO_MAXIOV is documented in writev(2), but <sys/uio.h> only
+ * declares it on osx/ios if defined(KERNEL)
+ */
+#ifndef UIO_MAXIOV
+#define UIO_MAXIOV 512
+#endif
+
SocketChannel::~SocketChannel() {
  if (tcpRdma_ == F_TCP)
    close(tcpSocket_);
@@ -148,8 +157,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
  std::vector<iovec> iovs;
  iovs.reserve(userIovs.size() + 2);
  iovs.push_back({&header, sizeof(header)});
- iovs.push_back({&iovLengths[0],
-                 sizeof(iovLengths[0]) * (size_t) header.numIovs});
+ iovs.push_back({&iovLengths[0], sizeof(iovLengths[0]) * header.numIovs});
  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());

  header.totalLength = 0;
......
@@ -17,6 +17,14 @@
from setuptools import setup, Extension
import numpy as np
import api.paddle_ld_flags
+import platform
+
+system = platform.system().lower()
+
+is_osx = (system == 'darwin')
+is_win = (system == 'windows')
+is_lin = (system == 'linux')
+
# The extra links will passed from COMAKE
#   because generate paddle LDFLAGS is too complicated to do in setup.py
@@ -34,17 +42,24 @@ try:
except:
  pass

+if is_lin == True:
+  extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"]
+elif is_osx == True:
+  extra_links = ["-Wl,-all_load"] + extra_links
+
+include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
+
setup(name="py_paddle",
  version="@PADDLE_VERSION@",
  ext_modules=[
    Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
      ['Paddle_wrap.cxx'],
-     extra_link_args=["-Xlinker", '-start-group'] +
-                     extra_links + ["-Xlinker", "-end-group"]
+     include_dirs = include_dirs,
+     extra_link_args = extra_links
    )
  ],
  packages=['py_paddle'],
- include_dirs = [np.get_include(), "../"],    # include numpy and paddle.
+ include_dirs = include_dirs,
  install_requires = [
    'numpy>=1.8.0',      # The numpy is required.
    'protobuf>=2.4.1'    # The paddle protobuf version
......
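The platform split above exists because Apple's linker has no --start-group/--whole-archive; -Wl,-all_load (or per-archive -force_load, as in the CMake helper earlier in this commit) plays the equivalent role. A hedged command-line illustration with made-up object and archive names:

```bash
# GNU ld (Linux): keep every object from the listed static archives
g++ main.o -Wl,--whole-archive libfoo.a libbar.a -Wl,--no-whole-archive -o app
# Apple ld (Mac OS X): load all members of all archives on the link line
clang++ main.o libfoo.a libbar.a -Wl,-all_load -o app
```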
@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h"
+#include "paddle/utils/Excepts.h"
#include "paddle/utils/GlobalConstants.h"

#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
......
@@ -16,6 +16,7 @@ limitations under the License. */
#include <fenv.h>
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Excepts.h"
#include "paddle/pserver/ParameterServer2.h"

#include "ParamUtil.h"
......
@@ -146,12 +146,12 @@ TEST(compareSparse, remote_cpu) {
TEST(compareSparse, cpu10_local_vs_remote) {
  FLAGS_local = 1;  // disable remote sparse update in parameter config
  std::vector<ParameterPtr> localParameters =
-     trainerOnePassTest(configFile1, true, 10);
+     trainerOnePassTest(configFile1, true, 2);

  FLAGS_local = 0;  // will enable remote sparse update
  FLAGS_ports_num_for_sparse = 5;
  std::vector<ParameterPtr> remoteParameters =
-     trainerOnePassTest(configFile1, true, 10);
+     trainerOnePassTest(configFile1, true, 2);

  compareValue(localParameters, remoteParameters);
}
@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
      FLAGS_parallel_nn = useGpu;
      LOG(INFO) << " local=" << local
                << " useGpu=" << useGpu;
-     int trainerCount = useGpu ? numGpu : 10;
+     int trainerCount = useGpu ? numGpu : 2;
      std::vector<ParameterPtr> parameters =
          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
      compareValue(getDenseParameters(), parameters, eps);
......
@@ -62,7 +62,11 @@ TEST(checkGradient, multiGpu) {
  }
}

-TEST(checkGradient, parallel) { checkGradientTest(configFile4, true, true); }
+TEST(checkGradient, parallel) {
+  if (hl_get_device_count() >= 2) {
+    checkGradientTest(configFile4, true, true);
+  }
+}

TEST(checkGradient, multiParallel) {
  FLAGS_allow_only_one_model_on_one_gpu = false;
@@ -90,7 +94,11 @@ TEST(checkGradient, multi) {
TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }

TEST(checkGradient, chunk) {
+#if defined(__APPLE__) || defined (__OSX__)
+  EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py"));
+#else
  EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py"));
+#endif
  checkGradientTest(configFile3, false, false);
#ifndef PADDLE_ONLY_CPU
  checkGradientTest(configFile3, true, true);
......
@@ -82,7 +82,11 @@ TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }

TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); }

-TEST(trainerOnePass, parallel) { trainerOnePassTest(configFile2, true, true); }
+TEST(trainerOnePass, parallel) {
+  if (hl_get_device_count() >= 2) {
+    trainerOnePassTest(configFile2, true, true);
+  }
+}
#endif

// 2. test average_window.
......
@@ -2,12 +2,18 @@
file(GLOB UTIL_HEADERS . *.h)
file(GLOB UTIL_SOURCES . *.cpp)

+if(APPLE)
+    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
+else()
+    file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
+endif()
+
add_library(paddle_utils STATIC
-    ${UTIL_SOURCES})
+    ${UTIL_SOURCES}
+    ${UTIL_ARCH_SOURCES})

add_style_check_target(paddle_utils ${UTIL_HEADERS})
-add_style_check_target(paddle_utils ${UTIL_SOURCES})
+add_style_check_target(paddle_utils ${UTIL_SOURCES}
+    ${UTIL_ARCH_SOURCES})

add_dependencies(paddle_utils gen_proto_cpp)

if(WITH_TESTING)
    add_subdirectory(tests)
endif()
\ No newline at end of file
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Excepts.h"
#if defined(__APPLE__) || defined(__OSX__)
#include <fenv.h>
int fegetexcept(void) {
static fenv_t fenv;
return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
}
int feenableexcept(unsigned int excepts) {
static fenv_t fenv;
unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
if ( fegetenv (&fenv) ) return -1;
old_excepts = fenv.__control & FE_ALL_EXCEPT;
// unmask
fenv.__control &= ~new_excepts;
fenv.__mxcsr &= ~(new_excepts << 7);
return ( fesetenv (&fenv) ? -1 : old_excepts );
}
int fedisableexcept(unsigned int excepts) {
static fenv_t fenv;
unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
if ( fegetenv (&fenv) ) return -1;
old_excepts = fenv.__control & FE_ALL_EXCEPT;
// mask
fenv.__control |= new_excepts;
fenv.__mxcsr |= new_excepts << 7;
return ( fesetenv (&fenv) ? -1 : old_excepts );
}
#endif
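OS X's libm does not provide feenableexcept/fedisableexcept, so the new Excepts.cpp emulates them by editing the x87 control word and the SSE MXCSR mask directly, much as glibc's x86 implementation does. A minimal usage sketch, assuming the emulation above is linked in; the function withFpeTraps is illustrative only:

#include <fenv.h>
#include "Excepts.h"

// Illustrative only: trap the usual debugging suspects, run numeric code,
// then restore the previous trap mask.
void withFpeTraps() {
  int oldMask = feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
  // ... numeric code; a division by zero here raises SIGFPE instead of
  //     silently producing inf ...
  fedisableexcept(FE_ALL_EXCEPT);
  if (oldMask > 0) feenableexcept(oldMask);   // put the old mask back
}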
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef EXCEPTS_H_
#define EXCEPTS_H_
#if defined(__APPLE__) || defined(__OSX__)
int fegetexcept(void);
int feenableexcept(unsigned int excepts);
int fedisableexcept(unsigned int excepts);
#endif
#endif // EXCEPTS_H_
...@@ -16,13 +16,12 @@ limitations under the License. */ ...@@ -16,13 +16,12 @@ limitations under the License. */
#pragma once #pragma once
#include <pthread.h> #include <pthread.h>
#include <semaphore.h>
#include <sys/time.h> #include <sys/time.h>
#include <unistd.h>
#include <condition_variable> #include <condition_variable>
#include <mutex> #include <mutex>
#include "DisableCopy.h"
namespace paddle { namespace paddle {
/** /**
...@@ -98,35 +97,44 @@ protected: ...@@ -98,35 +97,44 @@ protected:
 * which means it will keep trying to lock until it locks successfully. * which means it will keep trying to lock until it locks successfully.
* The SpinLock disable copy. * The SpinLock disable copy.
*/ */
class SpinLockPrivate;
class SpinLock { class SpinLock {
public: public:
SpinLock() { pthread_spin_init(&lock_, 0); } DISABLE_COPY(SpinLock);
~SpinLock() { pthread_spin_destroy(&lock_); } SpinLock();
SpinLock(const SpinLock&) = delete; ~SpinLock();
SpinLock& operator=(const SpinLock&) = delete;
// std::mutex interface // std::mutex interface
void lock() { pthread_spin_lock(&lock_); } void lock();
void unlock() { pthread_spin_unlock(&lock_); } void unlock();
protected: private:
pthread_spinlock_t lock_; SpinLockPrivate* m;
char padding_[64 - sizeof(pthread_spinlock_t)];
}; };
/** /**
 * A simple wrapper of semaphore which can only be shared in the same process. * A simple wrapper of semaphore which can only be shared in the same process.
*/ */
class SemaphorePrivate;
class Semaphore { class Semaphore {
public:
//! Disable copy & assign
Semaphore(const Semaphore& other) = delete;
Semaphore& operator= (const Semaphore&& other) = delete;
//! Enable move.
Semaphore(Semaphore&& other): m(std::move(other.m)) {
}
public: public:
/** /**
* @brief Construct Function. * @brief Construct Function.
* @param[in] initValue the initial value of the * @param[in] initValue the initial value of the
* semaphore, default 0. * semaphore, default 0.
*/ */
explicit Semaphore(int initValue = 0) { sem_init(&sem_, 0, initValue); } explicit Semaphore(int initValue = 0);
~Semaphore() { sem_destroy(&sem_); } ~Semaphore();
/** /**
* @brief The same as wait(), except if the decrement can not * @brief The same as wait(), except if the decrement can not
...@@ -136,41 +144,38 @@ public: ...@@ -136,41 +144,38 @@ public:
 * @return true if the decrement proceeds before ts, * @return true if the decrement proceeds before ts,
* else return false. * else return false.
*/ */
bool timeWait(struct timespec* ts) { return (0 == sem_timedwait(&sem_, ts)); } bool timeWait(struct timespec* ts);
/** /**
 * @brief decrement the semaphore. If the semaphore's value is 0, the call blocks. * @brief decrement the semaphore. If the semaphore's value is 0, the call blocks.
*/ */
void wait() { sem_wait(&sem_); } void wait();
/** /**
* @brief increment the semaphore. If the semaphore's value * @brief increment the semaphore. If the semaphore's value
* greater than 0, wake up a thread blocked in wait(). * greater than 0, wake up a thread blocked in wait().
*/ */
void post() { sem_post(&sem_); } void post();
protected: private:
sem_t sem_; SemaphorePrivate* m;
}; };
static_assert(sizeof(SpinLock) == 64, "Wrong padding");
/** /**
* A simple wrapper of thread barrier. * A simple wrapper of thread barrier.
* The ThreadBarrier disable copy. * The ThreadBarrier disable copy.
*/ */
class ThreadBarrierPrivate;
class ThreadBarrier { class ThreadBarrier {
public: public:
DISABLE_COPY(ThreadBarrier);
/** /**
* @brief Construct Function. Initialize the barrier should * @brief Construct Function. Initialize the barrier should
* wait for count threads in wait(). * wait for count threads in wait().
*/ */
explicit ThreadBarrier(int count) { explicit ThreadBarrier(int count);
pthread_barrier_init(&barrier_, NULL, count); ~ThreadBarrier();
}
~ThreadBarrier() { pthread_barrier_destroy(&barrier_); }
ThreadBarrier(const ThreadBarrier&) = delete;
ThreadBarrier& operator=(const ThreadBarrier&) = delete;
/** /**
* @brief . * @brief .
...@@ -178,10 +183,10 @@ public: ...@@ -178,10 +183,10 @@ public:
* then wake up all the count - 1 threads and continue run together. * then wake up all the count - 1 threads and continue run together.
 * Else block the thread until woken by another thread. * Else block the thread until woken by another thread.
*/ */
void wait() { pthread_barrier_wait(&barrier_); } void wait();
protected: private:
pthread_barrier_t barrier_; ThreadBarrierPrivate* m;
}; };
/** /**
......
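The Locks.h rewrite moves SpinLock, Semaphore, and ThreadBarrier to a pimpl layout: the header only forward-declares the *Private classes, and each platform supplies its own definitions under arch/linux or arch/osx (shown further down). The sizeof/padding static_assert goes away because the padded spinlock now lives behind a pointer. Call sites keep compiling unchanged; a small usage sketch, assuming the header above is installed as paddle/utils/Locks.h (the bump() example is made up):

#include <mutex>   // std::lock_guard
#include "paddle/utils/Locks.h"

// Illustrative call site: SpinLock still models BasicLockable
// (lock()/unlock()), so std::lock_guard works with either backend.
static paddle::SpinLock g_lock;
static int g_counter = 0;

void bump() {
  std::lock_guard<paddle::SpinLock> guard(g_lock);
  ++g_counter;
}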
...@@ -18,6 +18,12 @@ limitations under the License. */ ...@@ -18,6 +18,12 @@ limitations under the License. */
#ifndef PADDLE_NO_PYTHON #ifndef PADDLE_NO_PYTHON
// must include the following two blocks, otherwise, // must include the following two blocks, otherwise,
// gcc compiler may produce warning // gcc compiler may produce warning
#ifdef __APPLE__
#define _POSIX_SOURCE
#define _POSIX_C_SOURCE 200809L
#define _XOPEN_SOURCE 700
#endif
#ifdef _POSIX_C_SOURCE #ifdef _POSIX_C_SOURCE
#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE #define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE #undef _POSIX_C_SOURCE
...@@ -28,12 +34,7 @@ limitations under the License. */ ...@@ -28,12 +34,7 @@ limitations under the License. */
#endif #endif
#include <Python.h> #include <Python.h>
#include <frameobject.h> #include <frameobject.h>
#ifndef _POSIX_C_SOURCE
#warning "no _POSIX_C_SOURCE defined in Python.h"
#endif
#ifndef _XOPEN_SOURCE
#warning "no _XOPEN_SOURCE defined in Python.h"
#endif
#endif #endif
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
......
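On OS X, Python.h defines _POSIX_C_SOURCE and _XOPEN_SOURCE itself, so the patch pre-defines them (and drops the old #warning checks) to keep the existing save/undef/restore dance quiet. The general shape of that dance, reduced to a single macro; this is a simplified sketch, not the literal header, and the 200809L fallback is only an illustration of the value the patch chooses:

// Simplified sketch of the feature-test-macro guard around Python.h.
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE          // let Python.h supply its own value
#endif
#include <Python.h>
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L // fall back if Python.h left it undefined
#endif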
...@@ -13,24 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,24 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "Stat.h" #include "Stat.h"
#include "Util.h"
#include <sys/syscall.h> // for syscall()
#include <sys/types.h>
#include <iomanip> #include <iomanip>
#include <algorithm> #include <algorithm>
namespace paddle { namespace paddle {
// return the thread id used by glog
pid_t getTID() {
#ifndef __NR_gettid
#define __NR_gettid 224
#endif
pid_t tid = syscall(__NR_gettid);
CHECK_NE(tid, -1);
return tid;
}
StatSet globalStat("GlobalStatInfo"); StatSet globalStat("GlobalStatInfo");
void Stat::addSample(uint64_t value) { void Stat::addSample(uint64_t value) {
......
...@@ -13,13 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "Util.h"
#include "Logging.h" #include "Logging.h"
#include <thread> #include <thread>
#include <sys/syscall.h>
#include <unistd.h>
inline pid_t gettid() { return syscall(SYS_gettid); }
#include "Queue.h" #include "Queue.h"
#include "ThreadLocal.h" #include "ThreadLocal.h"
...@@ -175,7 +172,7 @@ public: ...@@ -175,7 +172,7 @@ public:
jobFinishBarrier_(numWorkers + 1), jobFinishBarrier_(numWorkers + 1),
jobFunc_(nullptr), jobFunc_(nullptr),
checkOwner_(checkOwner) { checkOwner_(checkOwner) {
ownerThreadId_ = ::gettid(); ownerThreadId_ = getTID();
workers_.resize(numWorkers); workers_.resize(numWorkers);
start(); start();
} }
...@@ -199,7 +196,7 @@ public: ...@@ -199,7 +196,7 @@ public:
*/ */
void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) { void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) {
if (checkOwner_) { if (checkOwner_) {
CHECK_EQ(ownerThreadId_, ::gettid()) CHECK_EQ(ownerThreadId_, getTID())
<< "this sync thread pool should be used in one thread"; << "this sync thread pool should be used in one thread";
} }
......
...@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "Util.h"
#include "ThreadLocal.h" #include "ThreadLocal.h"
#include "Thread.h"
#include "CommandLineParser.h" #include "CommandLineParser.h"
P_DEFINE_bool(thread_local_rand_use_global_seed, false, P_DEFINE_bool(thread_local_rand_use_global_seed, false,
...@@ -31,11 +29,11 @@ unsigned int* ThreadLocalRand::getSeed() { ...@@ -31,11 +29,11 @@ unsigned int* ThreadLocalRand::getSeed() {
if (!p) { // init seed if (!p) { // init seed
if (FLAGS_thread_local_rand_use_global_seed) { if (FLAGS_thread_local_rand_use_global_seed) {
p = new unsigned int(defaultSeed_); p = new unsigned int(defaultSeed_);
} else if (getpid() == gettid()) { // main thread } else if (getpid() == getTID()) { // main thread
// deterministic, but differs from global srand() // deterministic, but differs from global srand()
p = new unsigned int(defaultSeed_ - 1); p = new unsigned int(defaultSeed_ - 1);
} else { } else {
p = new unsigned int(defaultSeed_ + gettid()); p = new unsigned int(defaultSeed_ + getTID());
LOG(INFO) << "thread use undeterministic rand seed:" << *p; LOG(INFO) << "thread use undeterministic rand seed:" << *p;
} }
seed_.set(p); seed_.set(p);
...@@ -51,7 +49,7 @@ std::default_random_engine& ThreadLocalRandomEngine::get() { ...@@ -51,7 +49,7 @@ std::default_random_engine& ThreadLocalRandomEngine::get() {
int defaultSeed = ThreadLocalRand::getDefaultSeed(); int defaultSeed = ThreadLocalRand::getDefaultSeed();
engine->seed(FLAGS_thread_local_rand_use_global_seed engine->seed(FLAGS_thread_local_rand_use_global_seed
? defaultSeed ? defaultSeed
: defaultSeed + gettid()); : defaultSeed + getTID());
engine_.set(engine); engine_.set(engine);
} }
return *engine; return *engine;
......
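With the ad-hoc gettid() removed from Thread.h, the thread-local RNG seeds now come from the shared getTID() in Util. The seeding policy, condensed into one function; the names are taken from the diff, but the standalone helper itself is hypothetical:

#include <unistd.h>              // getpid()
#include "paddle/utils/Util.h"   // paddle::getTID()

// Hypothetical condensation of the seeding logic in ThreadLocalRand::getSeed().
unsigned int makeThreadSeed(unsigned int defaultSeed, bool useGlobalSeed) {
  if (useGlobalSeed) return defaultSeed;                      // all threads agree
  if (getpid() == paddle::getTID()) return defaultSeed - 1;   // main thread
  return defaultSeed + paddle::getTID();                      // workers differ per tid
}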
...@@ -156,7 +156,15 @@ private: ...@@ -156,7 +156,15 @@ private:
static void dataDestructor(void* p) { delete (T*)p; } static void dataDestructor(void* p) { delete (T*)p; }
void updateMap(T* p) { void updateMap(T* p) {
pid_t tid = syscall(SYS_gettid); #if defined(__APPLE__) || defined(__OSX__)
pid_t tid = syscall(SYS_thread_selfid);
#else
#ifndef __NR_gettid
#define __NR_gettid 224
#endif
pid_t tid = syscall(__NR_gettid);
#endif
CHECK_NE(tid, -1);
std::lock_guard<std::mutex> guard(mutex_); std::lock_guard<std::mutex> guard(mutex_);
auto ret = threadMap_.insert(std::make_pair(tid, p)); auto ret = threadMap_.insert(std::make_pair(tid, p));
if (!ret.second) { if (!ret.second) {
......
...@@ -93,6 +93,19 @@ static void installProfilerSwitch() {} ...@@ -93,6 +93,19 @@ static void installProfilerSwitch() {}
namespace paddle { namespace paddle {
pid_t getTID() {
#if defined(__APPLE__) || defined(__OSX__)
pid_t tid = syscall(SYS_thread_selfid);
#else
#ifndef __NR_gettid
#define __NR_gettid 224
#endif
pid_t tid = syscall(__NR_gettid);
#endif
CHECK_NE(tid, -1);
return tid;
}
static bool g_initialized = false; static bool g_initialized = false;
typedef std::pair<int, std::function<void()>> PriorityFuncPair; typedef std::pair<int, std::function<void()>> PriorityFuncPair;
typedef std::vector<PriorityFuncPair> InitFuncList; typedef std::vector<PriorityFuncPair> InitFuncList;
......
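getTID() now lives in Util.cpp with an OS X branch: SYS_thread_selfid is the raw syscall returning the calling thread's unique id, while Linux keeps __NR_gettid. For completeness, Apple's supported (non-syscall) route to the same number is pthread_threadid_np; a hedged alternative sketch, not what the patch uses:

#include <cstdint>
#include <pthread.h>
#if !defined(__APPLE__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

// Illustrative alternative to the syscall-based getTID() above.
inline uint64_t portableThreadId() {
#if defined(__APPLE__)
  uint64_t tid = 0;
  pthread_threadid_np(nullptr, &tid);   // Apple's wrapper for the thread id
  return tid;
#else
  return static_cast<uint64_t>(syscall(SYS_gettid));
#endif
}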
...@@ -24,6 +24,8 @@ limitations under the License. */ ...@@ -24,6 +24,8 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <mutex> #include <mutex>
#include <functional> #include <functional>
#include <sys/syscall.h> // for syscall()
#include <sys/types.h>
#include "CommandLineParser.h" #include "CommandLineParser.h"
#include "Logging.h" #include "Logging.h"
...@@ -63,6 +65,9 @@ limitations under the License. */ ...@@ -63,6 +65,9 @@ limitations under the License. */
namespace paddle { namespace paddle {
// return the thread id used by glog
pid_t getTID();
/** /**
* return the 1-based index of the highest bit set * return the 1-based index of the highest bit set
* *
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Locks.h"
#include <semaphore.h>
#include <unistd.h>
namespace paddle {
class SemaphorePrivate {
public:
sem_t sem;
};
Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) {
sem_init(&m->sem, 0, initValue);
}
Semaphore::~Semaphore() {
sem_destroy(&m->sem);
}
bool Semaphore::timeWait(struct timespec* ts) {
return (0 == sem_timedwait(&m->sem, ts));
}
void Semaphore::wait() {
sem_wait(&m->sem);
}
void Semaphore::post() {
sem_post(&m->sem);
}
class SpinLockPrivate {
public:
inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
pthread_spinlock_t lock_;
char padding_[64 - sizeof(pthread_spinlock_t)];
};
SpinLock::SpinLock():m(new SpinLockPrivate()) {}
SpinLock::~SpinLock() { delete m; }
void SpinLock::lock() {
pthread_spin_lock(&m->lock_);
}
void SpinLock::unlock() {
pthread_spin_unlock(&m->lock_);
}
class ThreadBarrierPrivate {
public:
pthread_barrier_t barrier_;
};
ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) {
pthread_barrier_init(&m->barrier_, nullptr, count);
}
ThreadBarrier::~ThreadBarrier() {
pthread_barrier_destroy(&m->barrier_);
delete m;
}
void ThreadBarrier::wait() {
pthread_barrier_wait(&m->barrier_);
}
} // namespace paddle
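arch/linux/Locks.cpp simply moves the old pthread spinlock, POSIX semaphore, and pthread barrier bodies behind the new *Private structs. A minimal usage sketch (not part of the patch) of the Semaphore as a one-shot hand-off between two threads; it compiles identically against either backend:

#include <thread>
#include "paddle/utils/Locks.h"

// Illustrative only: wait for a worker thread to finish one unit of work.
void handOffExample() {
  paddle::Semaphore done(0);          // starts at 0, so wait() blocks
  std::thread worker([&done] {
    // ... produce something ...
    done.post();                      // signal the consumer
  });
  done.wait();                        // blocks until the worker posts
  worker.join();
}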
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Locks.h"
#include "paddle/utils/Logging.h"
#include <dispatch/dispatch.h>
#include <libkern/OSAtomic.h>
namespace paddle {
class SemaphorePrivate {
public:
~SemaphorePrivate() {
dispatch_release(sem);
}
dispatch_semaphore_t sem;
};
Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) {
m->sem = dispatch_semaphore_create(initValue);
}
Semaphore::~Semaphore() {
delete m;
}
bool Semaphore::timeWait(timespec *ts) {
dispatch_time_t tm = dispatch_walltime(ts, 0);
return (0 == dispatch_semaphore_wait(m->sem, tm));
}
void Semaphore::wait() {
dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
}
void Semaphore::post() {
dispatch_semaphore_signal(m->sem);
}
class SpinLockPrivate {
public:
SpinLockPrivate(): lock_(OS_SPINLOCK_INIT) {}
OSSpinLock lock_;
char padding_[64 - sizeof(OSSpinLock)]; // Padding to cache line size
};
SpinLock::SpinLock(): m(new SpinLockPrivate()) {}
SpinLock::~SpinLock() { delete m; }
void SpinLock::lock() {
OSSpinLockLock(&m->lock_);
}
void SpinLock::unlock() {
OSSpinLockUnlock(&m->lock_);
}
class ThreadBarrierPrivate {
public:
pthread_mutex_t mutex_;
pthread_cond_t cond_;
int count_;
int tripCount_;
inline explicit ThreadBarrierPrivate(int cnt):count_(0), tripCount_(cnt) {
CHECK_NE(cnt, 0);
CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
CHECK_GE(pthread_cond_init(&cond_, 0), 0);
}
inline ~ThreadBarrierPrivate() {
pthread_cond_destroy(&cond_);
pthread_mutex_destroy(&mutex_);
}
/**
* @brief wait
* @return true if the last wait
*/
inline bool wait() {
pthread_mutex_lock(&mutex_);
++count_;
if (count_ >= tripCount_) {
count_ = 0;
pthread_cond_broadcast(&cond_);
pthread_mutex_unlock(&mutex_);
return true;
} else {
pthread_cond_wait(&cond_, &mutex_);
pthread_mutex_unlock(&mutex_);
return false;
}
}
};
ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {}
ThreadBarrier::~ThreadBarrier() { delete m; }
void ThreadBarrier::wait() { m->wait(); }
} // namespace paddle
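arch/osx/Locks.cpp maps the same interfaces onto libdispatch semaphores, OSSpinLock, and a mutex-plus-condvar barrier (OS X has no pthread_barrier_t). Timed waits keep working because dispatch_walltime() interprets the caller's absolute timespec the same way sem_timedwait() does. A usage sketch, with the helper name waitUpTo being illustrative:

#include <sys/time.h>   // gettimeofday()
#include "paddle/utils/Locks.h"

// Illustrative only: wait at most `seconds` seconds for a post().
bool waitUpTo(paddle::Semaphore& sem, long seconds) {
  struct timeval now;
  gettimeofday(&now, nullptr);
  struct timespec deadline;                    // absolute wall-clock deadline
  deadline.tv_sec = now.tv_sec + seconds;
  deadline.tv_nsec = now.tv_usec * 1000;
  return sem.timeWait(&deadline);              // true if post() arrived in time
}

As a design note, Apple later deprecated OSSpinLock in favour of os_unfair_lock, so the spin-lock backend here would need revisiting on newer SDKs.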
...@@ -3,12 +3,15 @@ add_simple_unittest(test_Logging) ...@@ -3,12 +3,15 @@ add_simple_unittest(test_Logging)
add_simple_unittest(test_Thread) add_simple_unittest(test_Thread)
add_simple_unittest(test_StringUtils) add_simple_unittest(test_StringUtils)
add_simple_unittest(test_CustomStackTrace) add_simple_unittest(test_CustomStackTrace)
add_simple_unittest(test_ThreadBarrier)
add_executable( add_executable(
test_CustomStackTracePrint test_CustomStackTracePrint
test_CustomStackTracePrint.cpp test_CustomStackTracePrint.cpp
) )
link_paddle_exe(test_CustomStackTracePrint) link_paddle_exe(test_CustomStackTracePrint)
add_test(NAME test_CustomStackTracePrint if(NOT APPLE)
COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh add_test(NAME test_CustomStackTracePrint
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
...@@ -45,6 +45,7 @@ void testNormalImpl(const std::function<void( ...@@ -45,6 +45,7 @@ void testNormalImpl(const std::function<void(
size_t cntDown = countDown; size_t cntDown = countDown;
while (cntDown-- > 0) { while (cntDown-- > 0) {
startBarrier.wait(); startBarrier.wait();
sleep(1);
doneBarrier.wait(); doneBarrier.wait();
ASSERT_TRUE(tracer.empty()); ASSERT_TRUE(tracer.empty());
} }
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <set>
#include <vector>
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/Locks.h"
P_DEFINE_int32(test_thread_num, 100, "testing thread number");
void testNormalImpl(size_t thread_num,
const std::function<void(size_t,
std::mutex&, std::set<std::thread::id>&,
paddle::ThreadBarrier&)>& callback) {
std::mutex mutex;
std::set<std::thread::id> tids;
paddle::ThreadBarrier barrier(thread_num);
std::vector<std::thread> threads;
threads.reserve(thread_num);
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back([&thread_num, &mutex,
&tids, &barrier, &callback]{
callback(thread_num, mutex, tids, barrier);
});
}
for (auto& thread : threads) {
thread.join();
}
}
TEST(ThreadBarrier, normalTest) {
for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) {
testNormalImpl(thread_num,
[](size_t thread_num, std::mutex& mutex,
std::set<std::thread::id>& tids,
paddle::ThreadBarrier& barrier){
{
std::lock_guard<std::mutex> guard(mutex);
tids.insert(std::this_thread::get_id());
}
barrier.wait();
// Check whether all threads reach this point or not
CHECK_EQ(tids.size(), thread_num);
});
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
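The new test spins up as many as a thousand threads and checks that every one has registered its id before any thread gets past the barrier. A much smaller sketch of the same rendezvous pattern outside the gtest harness; the function rendezvousExample is made up:

#include <atomic>
#include <cassert>
#include <thread>
#include <vector>
#include "paddle/utils/Locks.h"

// Illustrative only: n workers rendezvous once; after wait() returns, every
// worker observes that all n arrivals have happened.
void rendezvousExample(int n) {
  paddle::ThreadBarrier barrier(n);
  std::atomic<int> arrived(0);
  std::vector<std::thread> workers;
  for (int i = 0; i < n; ++i) {
    workers.emplace_back([&] {
      ++arrived;
      barrier.wait();                  // blocks until all n threads arrive
      assert(arrived.load() == n);     // safe: nobody passes early
    });
  }
  for (auto& w : workers) w.join();
}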