Commit a131c73f authored by: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_grpc_server_ready_condition
...@@ -12,7 +12,7 @@ services: ...@@ -12,7 +12,7 @@ services:
os: os:
- linux - linux
env: env:
- JOB=build_doc - JOB=doc
- JOB=check_style - JOB=check_style
- JOB=build_android - JOB=build_android
addons: addons:
...@@ -36,21 +36,18 @@ addons: ...@@ -36,21 +36,18 @@ addons:
- ccache - ccache
ssh_known_hosts: 13.229.163.131 ssh_known_hosts: 13.229.163.131
before_install: before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
# protobuf version.
- sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
- | - |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script: script:
- | - |
# 43min timeout # 43min timeout
if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android; if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi; if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
- | - |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
......
...@@ -2,12 +2,14 @@ ...@@ -2,12 +2,14 @@
|---|---| |---|---|
| abhinavarora | Abhinav Arora | | abhinavarora | Abhinav Arora |
| backyes | Yan-Fei Wang | | backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi | | beckett1124 | Bin Qi |
| JiayiFeng | Jia-Yi Feng |
| chengxiaohua1105 | Xiao-Hua Cheng | | chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng | | cxysteven | Xing-Yi Cheng |
| dzhwinter | Zhi-Hong Dong | | dzhwinter | Zhi-Hong Dong |
| dragonwarrior | Long Wang |
| dyning | Yuning Du |
| emailweixu | Wei Xu | | emailweixu | Wei Xu |
| gangliao | Gang Liao | | gangliao | Gang Liao |
| gongweibao | Wei-Bao Gong | | gongweibao | Wei-Bao Gong |
...@@ -16,6 +18,9 @@ ...@@ -16,6 +18,9 @@
| hedaoyuan | Dao-Yuan He | | hedaoyuan | Dao-Yuan He |
| helinwang | He-Lin Wang | | helinwang | He-Lin Wang |
| jacquesqiao | Long-Fei Qiao | | jacquesqiao | Long-Fei Qiao |
| jczaja | Jacek Czaja |
| JiayiFeng | Jia-Yi Feng |
| kbinias | Krzysztof Binias |
| kuke | Yi-Bing Liu | | kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao | | lcy-seso | Ying Cao |
| lipeng-unisound | Peng Li | | lipeng-unisound | Peng Li |
...@@ -24,16 +29,20 @@ ...@@ -24,16 +29,20 @@
| llxxxll | Yong-Feng Liu | | llxxxll | Yong-Feng Liu |
| luotao01 | Tao Luo | | luotao01 | Tao Luo |
| lzhao4ever | Liang Zhao | | lzhao4ever | Liang Zhao |
| mozga-intel | Mateusz Ozga |
| NHZlX | Zhao-Long Xing | | NHZlX | Zhao-Long Xing |
| Noplz | Yuan Gao |
| pakchoi | Chuan-Jiang Song | | pakchoi | Chuan-Jiang Song |
| panyx0718 | Xin Pan | | panyx0718 | Xin Pan |
| pengli09 | Peng Li | | pengli09 | Peng Li |
| pkuyym | Ya-Ming Yang | | pkuyym | Ya-Ming Yang |
| pzelazko-intel | Pawel Zelazko |
| QiJune | Jun Qi | | QiJune | Jun Qi |
| qingqing01 | Qing-Qing Dang | | qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu | | reyoung | Yang Yu |
| Superjom | Chun-Wei Yan | | Superjom | Chun-Wei Yan |
| tianbingsz | Tian-Bing Xu | | tianbingsz | Tian-Bing Xu |
| tpatejko | Tomasz Patejko |
| typhoonzero | Yi Wu | | typhoonzero | Yi Wu |
| wanghaoshuang | Hao-Shuang Wang | | wanghaoshuang | Hao-Shuang Wang |
| wangyang59 | Yang Wang | | wangyang59 | Yang Wang |
......
# An image for building paddle binaries # An image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment # Use cuda devel base image for both cpu and gpu environment
# When you modify it, please be aware of cudnn-runtime version
# When you modify it, please be aware of cudnn-runtime version
# and libcudnn.so.x in paddle/scripts/docker/build.sh # and libcudnn.so.x in paddle/scripts/docker/build.sh
FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
...@@ -24,7 +23,7 @@ ENV HOME /root ...@@ -24,7 +23,7 @@ ENV HOME /root
COPY ./paddle/scripts/docker/root/ /root/ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y --allow-downgrades \
git python-pip python-dev openssh-server bison \ git python-pip python-dev openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
...@@ -33,7 +32,7 @@ RUN apt-get update && \ ...@@ -33,7 +32,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev \ liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \ net-tools libtool ccache && \
apt-get clean -y apt-get clean -y
# Install Go and glide # Install Go and glide
......
...@@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) ...@@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11") list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings # Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
......
...@@ -22,7 +22,9 @@ else() ...@@ -22,7 +22,9 @@ else()
extern_eigen3 extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 # eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
......
...@@ -38,8 +38,7 @@ ENDIF() ...@@ -38,8 +38,7 @@ ENDIF()
ExternalProject_Add( ExternalProject_Add(
extern_warpctc extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git"
GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09
PREFIX ${WARPCTC_SOURCES_DIR} PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
# Averaging Parameter in PaddlePaddle # Averaging Parameter in PaddlePaddle
## Why Averaging ## Why Averaging
In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can. In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
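As a brief illustration (the symbols below are ours, not the document's): writing $\theta_t$ for the parameter vector after processing batch $t$, the Polyak-Juditsky estimate is the running mean

$$
\bar{\theta}_T = \frac{1}{T}\sum_{t=1}^{T}\theta_t,
\qquad\text{updated incrementally as}\qquad
\bar{\theta}_t = \bar{\theta}_{t-1} + \frac{1}{t}\left(\theta_t - \bar{\theta}_{t-1}\right),
$$

so the average can be maintained with a single extra copy of the parameters rather than all $T$ snapshots.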
...@@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti ...@@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti
### How to perform Parameter Averaging in PaddlePaddle ### How to perform Parameter Averaging in PaddlePaddle
Parameter Averaging in PaddlePaddle works in the following way during training : Parameter Averaging in PaddlePaddle works in the following way during training :
1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer 1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
2. The optimizer itself is responsible for updating the parameters. 2. The optimizer itself is responsible for updating the parameters.
3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: 3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches. 1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
2. However, saving all the N instances of the parameters in memory is not feasible. 2. However, saving all N instances of the parameters in memory is not feasible.
3. Therefore, an approximation algorithm is used. 3. Therefore, an approximation algorithm is used.
Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
During the testing/ saving the model phase, we perform the following steps: During the testing/saving the model phase, we perform the following steps:
1. Perform the delayed operations. 1. Perform the delayed operations.
2. Save current values of the parameters to a temporary variable. 2. Save current values of the parameters to a temporary variable.
3. Replace the values of the parameters with the averaged values. 3. Replace the values of the parameters with the averaged values.
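To make the two-copy scheme above concrete, here is a minimal C++ sketch, assuming a parameter is just a flat `std::vector<float>`; the class name `ParameterAverager` and its exponential-window approximation are illustrative assumptions only, not the approximation algorithm PaddlePaddle actually implements.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Illustrative sketch only: keeps one extra copy of the parameters and
// maintains an approximate average over roughly the last `window` batches
// with an exponential moving average, so the N individual snapshots never
// have to be stored.
class ParameterAverager {
 public:
  explicit ParameterAverager(std::size_t window) : window_(window) {}

  // Called once per batch, after the real optimizer has updated `params`.
  void Accumulate(const std::vector<float>& params) {
    if (avg_.empty()) {
      avg_ = params;  // first batch: start the average at the current values
      return;
    }
    assert(avg_.size() == params.size());
    const float decay = 1.0f - 1.0f / static_cast<float>(window_);
    for (std::size_t i = 0; i < params.size(); ++i) {
      avg_[i] = decay * avg_[i] + (1.0f - decay) * params[i];
    }
  }

  // Read at test/save time instead of the raw parameters.
  const std::vector<float>& Averaged() const { return avg_; }

 private:
  std::size_t window_;
  std::vector<float> avg_;
};
```

During training the real optimizer keeps updating `params` and `Accumulate` is called once per batch; only `Averaged()` is read when testing or saving the model, mirroring the two-copy scheme described above.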
......
...@@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream; ...@@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream;
<< "CUDA error: " << hl_get_device_error_string((size_t)err); \ << "CUDA error: " << hl_get_device_error_string((size_t)err); \
} }
// __shfl has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T
__shfl_sync(unsigned, T val, int src_line, int width) {
return __shfl(val, src_line, width);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif /* __NVCC__ */ #endif /* __NVCC__ */
#endif /* HL_BASE_H_ */ #endif /* HL_BASE_H_ */
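To show how the compatibility shim above is meant to be used, here is a small hedged CUDA sketch of a warp-level all-reduce; the helper name `WarpAllReduceSum` and the full-warp assumption are ours, not part of the patch.

```cuda
// Hedged usage sketch (ours, not part of the patch): a warp-wide sum built
// only from the primitives defined above, so the same code compiles with
// CUDA 8 and CUDA 9+.  Assumes the definitions above (CREATE_SHFL_MASK and
// the pre-9.0 __shfl_sync fallback) are in scope and that every lane of the
// warp reaches this call, i.e. there is no divergence before it.
__device__ __forceinline__ float WarpAllReduceSum(float val) {
  const int lane = threadIdx.x % 32;
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, true);  // full-warp mask on CUDA 9+, ignored before
  for (int offset = 16; offset > 0; offset /= 2) {
    // Rotate the value `offset` lanes and accumulate; after log2(32) steps
    // every lane holds the sum over the whole warp.
    val += __shfl_sync(mask, val, (lane + offset) % 32, 32);
  }
  return val;
}
```

The mask only matters on CUDA 9+, where independent thread scheduling requires stating which lanes participate; on older toolkits the fallback above simply ignores it, which matches how the kernels in the rest of this patch use CREATE_SHFL_MASK.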
...@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue, ...@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
} }
__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
int addr = idx % 32; const int warp_size = 32;
int addr = idx % warp_size;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, addr < warp_size);
#pragma unroll #pragma unroll
for (int k = 1; k < 32; k++) { for (int k = 1; k < 32; k++) {
// rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32); // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
addr = __shfl(addr, (idx + 1) % 32, 32); addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
a[k] = __shfl(a[k], addr, 32); a[k] = __shfl_sync(mask, a[k], addr, 32);
} }
#pragma unroll #pragma unroll
...@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) { ...@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
} }
addr = (32 - idx) % 32; addr = (32 - idx) % 32;
CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
#pragma unroll #pragma unroll
for (int k = 0; k < 32; k++) { for (int k = 0; k < 32; k++) {
a[k] = __shfl(a[k], addr, 32); a[k] = __shfl_sync(mask, a[k], addr, 32);
addr = __shfl(addr, (idx + 31) % 32, 32); addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
} }
} }
......
...@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK, ...@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
if (--beamSize == 0) break; if (--beamSize == 0) break;
__syncthreads(); __syncthreads();
unsigned mask = 0u;
// CREATE_SHFL_MASK(mask, tid < len);
if (tid == maxId[0]) { if (tid == maxId[0]) {
if (beam < maxLength) { if (beam < maxLength) {
shTopK[tid] = topK[beam]; shTopK[tid] = topK[beam];
} }
} }
if (maxId[0] / 32 == warp) { if (maxId[0] / 32 == warp) {
if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break; if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
} }
} }
} }
......
...@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, bool skip_scale_loss, const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
platform::NCCLContextMap *nccl_ctxs) platform::NCCLContextMap *nccl_ctxs)
: loss_var_name_(loss_var_name), : loss_var_name_(loss_var_name),
places_(places), places_(places),
...@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, bool skip_scale_loss) const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
: loss_var_name_(loss_var_name), : loss_var_name_(loss_var_name),
places_(places), places_(places),
local_scopes_(local_scopes) { local_scopes_(local_scopes) {
...@@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
for (auto &p : params) { for (auto &p : params) {
grad_names_.insert(GradVarName(p)); grad_names_.insert(GradVarName(p));
} }
skip_scale_loss_ = skip_scale_loss; use_default_grad_scale_ = use_default_grad_scale;
} }
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
...@@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} else if (IsDistTrainOp(*op, send_op)) { } else if (IsDistTrainOp(*op, send_op)) {
CreateComputationalOps(&result, *op, 1); CreateComputationalOps(&result, *op, 1);
} else if (IsScaleLossOp(*op)) { } else if (IsScaleLossOp(*op)) {
// user can customize loss@grad if skip_scale_loss_ // user can customize loss@grad if not use_default_grad_scale_
if (!skip_scale_loss_) { if (use_default_grad_scale_) {
CreateScaleLossGradOp(&result); CreateScaleLossGradOp(&result);
} }
is_forwarding = false; is_forwarding = false;
......
...@@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &loss_var_name, const std::string &loss_var_name,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
bool skip_scale_loss); bool use_default_grad_scale);
#endif #endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override; std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
...@@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_; platform::NCCLContextMap *nccl_ctxs_;
#endif #endif
bool skip_scale_loss_; bool use_default_grad_scale_;
bool IsScaleLossOp(const OpDesc &op) const; bool IsScaleLossOp(const OpDesc &op) const;
......
...@@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() { ...@@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream(); ->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream); platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(1) << place_ << "RUN Scale loss grad op";
}); });
#endif #endif
} }
......
...@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay, Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
bool customize_scale_loss) bool use_default_grad_scale)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
...@@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor( ...@@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
details::MultiDevSSAGraphBuilder builder( details::MultiDevSSAGraphBuilder builder(
member_->places_, loss_var_name, params, member_->local_scopes_, member_->places_, loss_var_name, params, member_->local_scopes_,
customize_scale_loss, member_->nccl_ctxs_.get()); use_default_grad_scale, member_->nccl_ctxs_.get());
#else #else
details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
params, member_->local_scopes_, params, member_->local_scopes_,
customize_scale_loss); use_default_grad_scale);
#endif #endif
auto graph = builder.Build(main_program); auto graph = builder.Build(main_program);
......
...@@ -40,7 +40,7 @@ class ParallelExecutor { ...@@ -40,7 +40,7 @@ class ParallelExecutor {
const ProgramDesc& main_program, const ProgramDesc& main_program,
const std::string& loss_var_name, Scope* scope, const std::string& loss_var_name, Scope* scope,
const std::vector<Scope*>& local_scopes, const std::vector<Scope*>& local_scopes,
bool allow_op_delay, bool customize_scale_loss); bool allow_op_delay, bool use_default_grad_scale);
~ParallelExecutor(); ~ParallelExecutor();
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include <thrust/reduce.h> #include <thrust/reduce.h>
#include "paddle/fluid/operators/accuracy_op.h" #include "paddle/fluid/operators/accuracy_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/adagrad_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -195,10 +195,9 @@ std::string ItemToString(const BeamSearch::Item &item) { ...@@ -195,10 +195,9 @@ std::string ItemToString(const BeamSearch::Item &item) {
return stream.str(); return stream.str();
} }
class BeamSearchProtoAndCheckerMaker class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
: public framework::OpProtoAndCheckerMaker {
public: public:
BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) BeamSearchOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
// inputs and outputs stored in proto // inputs and outputs stored in proto
AddInput("pre_ids", "ids in previous step"); AddInput("pre_ids", "ids in previous step");
...@@ -222,20 +221,32 @@ class BeamSearchProtoAndCheckerMaker ...@@ -222,20 +221,32 @@ class BeamSearchProtoAndCheckerMaker
} }
}; };
class BeamSearchInferShape : public framework::InferShapeBase { class BeamSearchOp : public framework::OperatorWithKernel {
public: public:
void operator()(framework::InferShapeContext *context) const override { using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
for (const std::string &arg : for (const std::string &arg :
std::vector<std::string>({"pre_ids", "ids", "scores"})) { std::vector<std::string>({"pre_ids", "ids", "scores"})) {
PADDLE_ENFORCE(context->HasInput(arg), PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
"BeamSearch need input argument '%s'", arg); arg);
} }
for (const std::string &arg : for (const std::string &arg :
std::vector<std::string>({"selected_ids", "selected_scores"})) { std::vector<std::string>({"selected_ids", "selected_scores"})) {
PADDLE_ENFORCE(context->HasOutput(arg), PADDLE_ENFORCE(ctx->HasOutput(arg),
"BeamSearch need output argument '%s'", arg); "BeamSearch need output argument '%s'", arg);
} }
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("pre_ids")->type()),
platform::CPUPlace());
return kt;
}
}; };
class BeamSearchInferVarType : public framework::VarTypeInference { class BeamSearchInferVarType : public framework::VarTypeInference {
...@@ -254,8 +265,13 @@ class BeamSearchInferVarType : public framework::VarTypeInference { ...@@ -254,8 +265,13 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, namespace ops = paddle::operators;
paddle::operators::BeamSearchProtoAndCheckerMaker,
paddle::operators::BeamSearchInferShape, REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker,
paddle::operators::BeamSearchInferVarType, ops::BeamSearchInferVarType);
paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(
beam_search,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -192,49 +192,29 @@ std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); ...@@ -192,49 +192,29 @@ std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item); std::string ItemToString(const BeamSearch::Item& item);
class BeamSearchOp : public framework::OperatorBase { template <typename DeviceContext, typename T>
class BeamSearchOpKernel : public framework::OpKernel<T> {
public: public:
BeamSearchOp(const std::string& type, void Compute(const framework::ExecutionContext& context) const override {
const framework::VariableNameMap& inputs, auto* ids_var = context.Input<framework::LoDTensor>("ids");
const framework::VariableNameMap& outputs, auto* scores_var = context.Input<framework::LoDTensor>("scores");
const framework::AttributeMap& attrs) auto* pre_ids_var = context.Input<framework::LoDTensor>("pre_ids");
: OperatorBase(type, inputs, outputs, attrs) {}
BeamSearchOp(const BeamSearchOp& o)
: framework::OperatorBase(
static_cast<const framework::OperatorBase&>(o)) {
PADDLE_THROW("Not Implemented");
}
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto ids_var = scope.FindVar(Input("ids"));
auto scores_var = scope.FindVar(Input("scores"));
auto pre_ids_var = scope.FindVar(Input("pre_ids"));
PADDLE_ENFORCE_NOT_NULL(ids_var); PADDLE_ENFORCE_NOT_NULL(ids_var);
PADDLE_ENFORCE_NOT_NULL(scores_var); PADDLE_ENFORCE_NOT_NULL(scores_var);
PADDLE_ENFORCE_NOT_NULL(pre_ids_var); PADDLE_ENFORCE_NOT_NULL(pre_ids_var);
auto& ids = ids_var->Get<framework::LoDTensor>(); size_t level = context.Attr<int>("level");
auto& scores = scores_var->Get<framework::LoDTensor>(); size_t beam_size = context.Attr<int>("beam_size");
auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>(); int end_id = context.Attr<int>("end_id");
size_t level = Attr<int>("level"); BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id);
size_t beam_size = Attr<int>("beam_size"); auto selected_ids_var =
int end_id = Attr<int>("end_id"); context.Output<framework::LoDTensor>("selected_ids");
BeamSearch alg(ids, scores, level, beam_size, end_id); auto selected_scores_var =
context.Output<framework::LoDTensor>("selected_scores");
auto selected_ids_var = scope.FindVar(Output("selected_ids"));
auto selected_scores_var = scope.FindVar(Output("selected_scores"));
PADDLE_ENFORCE_NOT_NULL(selected_ids_var); PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
PADDLE_ENFORCE_NOT_NULL(selected_scores_var); PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
auto& selected_ids_tensor = alg(*pre_ids_var, selected_ids_var, selected_scores_var);
*selected_ids_var->GetMutable<framework::LoDTensor>();
auto& selected_scores_tensor =
*selected_scores_var->GetMutable<framework::LoDTensor>();
alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/bilinear_interp_op.h" #include "paddle/fluid/operators/bilinear_interp_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/box_coder_op.h" #include "paddle/fluid/operators/box_coder_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel<T> { ...@@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
paddle::operators::math::ConcatGradFunctor<DeviceContext, T> paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
concat_grad_functor; concat_grad_functor;
concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs); concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
} }
} }
}; };
......
...@@ -20,6 +20,11 @@ limitations under the License. */ ...@@ -20,6 +20,11 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
DEFINE_bool(cudnn_algo_use_autotune, true,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"false, the algorithm is deterministic.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
if (input_grad) { if (input_grad) {
PADDLE_ENFORCE( if (FLAGS_cudnn_algo_use_autotune) {
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( PADDLE_ENFORCE(
handle, cudnn_filter_desc, platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
// dyDesc: Handle to the previously initialized input differential handle, cudnn_filter_desc,
// tensor descriptor. // dyDesc: Handle to the previously initialized input
cudnn_output_grad_desc, cudnn_conv_desc, // differential
// dxDesc: Handle to the previously initialized output tensor // tensor descriptor.
// descriptor. cudnn_output_grad_desc, cudnn_conv_desc,
cudnn_input_desc, // dxDesc: Handle to the previously initialized output tensor
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, // descriptor.
workspace_size_limit, &data_algo)); cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc, handle, cudnn_filter_desc, cudnn_output_grad_desc,
...@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} }
if (filter_grad) { if (filter_grad) {
PADDLE_ENFORCE( if (FLAGS_cudnn_algo_use_autotune) {
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( PADDLE_ENFORCE(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
cudnn_filter_desc, handle, cudnn_input_desc, cudnn_output_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, cudnn_conv_desc, cudnn_filter_desc,
workspace_size_limit, &filter_algo)); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_shift_op.h" #include "paddle/fluid/operators/conv_shift_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/edit_distance_op.h" #include "paddle/fluid/operators/edit_distance_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#ifdef __NVCC__ #ifdef __NVCC__
#include <cuda.h> #include <cuda.h>
#include <thrust/iterator/iterator_adaptor.h> #include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_primitives.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif #endif
...@@ -333,24 +334,12 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, ...@@ -333,24 +334,12 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
} }
} }
} }
#ifdef __NVCC__
// __shfl_down has been deprecated as of CUDA 9.0. #ifdef __NVCC__
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T> template <typename T>
__device__ T reduceSum(T val, int tid, int len) { __device__ T reduceSum(T val, int tid, int len) {
// TODO(zcd): The warp size should be taken from the // NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply. // parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently, // To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size // I use Warp-Level Parallelism and assume the Warp size
...@@ -362,7 +351,7 @@ __device__ T reduceSum(T val, int tid, int len) { ...@@ -362,7 +351,7 @@ __device__ T reduceSum(T val, int tid, int len) {
CREATE_SHFL_MASK(mask, tid < len); CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2) for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset); val += platform::__shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0; if (tid < warpSize) shm[tid] = 0;
...@@ -378,7 +367,7 @@ __device__ T reduceSum(T val, int tid, int len) { ...@@ -378,7 +367,7 @@ __device__ T reduceSum(T val, int tid, int len) {
if (tid < warpSize) { if (tid < warpSize) {
val = shm[tid]; val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2) for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset); val += platform::__shfl_down_sync(mask, val, offset);
} }
return val; return val;
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/operators/lookup_table_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -70,20 +71,20 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -70,20 +71,20 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) { std::vector<framework::Tensor>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs.size(); int num = outputs->size();
int input_rows = 1; int input_rows = 1;
auto dim_0 = outputs[0].dims(); auto dim_0 = outputs->at(0).dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i]; input_rows *= dim_0[i];
} }
int input_cols = 0; int input_cols = 0;
std::vector<int64_t> output_cols(outputs.size()); std::vector<int64_t> output_cols(outputs->size());
for (int i = 0; i < num; ++i) { for (int i = 0; i < num; ++i) {
int t_cols = outputs[i].numel() / input_rows; int t_cols = outputs->at(i).numel() / input_rows;
input_cols += t_cols; input_cols += t_cols;
output_cols[i] = t_cols; output_cols[i] = t_cols;
} }
...@@ -95,7 +96,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -95,7 +96,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (int j = 0; j < num; ++j) {
int col_len = output_cols[j]; int col_len = output_cols[j];
T* dst_ptr = outputs[j].data<T>() + k * col_len; T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len); sizeof(T) * col_len);
col_idx += col_len; col_idx += col_len;
......
...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -202,16 +204,16 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -202,16 +204,16 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) { std::vector<framework::Tensor>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int o_num = outputs.size(); int o_num = outputs->size();
int out_row = 1; int out_row = 1;
auto dim_0 = outputs[0].dims(); auto dim_0 = outputs->at(0).dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
out_row *= dim_0[i]; out_row *= dim_0[i];
} }
int out_col = outputs[0].numel() / out_row; int out_col = outputs->at(0).numel() / out_row;
int in_col = 0, in_row = out_row; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
...@@ -221,13 +223,13 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -221,13 +223,13 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
outputs_cols[0] = 0; outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) { for (int i = 0; i < o_num; ++i) {
int t_col = outputs[i].numel() / out_row; int t_col = outputs->at(i).numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != out_col) sameShape = false; if (t_col != out_col) sameShape = false;
} }
in_col += t_col; in_col += t_col;
outputs_cols[i + 1] = in_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs[i].data<T>(); outputs_ptr[i] = outputs->at(i).data<T>();
} }
T** dev_out_gpu_data = T** dev_out_gpu_data =
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
...@@ -56,7 +57,7 @@ template <typename DeviceContext, typename T> ...@@ -56,7 +57,7 @@ template <typename DeviceContext, typename T>
class ConcatGradFunctor { class ConcatGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const int axis, std::vector<framework::Tensor>& outputs); const int axis, std::vector<framework::Tensor>* outputs);
}; };
} // namespace math } // namespace math
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/cos_sim_functor.h" #include "paddle/fluid/operators/math/cos_sim_functor.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -31,11 +32,11 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, ...@@ -31,11 +32,11 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
template <typename T> template <typename T>
__device__ __forceinline__ T sum_single_warp(T val) { __device__ __forceinline__ T sum_single_warp(T val) {
val += __shfl_down(val, 16); val += platform::__shfl_down_sync(0, val, 16);
val += __shfl_down(val, 8); val += platform::__shfl_down_sync(0, val, 8);
val += __shfl_down(val, 4); val += platform::__shfl_down_sync(0, val, 4);
val += __shfl_down(val, 2); val += platform::__shfl_down_sync(0, val, 2);
val += __shfl_down(val, 1); val += platform::__shfl_down_sync(0, val, 1);
return val; return val;
} }
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -89,14 +89,14 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -89,14 +89,14 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
__m256 r_value_reset_gate; __m256 r_value_reset_gate;
__m256 r_value_reset_output; __m256 r_value_reset_output;
__m256 r_prev_out = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f);
__m256 *update_gate = (__m256 *)gate_value; __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
__m256 *reset_gate = (__m256 *)(gate_value + frame_size); __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size);
for (int i = 0; i < frame_size / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
r_value_update_gate = update_gate[i]; r_value_update_gate = update_gate[i];
r_value_reset_gate = reset_gate[i]; r_value_reset_gate = reset_gate[i];
if (prev_output_value) { if (prev_output_value) {
r_prev_out = ((__m256 *)prev_output_value)[i]; r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
} }
op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
...@@ -104,7 +104,7 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -104,7 +104,7 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
update_gate[i] = r_value_update_gate; update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate; reset_gate[i] = r_value_reset_gate;
((__m256 *)reset_output_value)[i] = r_value_reset_output; (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output;
} }
#endif #endif
} }
...@@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
__m256 r_value_frame_state; __m256 r_value_frame_state;
__m256 r_prev_out = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f);
__m256 r_output; __m256 r_output;
__m256 *update_gate = (__m256 *)gate_value; __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
__m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
for (int i = 0; i < frame_size / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
r_value_update_gate = update_gate[i]; r_value_update_gate = update_gate[i];
r_value_frame_state = frame_state[i]; r_value_frame_state = frame_state[i];
if (prev_output_value) { if (prev_output_value) {
r_prev_out = ((__m256 *)prev_output_value)[i]; r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
} }
op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
r_output, active_node); r_output, active_node);
frame_state[i] = r_value_frame_state; frame_state[i] = r_value_frame_state;
((__m256 *)output_value)[i] = r_output; (reinterpret_cast<__m256 *>(output_value))[i] = r_output;
} }
#endif #endif
} }
...@@ -284,20 +284,22 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, ...@@ -284,20 +284,22 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
__m256 r_out_grad; __m256 r_out_grad;
__m256 r_prev_out_value = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
__m256 r_prev_out_grad = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
__m256 *update_gate_value = (__m256 *)gate_value; __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
__m256 *update_gate_grad = (__m256 *)gate_grad; __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
__m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); __m256 *frame_state_value =
__m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
__m256 *frame_state_grad =
reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);
for (int i = 0; i < frame_size / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
r_update_gate_value = update_gate_value[i]; r_update_gate_value = update_gate_value[i];
r_frame_state_value = frame_state_value[i]; r_frame_state_value = frame_state_value[i];
r_out_grad = ((__m256 *)output_grad)[i]; r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
if (prev_out_value) { if (prev_out_value) {
r_prev_out_value = ((__m256 *)prev_out_value)[i]; r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
} }
if (prev_out_grad) { if (prev_out_grad) {
r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
} }
op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
...@@ -307,7 +309,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, ...@@ -307,7 +309,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
update_gate_grad[i] = r_update_gate_grad; update_gate_grad[i] = r_update_gate_grad;
frame_state_grad[i] = r_frame_state_grad; frame_state_grad[i] = r_frame_state_grad;
if (prev_out_grad) { if (prev_out_grad) {
((__m256 *)prev_out_grad)[i] = r_prev_out_grad; (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
} }
} }
#endif #endif
...@@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
__m256 r_reset_output_grad = _mm256_set1_ps(0.0f); __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
__m256 r_prev_out_value = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
__m256 r_prev_out_grad = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
__m256 *update_gate_value = (__m256 *)gate_value; __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
__m256 *update_gate_grad = (__m256 *)gate_grad; __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
__m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); __m256 *reset_gate_value =
__m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); reinterpret_cast<__m256 *>(gate_value + frame_size);
__m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
for (int i = 0; i < frame_size / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
r_update_gate_value = update_gate_value[i]; r_update_gate_value = update_gate_value[i];
...@@ -338,13 +341,13 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -338,13 +341,13 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
r_reset_gate_value = reset_gate_value[i]; r_reset_gate_value = reset_gate_value[i];
if (prev_out_value && prev_out_grad) { if (prev_out_value && prev_out_grad) {
r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
} }
if (prev_out_value) { if (prev_out_value) {
r_prev_out_value = ((__m256 *)prev_out_value)[i]; r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
} }
if (prev_out_grad) { if (prev_out_grad) {
r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
} }
op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
...@@ -354,7 +357,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -354,7 +357,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
update_gate_grad[i] = r_update_gate_grad; update_gate_grad[i] = r_update_gate_grad;
reset_gate_grad[i] = r_reset_gate_grad; reset_gate_grad[i] = r_reset_gate_grad;
if (prev_out_grad) { if (prev_out_grad) {
((__m256 *)prev_out_grad)[i] = r_prev_out_grad; (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
} }
} }
#endif #endif
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <type_traits> #include <type_traits>
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
......
...@@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
__m256 r_state_atv; __m256 r_state_atv;
__m256 r_out; __m256 r_out;
__m256 *value_in = (__m256 *)value.gate_value; __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
__m256 *value_ig = (__m256 *)(value.gate_value + frame_size); __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
__m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); __m256 *value_fg =
__m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
__m256 *value_og =
reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
for (int i = 0; i < frame_size / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
r_value_in = value_in[i]; r_value_in = value_in[i];
...@@ -175,13 +177,13 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -175,13 +177,13 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
r_value_fg = value_fg[i]; r_value_fg = value_fg[i];
r_value_og = value_og[i]; r_value_og = value_og[i];
if (value.check_ig) { if (value.check_ig) {
r_checkI = ((__m256 *)value.check_ig)[i]; r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
r_checkF = ((__m256 *)value.check_fg)[i]; r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
r_checkO = ((__m256 *)value.check_og)[i]; r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
} }
if (value.prev_state_value) { if (value.prev_state_value) {
r_prev_state = ((__m256 *)value.prev_state_value)[i]; r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
...@@ -192,9 +194,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -192,9 +194,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
value_ig[i] = r_value_ig; value_ig[i] = r_value_ig;
value_fg[i] = r_value_fg; value_fg[i] = r_value_fg;
value_og[i] = r_value_og; value_og[i] = r_value_og;
((__m256 *)value.state_value)[i] = r_state; (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
((__m256 *)value.state_active_value)[i] = r_state_atv; (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
((__m256 *)value.output_value)[i] = r_out; (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
} }
#endif #endif
} }
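
The cast changes above only swap C-style casts for reinterpret_cast; the underlying AVX access pattern is unchanged: a float buffer is viewed as packed __m256 lanes and processed eight elements per iteration. A minimal standalone sketch of that pattern follows (function and buffer names are illustrative, not part of the patch):

#include <immintrin.h>

// Sketch: scale a gate buffer in place, eight floats at a time.
// Assumes frame_size is a multiple of 8 and 32-byte aligned data,
// the same assumptions the loops above make.
inline void ScaleGateBuffer(float *gate_value, int frame_size, float factor) {
  __m256 *value = reinterpret_cast<__m256 *>(gate_value);
  const __m256 r_factor = _mm256_set1_ps(factor);
  for (int i = 0; i < frame_size / 8; i++) {
    value[i] = _mm256_mul_ps(value[i], r_factor);
  }
}
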
...@@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
__m256 r_checkFGrad; __m256 r_checkFGrad;
__m256 r_checkOGrad; __m256 r_checkOGrad;
__m256 *value_in = (__m256 *)value.gate_value; __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
__m256 *value_ig = (__m256 *)(value.gate_value + frame_size); __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
__m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); __m256 *value_fg =
__m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
__m256 *grad_in = (__m256 *)grad.gate_grad; __m256 *value_og =
__m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
__m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
__m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
__m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
__m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
for (int i = 0; i < frame_size / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
r_value_in = value_in[i]; r_value_in = value_in[i];
...@@ -242,16 +246,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -242,16 +246,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
r_value_fg = value_fg[i]; r_value_fg = value_fg[i];
r_value_og = value_og[i]; r_value_og = value_og[i];
if (value.check_ig) { if (value.check_ig) {
r_checkI = ((__m256 *)value.check_ig)[i]; r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
r_checkF = ((__m256 *)value.check_fg)[i]; r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
r_checkO = ((__m256 *)value.check_og)[i]; r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
} }
r_state = ((__m256 *)value.state_value)[i]; r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
r_state_atv = ((__m256 *)value.state_active_value)[i]; r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
r_output_grad = ((__m256 *)grad.output_grad)[i]; r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
r_state_grad = ((__m256 *)grad.state_grad)[i]; r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
if (value.prev_state_value) { if (value.prev_state_value) {
r_prev_state = ((__m256 *)value.prev_state_value)[i]; r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
} }
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
...@@ -264,15 +268,18 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value, ...@@ -264,15 +268,18 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
grad_ig[i] = r_grad_ig; grad_ig[i] = r_grad_ig;
grad_fg[i] = r_grad_fg; grad_fg[i] = r_grad_fg;
grad_og[i] = r_grad_og; grad_og[i] = r_grad_og;
((__m256 *)grad.state_grad)[i] = r_state_grad; (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
if (grad.prev_state_grad) if (grad.prev_state_grad)
((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
if (value.prev_state_value) { if (value.prev_state_value) {
if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; if (grad.check_ig_grad)
if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
if (grad.check_fg_grad)
(reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
} }
if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; if (grad.check_og_grad)
(reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
} }
#endif #endif
} }
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -23,32 +23,29 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, ...@@ -23,32 +23,29 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
} }
TEST(math_function, notrans_mul_trans_fp32) { TEST(math_function, notrans_mul_trans_fp32) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
Tensor input1; paddle::platform::CPUPlace cpu_place;
Tensor input1_gpu; paddle::platform::CUDAPlace gpu_place(0);
Tensor input2_gpu; paddle::platform::CUDADeviceContext context(gpu_place);
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, gpu_place); out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
TensorCopySync(out_gpu, cpu_place, &out); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -59,39 +56,38 @@ TEST(math_function, notrans_mul_trans_fp32) { ...@@ -59,39 +56,38 @@ TEST(math_function, notrans_mul_trans_fp32) {
} }
TEST(math_function, notrans_mul_trans_fp16) { TEST(math_function, notrans_mul_trans_fp16) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
Tensor input1; paddle::framework::Tensor out_gpu;
Tensor input1_gpu; paddle::framework::Tensor out;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
return; return;
} }
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); paddle::platform::float16* input1_ptr =
input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float16>({2, 2}, gpu_place); out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float16>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, paddle::platform::float16>(
float16(0)); context, input1_gpu, false, input2_gpu, true,
paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
TensorCopySync(out_gpu, cpu_place, &out); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
float16* out_ptr = out.data<float16>(); paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
context.Wait(); context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 5); EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 14); EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
...@@ -100,32 +96,29 @@ TEST(math_function, notrans_mul_trans_fp16) { ...@@ -100,32 +96,29 @@ TEST(math_function, notrans_mul_trans_fp16) {
} }
TEST(math_function, trans_mul_notrans_fp32) { TEST(math_function, trans_mul_notrans_fp32) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
Tensor input1; paddle::platform::CPUPlace cpu_place;
Tensor input1_gpu; paddle::platform::CUDAPlace gpu_place(0);
Tensor input2_gpu; paddle::platform::CUDADeviceContext context(gpu_place);
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, gpu_place); out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
TensorCopySync(out_gpu, cpu_place, &out); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -141,39 +134,38 @@ TEST(math_function, trans_mul_notrans_fp32) { ...@@ -141,39 +134,38 @@ TEST(math_function, trans_mul_notrans_fp32) {
} }
TEST(math_function, trans_mul_notrans_fp16) { TEST(math_function, trans_mul_notrans_fp16) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
Tensor input1; paddle::framework::Tensor out_gpu;
Tensor input1_gpu; paddle::framework::Tensor out;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
return; return;
} }
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); paddle::platform::float16* input1_ptr =
input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float16>({3, 3}, gpu_place); out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, paddle::platform::float16>(
float16(0)); context, input1_gpu, true, input2_gpu, false,
paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
TensorCopySync(out_gpu, cpu_place, &out); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
float16* out_ptr = out.data<float16>(); paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
context.Wait(); context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 9); EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 12); EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
...@@ -187,19 +179,16 @@ TEST(math_function, trans_mul_notrans_fp16) { ...@@ -187,19 +179,16 @@ TEST(math_function, trans_mul_notrans_fp16) {
} }
TEST(math_function, gemm_notrans_cublas_fp32) { TEST(math_function, gemm_notrans_cublas_fp32) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
Tensor input1; paddle::platform::CPUPlace cpu_place;
Tensor input2; paddle::platform::CUDAPlace gpu_place(0);
Tensor input3; paddle::platform::CUDADeviceContext context(gpu_place);
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2; int m = 2;
int n = 3; int n = 3;
...@@ -214,9 +203,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) { ...@@ -214,9 +203,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
...@@ -224,7 +213,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) { ...@@ -224,7 +213,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -244,19 +233,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) { ...@@ -244,19 +233,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
} }
TEST(math_function, gemm_notrans_cublas_fp16) { TEST(math_function, gemm_notrans_cublas_fp16) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
Tensor input1; paddle::framework::Tensor input1_gpu;
Tensor input2; paddle::framework::Tensor input2_gpu;
Tensor input3; paddle::framework::Tensor input3_gpu;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -266,26 +252,31 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -266,26 +252,31 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); paddle::platform::float16* input1_ptr =
input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place); paddle::platform::float16* input2_ptr =
input2.mutable_data<paddle::platform::float16>({3, 4}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(), fill_fp16_data(input2_ptr, input2.numel(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place); paddle::platform::float16* input3_ptr =
input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float16* a = input1_gpu.data<float16>(); paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
float16* b = input2_gpu.data<float16>(); paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place); paddle::platform::float16* c =
input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), paddle::platform::float16>(
c + 1, 4); context, false, false, m, n, k, paddle::platform::float16(1), a, 3, b + 1,
4, paddle::platform::float16(1), c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
} }
TEST(math_function, gemm_trans_cublas_fp32) { TEST(math_function, gemm_trans_cublas_fp32) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
Tensor input1; paddle::framework::Tensor input1_gpu;
Tensor input2; paddle::framework::Tensor input2_gpu;
Tensor input3; paddle::framework::Tensor input3_gpu;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
int m = 2; int m = 2;
int n = 3; int n = 3;
...@@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
...@@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
context.Wait(); context.Wait();
EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[0], 0);
...@@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) {
} }
TEST(math_function, gemm_trans_cublas_fp16) { TEST(math_function, gemm_trans_cublas_fp16) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
Tensor input1; paddle::platform::CPUPlace cpu_place;
Tensor input2; paddle::platform::CUDAPlace gpu_place(0);
Tensor input3; paddle::platform::CUDADeviceContext context(gpu_place);
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) {
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); paddle::platform::float16* input1_ptr =
input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place); paddle::platform::float16* input2_ptr =
input2.mutable_data<paddle::platform::float16>({4, 3}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(), fill_fp16_data(input2_ptr, input2.numel(),
{0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place); paddle::platform::float16* input3_ptr =
input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float16* a = input1_gpu.data<float16>(); paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
float16* b = input2_gpu.data<float16>(); paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place); paddle::platform::float16* c =
input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), paddle::platform::float16>(
c + 1, 4); context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3,
3, paddle::platform::float16(1), c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
context.Wait(); context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0); EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
...@@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) {
template <typename T> template <typename T>
void GemvTest(int m, int n, bool trans) { void GemvTest(int m, int n, bool trans) {
using namespace paddle::framework; paddle::framework::Tensor mat_a;
using namespace paddle::platform; paddle::framework::Tensor vec_b;
paddle::framework::Tensor vec_c;
Tensor mat_a;
Tensor vec_b;
Tensor vec_c;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place); T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place); T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place); T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
Tensor g_mat_a; paddle::framework::Tensor g_mat_a;
Tensor g_vec_b; paddle::framework::Tensor g_vec_b;
Tensor g_vec_c; paddle::framework::Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place); T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place); T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place); T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
...@@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) { ...@@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i); data_b[i] = static_cast<T>(i);
} }
TensorCopySync(mat_a, gpu_place, &g_mat_a); paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
TensorCopySync(vec_b, gpu_place, &g_vec_b); paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
paddle::operators::math::gemv<CUDADeviceContext, T>( paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a, context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c); g_data_b, 0., g_data_c);
TensorCopySync(g_vec_c, cpu_place, &vec_c); paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
if (!trans) { if (!trans) {
for (int i = 0; i < m; ++i) { for (int i = 0; i < m; ++i) {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/operators/math/maxouting.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add) {
using namespace paddle::framework; paddle::platform::CUDAPlace gpu_place(0);
using namespace paddle::platform; paddle::platform::CPUPlace cpu_place;
using namespace paddle::operators::math; paddle::platform::CUDADeviceContext ctx(gpu_place);
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
CUDAPlace gpu_place(0); float>
CPUPlace cpu_place; functor;
CUDADeviceContext ctx(gpu_place);
SetConstant<CUDADeviceContext, float> functor;
int64_t height = 10; int64_t height = 10;
int64_t row_numel = 10; int64_t row_numel = 10;
std::vector<int64_t> rows1{0, 4, 7}; std::vector<int64_t> rows1{0, 4, 7};
std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)}; std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
new paddle::framework::SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value(); auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>( in1_value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place); paddle::framework::make_ddim(
{static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place);
functor(ctx, in1_value, 1.0); functor(ctx, in1_value, 1.0);
std::vector<int64_t> rows2{0, 5, 7, 9}; std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)}; std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
new paddle::framework::SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value(); auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>( in2_value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place); paddle::framework::make_ddim(
{static_cast<int64_t>(rows2.size()), row_numel}),
gpu_place);
functor(ctx, in2_value, 2.0); functor(ctx, in2_value, 2.0);
std::unique_ptr<SelectedRows> output{new SelectedRows()}; std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
auto* out_value = output->mutable_value(); auto* out_value = output->mutable_value();
// simplely concat two SelectedRows // simply concat two SelectedRows
out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place); out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
gpu_place);
SelectedRowsAdd<CUDADeviceContext, float> add_functor; paddle::operators::math::SelectedRowsAdd<paddle::platform::CUDADeviceContext,
float>
add_functor;
add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
auto out_height = output->height(); auto out_height = output->height();
...@@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) {
EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[5], 7);
EXPECT_EQ(out_rows[6], 9); EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu; paddle::framework::Tensor out_cpu;
TensorCopy(*out_value, cpu_place, ctx, &out_cpu); paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait(); ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>(); auto* out_cpu_data = out_cpu.data<float>();
...@@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) {
EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
std::unique_ptr<Tensor> tensor1{new Tensor()}; std::unique_ptr<paddle::framework::Tensor> tensor1{
tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place); new paddle::framework::Tensor()};
tensor1->mutable_data<float>(
paddle::framework::make_ddim({height, row_numel}), gpu_place);
functor(ctx, tensor1.get(), 3.0); functor(ctx, tensor1.get(), 3.0);
std::unique_ptr<Tensor> tensor2{new Tensor()}; std::unique_ptr<paddle::framework::Tensor> tensor2{
tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place); new paddle::framework::Tensor()};
tensor2->mutable_data<float>(
paddle::framework::make_ddim({height, row_numel}), gpu_place);
SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor; paddle::operators::math::SelectedRowsAddTensor<
paddle::platform::CUDADeviceContext, float>
add_tensor_functor;
add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
Tensor tensor2_cpu; paddle::framework::Tensor tensor2_cpu;
TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu); paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
ctx.Wait(); ctx.Wait();
auto* tensor2_cpu_data = tensor2_cpu.data<float>(); auto* tensor2_cpu_data = tensor2_cpu.data<float>();
...@@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) {
} }
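
For the inputs used above, SelectedRowsAdd simply stacks the two operands: rows1 = {0, 4, 7} filled with 1.0 followed by rows2 = {0, 5, 7, 9} filled with 2.0, so the output has 7 rows in input order, which is exactly what the EXPECT_EQ checks verify. A small host-side sketch of that expectation (helper name is illustrative):

#include <cstdint>
#include <vector>

// Sketch: the expected row-index layout after SelectedRowsAdd; the two
// row lists are concatenated without merging duplicate indices.
inline std::vector<int64_t> ConcatRows(const std::vector<int64_t> &rows1,
                                       const std::vector<int64_t> &rows2) {
  std::vector<int64_t> out(rows1);
  out.insert(out.end(), rows2.begin(), rows2.end());
  return out;  // {0, 4, 7, 0, 5, 7, 9} for the test data above
}
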
TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_add_to) {
using namespace paddle::framework; paddle::platform::CUDAPlace gpu_place(0);
using namespace paddle::platform; paddle::platform::CPUPlace cpu_place;
using namespace paddle::operators::math; paddle::platform::CUDADeviceContext ctx(gpu_place);
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
CUDAPlace gpu_place(0); float>
CPUPlace cpu_place; functor;
CUDADeviceContext ctx(gpu_place);
SetConstant<CUDADeviceContext, float> functor;
int64_t height = 10; int64_t height = 10;
int64_t row_numel = 10; int64_t row_numel = 10;
std::vector<int64_t> rows1{0, 4, 7}; std::vector<int64_t> rows1{0, 4, 7};
std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)}; std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
new paddle::framework::SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value(); auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>( in1_value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place); paddle::framework::make_ddim(
{static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place);
functor(ctx, in1_value, 1.0); functor(ctx, in1_value, 1.0);
std::vector<int64_t> rows2{0, 5, 7, 9}; std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)}; std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
new paddle::framework::SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value(); auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>( in2_value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place); paddle::framework::make_ddim(
{static_cast<int64_t>(rows2.size()), row_numel}),
gpu_place);
functor(ctx, in2_value, 2.0); functor(ctx, in2_value, 2.0);
std::unique_ptr<SelectedRows> output{new SelectedRows()}; std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
output->set_height(height); output->set_height(height);
auto* out_value = output->mutable_value(); auto* out_value = output->mutable_value();
// simplely concat two SelectedRows // simply concat two SelectedRows
out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place); out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
gpu_place);
SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor; paddle::operators::math::SelectedRowsAddTo<
paddle::platform::CUDADeviceContext, float>
add_to_functor;
add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows1, 0, output.get());
add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
...@@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) { ...@@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) {
EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[5], 7);
EXPECT_EQ(out_rows[6], 9); EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu; paddle::framework::Tensor out_cpu;
TensorCopy(*out_value, cpu_place, ctx, &out_cpu); paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait(); ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>(); auto* out_cpu_data = out_cpu.data<float>();
...@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) { ...@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) {
EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
std::unique_ptr<Tensor> tensor1{new Tensor()}; std::unique_ptr<paddle::framework::Tensor> tensor1{
tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place); new paddle::framework::Tensor()};
tensor1->mutable_data<float>(
paddle::framework::make_ddim({height, row_numel}), gpu_place);
functor(ctx, tensor1.get(), 3.0); functor(ctx, tensor1.get(), 3.0);
SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor; paddle::operators::math::SelectedRowsAddToTensor<
paddle::platform::CUDADeviceContext, float>
add_to_tensor_functor;
add_to_tensor_functor(ctx, *output, tensor1.get()); add_to_tensor_functor(ctx, *output, tensor1.get());
Tensor tensor1_cpu; paddle::framework::Tensor tensor1_cpu;
TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu); paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
ctx.Wait(); ctx.Wait();
auto* tensor1_cpu_data = tensor1_cpu.data<float>(); auto* tensor1_cpu_data = tensor1_cpu.data<float>();
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/operators/math/sequence_pooling.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sequence_scale.h" #include "paddle/fluid/operators/math/sequence_scale.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/unpooling.h" #include "paddle/fluid/operators/math/unpooling.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/one_hot_op.h" #include "paddle/fluid/operators/one_hot_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/operators/roi_pool_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/operators/row_conv_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -220,7 +220,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, ...@@ -220,7 +220,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
for (int offset = 16; offset > 0; for (int offset = 16; offset > 0;
offset = offset / 2) { // blockDim.x is 32. offset = offset / 2) { // blockDim.x is 32.
val += __shfl_down(val, offset); val += platform::__shfl_down_sync(0, val, offset);
} }
__syncthreads(); __syncthreads();
...@@ -276,7 +276,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, ...@@ -276,7 +276,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
for (int offset = 16; offset > 0; for (int offset = 16; offset > 0;
offset = offset / 2) { // blockDim.x is 32. offset = offset / 2) { // blockDim.x is 32.
val += __shfl_down(val, offset); val += platform::__shfl_down_sync(0, val, offset);
} }
__syncthreads(); __syncthreads();
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
USE_NO_KERNEL_OP(save); USE_NO_KERNEL_OP(save);
USE_NO_KERNEL_OP(load); USE_NO_KERNEL_OP(load);
...@@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) { ...@@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) {
} }
} }
} }
TEST(SaveLoadFP16Op, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({3, 10});
float* expect = tensor->mutable_data<float>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<float>(paddle::platform::float16(i));
}
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("tensor.save")});
attrs.insert({"save_as_fp16", true});
auto save_op = paddle::framework::OpRegistry::CreateOp(
"save", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, place);
auto load_var = scope.Var("out_var");
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp(
"load", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, place);
paddle::platform::float16* actual = target->data<paddle::platform::float16>();
for (int64_t i = 0; i < tensor->numel(); ++i) {
EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
}
}
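
The exact EXPECT_EQ above is safe because every value the test writes, 0 through 29, is a small integer, and small integers convert to float16 and back without rounding. A minimal sketch of that property, relying only on the float16 header the test already includes:

#include <cassert>
#include <cstdint>
#include "paddle/fluid/platform/float16.h"

// Sketch: integers far below 2048 round-trip through float16 exactly,
// so comparing the saved-as-fp16 result with == is well defined here.
inline void CheckFp16ExactForSmallInts() {
  for (int64_t i = 0; i < 30; ++i) {
    float roundtrip = static_cast<float>(paddle::platform::float16(i));
    assert(roundtrip == static_cast<float>(i));
  }
}
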
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <numeric> #include <numeric>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase { ...@@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase {
const platform::Place &place) const override { const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite"); auto overwrite = Attr<bool>("overwrite");
auto save_as_fp16 = Attr<bool>("save_as_fp16");
if (FileExists(filename) && !overwrite) { if (FileExists(filename) && !overwrite) {
PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
...@@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase { ...@@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::SerializeToStream(fout, tensor, dev_ctx); auto in_dtype = framework::ToDataType(tensor.type());
auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
framework::SerializeToStream(fout, out, dev_ctx);
} else {
framework::SerializeToStream(fout, tensor, dev_ctx);
}
} }
}; };
...@@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk. ...@@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk.
"(boolean, default true)" "(boolean, default true)"
"Overwrite the output file if exist") "Overwrite the output file if exist")
.SetDefault(true); .SetDefault(true);
AddAttr<bool>("save_as_fp16",
"(boolean, default false)"
"If true, the tensor will be converted to float16 data "
"type and then saved. Otherwise, the tensor will be "
"directly saved without data type conversion.")
.SetDefault(false);
AddAttr<std::string>("file_path", AddAttr<std::string>("file_path",
"(string)" "(string)"
"The \"file_path\" where the variable will be saved.") "The \"file_path\" where the variable will be saved.")
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_erase_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/operators/sgd_op.h"
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -66,5 +66,18 @@ CUDA_ATOMIC_WRAPPER(Add, double) { ...@@ -66,5 +66,18 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
} }
#endif #endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
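
Together with the RowConv changes above, this shim keeps warp-level reductions building on both pre- and post-9.0 CUDA toolkits: before 9.0 the mask argument is simply ignored, while CREATE_SHFL_MASK expands to __ballot_sync on newer toolkits. A minimal sketch of such a reduction in the style of the kernels above (the function name is illustrative, not part of the patch):

namespace paddle {
namespace operators {

// Sketch: sum `val` across a 32-lane warp via the compatibility shim,
// mirroring the reduction loop in RowConvGradFilter above.
__device__ __forceinline__ float WarpReduceSum(float val) {
  for (int offset = 16; offset > 0; offset = offset / 2) {  // blockDim.x is 32
    val += platform::__shfl_down_sync(0, val, offset);
  }
  return val;  // lane 0 ends up holding the full warp sum
}

}  // namespace operators
}  // namespace paddle
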
...@@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle.
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, std::vector<Scope *> &local_scopes, Scope *scope, std::vector<Scope *> &local_scopes,
bool allow_op_delay, bool customize_loss_grad) { bool allow_op_delay, bool use_default_grad_scale) {
new (&self) ParallelExecutor(num_threads, use_event, places, new (&self) ParallelExecutor(
params, bcast_vars, main_program, num_threads, use_event, places, params, bcast_vars,
loss_var_name, scope, local_scopes, main_program, loss_var_name, scope, local_scopes,
allow_op_delay, customize_loss_grad); allow_op_delay, use_default_grad_scale);
}) })
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
......
...@@ -155,7 +155,7 @@ EOF ...@@ -155,7 +155,7 @@ EOF
function gen_dockerfile() { function gen_dockerfile() {
# Set BASE_IMAGE according to env variables # Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04" BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
else else
BASE_IMAGE="ubuntu:16.04" BASE_IMAGE="ubuntu:16.04"
fi fi
......
...@@ -208,8 +208,8 @@ EOF ...@@ -208,8 +208,8 @@ EOF
--platform=android-$ANDROID_API \ --platform=android-$ANDROID_API \
--install-dir=$ANDROID_STANDALONE_TOOLCHAIN --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
BUILD_ROOT=${PADDLE_ROOT}/build BUILD_ROOT=${PADDLE_ROOT}/build_android
DEST_ROOT={PADDLE_ROOT}/install DEST_ROOT=${PADDLE_ROOT}/install_android
mkdir -p $BUILD_ROOT mkdir -p $BUILD_ROOT
cd $BUILD_ROOT cd $BUILD_ROOT
...@@ -349,13 +349,18 @@ function gen_docs() { ...@@ -349,13 +349,18 @@ function gen_docs() {
======================================== ========================================
EOF EOF
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=ON \ -DWITH_DOC=ON \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \ -DWITH_MKL=OFF \
-DWITH_SWIG_PY=ON \
-DWITH_STYLE_CHECK=OFF -DWITH_STYLE_CHECK=OFF
make -j `nproc` paddle_docs paddle_apis make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
} }
function gen_html() { function gen_html() {
......
...@@ -28,11 +28,16 @@ function start_build_docker() { ...@@ -28,11 +28,16 @@ function start_build_docker() {
docker rm -f "${CONTAINER_ID}" 1>/dev/null docker rm -f "${CONTAINER_ID}" 1>/dev/null
fi fi
apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
DOCKER_ENV=$(cat <<EOL DOCKER_ENV=$(cat <<EOL
-e FLAGS_fraction_of_gpu_memory_to_use=0.15 \ -e FLAGS_fraction_of_gpu_memory_to_use=0.15 \
-e CTEST_OUTPUT_ON_FAILURE=1 \ -e CTEST_OUTPUT_ON_FAILURE=1 \
-e CTEST_PARALLEL_LEVEL=5 \ -e CTEST_PARALLEL_LEVEL=5 \
-e APT_MIRROR=${apt_mirror} \
-e WITH_GPU=ON \ -e WITH_GPU=ON \
-e CUDA_ARCH_NAME=Auto \
-e WITH_AVX=ON \
-e WITH_GOLANG=OFF \
-e WITH_TESTING=ON \ -e WITH_TESTING=ON \
-e WITH_C_API=OFF \ -e WITH_C_API=OFF \
-e WITH_COVERAGE=ON \ -e WITH_COVERAGE=ON \
...@@ -42,18 +47,23 @@ function start_build_docker() { ...@@ -42,18 +47,23 @@ function start_build_docker() {
-e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \ -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
-e CUDA_VISIBLE_DEVICES=0,1 \ -e CUDA_VISIBLE_DEVICES=0,1 \
-e WITH_DISTRIBUTE=ON \ -e WITH_DISTRIBUTE=ON \
-e WITH_FLUID_ONLY=ON \
-e RUN_TEST=ON -e RUN_TEST=ON
EOL EOL
) )
DOCKER_CMD="nvidia-docker"
if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then
DOCKER_CMD="docker"
fi
set -x set -x
nvidia-docker run -it \ ${DOCKER_CMD} run -it \
-d \
--name $CONTAINER_ID \ --name $CONTAINER_ID \
${DOCKER_ENV} \ ${DOCKER_ENV} \
-v $PADDLE_ROOT:/paddle \ -v $PADDLE_ROOT:/paddle \
-w /paddle \ -w /paddle \
$IMG \ $IMG \
/bin/bash paddle/scripts/paddle_build.sh $@
set +x set +x
} }
...@@ -67,24 +77,7 @@ function main() { ...@@ -67,24 +77,7 @@ function main() {
VERSION="latest-dev-android" VERSION="latest-dev-android"
fi fi
IMG=${DOCKER_REPO}:${VERSION} IMG=${DOCKER_REPO}:${VERSION}
start_build_docker $@
case $1 in
start)
start_build_docker
;;
build_android)
start_build_docker
docker exec ${CONTAINER_ID} bash -c "./paddle/scripts/paddle_build.sh $@"
;;
*)
if container_running "${CONTAINER_ID}"; then
docker exec ${CONTAINER_ID} bash -c "./paddle/scripts/paddle_build.sh $@"
else
echo "Please start container first, with command:"
echo "$0 start"
fi
;;
esac
} }
main $@ main $@
...@@ -20,6 +20,16 @@ from framework import * ...@@ -20,6 +20,16 @@ from framework import *
import executor import executor
from executor import * from executor import *
import trainer
from trainer import Trainer
from trainer import Event
import inferencer
from inferencer import Inferencer
import params
from params import Params
import io import io
import evaluator import evaluator
import initializer import initializer
...@@ -47,7 +57,8 @@ from parallel_executor import ParallelExecutor ...@@ -47,7 +57,8 @@ from parallel_executor import ParallelExecutor
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
trainer.__all__ + inferencer.__all__ + params.__all__ + [
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
...@@ -111,7 +122,9 @@ def __bootstrap__(): ...@@ -111,7 +122,9 @@ def __bootstrap__():
'eager_delete_scope' 'eager_delete_scope'
] ]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use'] read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune'
]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
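Because __bootstrap__ passes the flag list through --tryfromenv, the new cudnn_algo_use_autotune flag can be switched on from the environment before fluid is imported; a small sketch (the value shown is an assumption):

# Sketch: toggling the new cuDNN autotune flag through the environment,
# which __bootstrap__ picks up via --tryfromenv on import.
import os
os.environ['FLAGS_cudnn_algo_use_autotune'] = '1'  # assumed boolean-style value

import paddle.fluid as fluid  # the flag is consumed during this import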
...@@ -658,10 +658,10 @@ class Operator(object): ...@@ -658,10 +658,10 @@ class Operator(object):
class Block(object): class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
self.desc = program.desc.block(idx) self.desc = program.desc.block(idx)
self.vars = dict() # var_name --> var self.vars = collections.OrderedDict() # var_name --> var
self.ops = list() # operator list self.ops = list() # operator list
self.program = program self.program = program
self.removed_vars = dict() self.removed_vars = collections.OrderedDict()
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
......
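Switching vars and removed_vars from dict() to collections.OrderedDict() makes iteration over a block's variables follow insertion order, so serialized programs come out the same on every run; a standalone illustration of the property being relied on:

# Standalone illustration: OrderedDict iterates in insertion order, which is
# the guarantee Block now depends on (plain Python 2 dicts offer none).
import collections

block_vars = collections.OrderedDict()
for name in ['image', 'label', 'fc_0.w_0']:
    block_vars[name] = object()  # stand-in for a Variable

print(list(block_vars.keys()))  # always ['image', 'label', 'fc_0.w_0']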
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['Inferencer', ]
class Inferencer(object):
def __init__(self, network_func, params, place=None):
# 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in test_word2vec.py
# 2. move the default_main_program to self.program.
# 3. run the default_startup program.
self.params = params
self.place = place
def infer(self, inputs):
# run self.program
pass
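One way the stubs above could be filled in, following the numbered comments; this is only a sketch, assuming Params exposes its scope and the network is built under fluid.program_guard:

# Hypothetical sketch of the steps listed in the comments above; not the
# final implementation.
import paddle.fluid as fluid

class InferencerSketch(object):
    def __init__(self, network_func, params, place=None):
        self.params = params
        self.place = place or fluid.CPUPlace()
        self.program = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(self.program, startup):
            self.predict = network_func()               # 1. build the network
        self.exe = fluid.Executor(self.place)
        self.exe.run(startup, scope=self.params.scope)  # 3. run the startup program

    def infer(self, inputs):
        # run self.program against the parameter scope
        return self.exe.run(self.program,
                            feed=inputs,
                            fetch_list=[self.predict],
                            scope=self.params.scope)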
...@@ -30,7 +30,7 @@ class ParallelExecutor(object): ...@@ -30,7 +30,7 @@ class ParallelExecutor(object):
num_threads=None, num_threads=None,
allow_op_delay=False, allow_op_delay=False,
share_vars_from=None, share_vars_from=None,
customize_loss_grad=False): use_default_grad_scale=True):
""" """
ParallelExecutor can run program in parallel. ParallelExecutor can run program in parallel.
...@@ -46,6 +46,11 @@ class ParallelExecutor(object): ...@@ -46,6 +46,11 @@ class ParallelExecutor(object):
improve performance in some cases, default False. improve performance in some cases, default False.
share_vars_from(ParallelExecutor, default None): If provided, share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor. it will share variables from the specified ParallelExecutor.
use_default_grad_scale(bool, default True): If set True, a default
scale value equal to `1./device_count` is multiplied into the
gradients of each device, and the scaled gradients are then
aggregated. Otherwise, a customized scale value should be fed
to the network.
Returns: Returns:
A ParallelExecutor object. A ParallelExecutor object.
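In other words, the default behavior averages the per-device gradients; a tiny numeric illustration of the `1./device_count` scaling described above (plain NumPy, independent of the executor):

# Numeric illustration of the default gradient scaling: scale each device's
# gradient by 1./device_count, then aggregate.
import numpy as np

device_grads = [np.array([4.0, 8.0]), np.array([2.0, 0.0])]  # one gradient per device
scale = 1.0 / len(device_grads)                              # 1./device_count
aggregated = sum(g * scale for g in device_grads)
print(aggregated)  # [3. 4.], i.e. the average of the per-device gradients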
...@@ -124,7 +129,7 @@ class ParallelExecutor(object): ...@@ -124,7 +129,7 @@ class ParallelExecutor(object):
scope, scope,
local_scopes, local_scopes,
allow_op_delay, allow_op_delay,
customize_loss_grad) use_default_grad_scale)
self.scope = scope self.scope = scope
def run(self, fetch_list, feed=None, feed_dict=None): def run(self, fetch_list, feed=None, feed_dict=None):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import core
__all__ = ['Params', ]
class Params(object):
def __init__(self, path=None):
self.scope = core.Scope()
if path:
self._load(path)
def _load(self, path):
# reference: load_persistables in io.py
pass
def save(self, path):
# reference: save_persistables in io.py
pass
def add_params(self, scope):
# take the keys from the scope,
# if not already exists in self.scope,
# add the key and value into self.scope.
pass
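A possible fleshing-out of the stubs, reusing the io helpers named in the comments; sketch only, and it assumes fluid.scope_guard is available to direct loading and saving into self.scope, plus an executor and, in practice, the relevant main program:

# Hypothetical sketch built on load_persistables / save_persistables from io.py.
import paddle.fluid as fluid

class ParamsSketch(object):
    def __init__(self, path=None):
        self.scope = fluid.core.Scope()
        self.exe = fluid.Executor(fluid.CPUPlace())
        if path:
            self._load(path)

    def _load(self, path):
        # In practice the program whose persistables are being loaded would be
        # passed as main_program; it is elided here.
        with fluid.scope_guard(self.scope):
            fluid.io.load_persistables(self.exe, path)

    def save(self, path):
        with fluid.scope_guard(self.scope):
            fluid.io.save_persistables(self.exe, path)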
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
def resnet_cifar10(input, depth=32):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
def shortcut(input, ch_in, ch_out, stride):
if ch_in != ch_out:
return conv_bn_layer(input, ch_out, 1, stride, 0, None)
else:
return input
def basicblock(input, ch_in, ch_out, stride):
tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
short = shortcut(input, ch_in, ch_out, stride)
return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
def layer_warp(block_func, input, ch_in, ch_out, count, stride):
tmp = block_func(input, ch_in, ch_out, stride)
for i in range(1, count):
tmp = block_func(tmp, ch_out, ch_out, 1)
return tmp
assert (depth - 2) % 6 == 0
n = (depth - 2) / 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
return pool
def inference_network():
classdim = 10
data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net = resnet_cifar10(images, 32)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
return predict
def train_network():
predict = inference_network()
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
return avg_cost, accuracy
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost, accuracy = trainer.test(reader=test_reader)
print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
event.batch_id + 1, avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
trainer.params.save(save_path)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_network,
optimizer=fluid.optimizer.Adam(learning_rate=0.001),
place=place,
event_handler=event_handler)
trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
def infer(use_cuda, save_path):
params = fluid.Params(save_path)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(inference_network, params, place=place)
# The input of conv should be 4-D or 5-D.
# Use normalized image pixels as input data, which should be in the range
# [0, 1.0].
tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
results = inferencer.infer({'pixel': tensor_img})
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_resnet.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
def inference_network():
classdim = 10
data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
return predict
def train_network():
predict = inference_network()
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
return avg_cost, accuracy
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost, accuracy = trainer.test(reader=test_reader)
print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
event.batch_id + 1, avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
trainer.params.save(save_path)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_network,
optimizer=fluid.optimizer.Adam(learning_rate=0.001),
place=place,
event_handler=event_handler)
trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
def infer(use_cuda, save_path):
params = fluid.Params(save_path)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(inference_network, params, place=place)
# The input of conv should be 4-D or 5-D.
# Use normalized image pixels as input data, which should be in the range
# [0, 1.0].
tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
results = inferencer.infer({'pixel': tensor_img})
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_vgg.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
...@@ -275,10 +275,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -275,10 +275,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
class TestBatchNormOpTraining(unittest.TestCase): class TestBatchNormOpTraining(unittest.TestCase):
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
if not np.allclose(np.array(tensor), np_array, atol=atol): np.allclose(np.array(tensor), np_array, atol=atol)
import pdb
pdb.set_trace()
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, data_layout, shape): def test_with_place(place, data_layout, shape):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
'Event',
'Trainer',
]
class Event(object):
BEGIN_EPOCH = 0
END_EPOCH = 1
BEGIN_STEP = 2
END_STEP = 3
def __init__(self):
self.step = 0
self.epoch = 0
self.type = Event.BEGIN_EPOCH
class Trainer(object):
def __init__(self, network_func, optimizer, params=None, place=None):
# 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in
# test_word2vec.py
# 2. move the default_main_program to self.program and run the
# default_startup program on an empty core.Scope()
# 3. call self.params.add_vars with the initialized scope, it
# will add the new vars of the initialized scope into
# self.params.
self.network_func = network_func
self.optimizer = optimizer
self.params = params
self.place = place
# TODO(helin): support distributed training
def train(self, reader, num_epochs, event_handler):
pass
def test(self, reader):
pass
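The Event constants above suggest the control flow train() is meant to have; a minimal sketch of the event-dispatching loop, with the actual forward/backward pass elided (everything beyond the declared Event fields is an assumption):

# Hypothetical event-dispatching loop implied by the Event constants above;
# the forward/backward/update work itself is elided.
def train_sketch(self, reader, num_epochs, event_handler):
    event = Event()
    for epoch in range(num_epochs):
        event.epoch = epoch
        event.type = Event.BEGIN_EPOCH
        event_handler(event)
        for step, data in enumerate(reader()):
            event.step = step
            event.type = Event.BEGIN_STEP
            event_handler(event)
            # ... run one training iteration on `data` here ...
            event.type = Event.END_STEP
            event_handler(event)
        event.type = Event.END_EPOCH
        event_handler(event)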
...@@ -8,3 +8,4 @@ scipy>=0.19.0 ...@@ -8,3 +8,4 @@ scipy>=0.19.0
Pillow Pillow
nltk>=3.2.2 nltk>=3.2.2
graphviz graphviz
LinkChecker