Commit 9a4c1a39 authored by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_stable_test_of_cross_entropy
@@ -12,7 +12,7 @@ services:
 os:
   - linux
 env:
-  - JOB=build_doc
+  - JOB=doc
   - JOB=check_style
   - JOB=build_android
 addons:
@@ -36,21 +36,18 @@ addons:
       - ccache
   ssh_known_hosts: 13.229.163.131
 before_install:
-  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
-  # protobuf version.
  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
    # 43min timeout
-   if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
-   else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
-   RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
+   if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
+   if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
  - |
-   if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+   if [[ "$JOB" != "doc" ]]; then exit 0; fi;
+   # For document only
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
    if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
......
@@ -2,12 +2,14 @@
 |---|---|
 | abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
+| baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
-| JiayiFeng | Jia-Yi Feng |
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
 | dzhwinter | Zhi-Hong Dong |
+| dragonwarrior | Long Wang |
+| dyning | Yuning Du |
 | emailweixu | Wei Xu |
 | gangliao | Gang Liao |
 | gongweibao | Wei-Bao Gong |
@@ -16,6 +18,9 @@
 | hedaoyuan | Dao-Yuan He |
 | helinwang | He-Lin Wang |
 | jacquesqiao | Long-Fei Qiao |
+| jczaja | Jacek Czaja |
+| JiayiFeng | Jia-Yi Feng |
+| kbinias | Krzysztof Binias |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |
@@ -24,15 +29,20 @@
 | llxxxll | Yong-Feng Liu |
 | luotao01 | Tao Luo |
 | lzhao4ever | Liang Zhao |
+| mozga-intel | Mateusz Ozga |
 | NHZlX | Zhao-Long Xing |
+| Noplz | Yuan Gao |
 | pakchoi | Chuan-Jiang Song |
+| panyx0718 | Xin Pan |
 | pengli09 | Peng Li |
 | pkuyym | Ya-Ming Yang |
+| pzelazko-intel | Pawel Zelazko |
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
+| tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
......
# An image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment
# When you modify it, please be aware of cudnn-runtime version
# and libcudnn.so.x in paddle/scripts/docker/build.sh
FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@@ -24,7 +23,7 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
-    apt-get install -y \
+    apt-get install -y --allow-downgrades \
     git python-pip python-dev openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -33,7 +32,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
     apt-get clean -y
# Install Go and glide
......
@@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# In CUDA 9, suppress CUDA warnings triggered by Eigen
+list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
......
@@ -22,7 +22,9 @@ else()
     extern_eigen3
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
-    GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
+    # eigen on cuda9.1 is missing the header math_functions.hpp
+    # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
+    GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
     PREFIX ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
......
@@ -38,8 +38,7 @@ ENDIF()
 ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git"
-    GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09
+    GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git"
     PREFIX ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
# Averaging Parameter in PaddlePaddle
## Why Averaging
-In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.

 Polyak and Juditsky (1992) showed that the test performance of the simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
@@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti
 ### How to perform Parameter Averaging in PaddlePaddle

 Parameter Averaging in PaddlePaddle works in the following way during training:
-1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
+1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
 2. The optimizer itself is responsible for updating the parameters.
 3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
-    1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
-    2. However, saving all the N instances of the parameters in memory is not feasible.
+    1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all N instances of the parameters in memory is not feasible.
     3. Therefore, an approximation algorithm is used.

 Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.

-During the testing/ saving the model phase, we perform the following steps:
+During the testing/saving the model phase, we perform the following steps:
 1. Perform the delayed operations.
 2. Save current values of the parameters to a temporary variable.
 3. Replace the values of the parameters with the averaged values.
......
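To make the two-copy scheme above concrete, here is a minimal host-side sketch (hypothetical, not PaddlePaddle's implementation; as a simplification it keeps an incremental mean over all updates rather than only the most recent N, which likewise avoids storing N copies):

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> param = {1.0f, 2.0f};    // the optimizer's copy
  std::vector<float> avg(param.size(), 0.f);  // the ParameterAverageOptimizer's copy
  int num_updates = 0;

  for (int batch = 0; batch < 100; ++batch) {
    // Stand-in for the real optimizer update (e.g. SGD or RMSProp).
    for (auto& p : param) p -= 0.01f;

    // Incremental mean: no need to keep the last N parameter snapshots.
    ++num_updates;
    for (size_t i = 0; i < param.size(); ++i)
      avg[i] += (param[i] - avg[i]) / num_updates;
  }

  // Testing/saving phase: swap in (use) the averaged values.
  std::printf("param[0]=%f avg[0]=%f\n", param[0], avg[0]);
  return 0;
}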
@@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream;
     << "CUDA error: " << hl_get_device_error_string((size_t)err); \
   }

+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
 #endif /* __NVCC__ */

 #endif /* HL_BASE_H_ */
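For context, a minimal standalone sketch (hypothetical kernel, not part of this commit) of how the CREATE_SHFL_MASK / __shfl_sync shim above is meant to be used: all lanes of the warp first vote on who participates, and subsequent shuffles are restricted to that mask:

#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line,
                                         int width) {
  return __shfl(val, src_line, width);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif

__global__ void broadcast_lane0(const int* in, int* out, int n) {
  int tid = threadIdx.x;
  unsigned mask = 0u;
  // All 32 lanes vote; the mask records exactly the lanes with tid < n.
  CREATE_SHFL_MASK(mask, tid < n);
  if (tid < n) {
    // Every participating lane reads lane 0's value.
    out[tid] = __shfl_sync(mask, in[tid], 0, 32);
  }
}

int main() {
  const int n = 8;
  int h_in[n], h_out[n];
  for (int i = 0; i < n; ++i) h_in[i] = i + 100;
  int *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(int));
  cudaMalloc(&d_out, n * sizeof(int));
  cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);
  broadcast_lane0<<<1, 32>>>(d_in, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) std::printf("%d ", h_out[i]);  // prints 100 x 8
  std::printf("\n");
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}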
@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
 }

 __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  int addr = idx % 32;
+  const int warp_size = 32;
+  int addr = idx % warp_size;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, addr < warp_size);
 #pragma unroll
   for (int k = 1; k < 32; k++) {
-    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl(a[k], addr, 32);
+    // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
+    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
   }

 #pragma unroll
@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
   }

   addr = (32 - idx) % 32;
+  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
 #pragma unroll
   for (int k = 0; k < 32; k++) {
-    a[k] = __shfl(a[k], addr, 32);
-    addr = __shfl(addr, (idx + 31) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
   }
 }
......
@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
     if (--beamSize == 0) break;
     __syncthreads();

+    unsigned mask = 0u;
+    // CREATE_SHFL_MASK(mask, tid < len);
+
     if (tid == maxId[0]) {
       if (beam < maxLength) {
         shTopK[tid] = topK[beam];
       }
     }
     if (maxId[0] / 32 == warp) {
-      if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break;
+      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
     }
   }
 }
......
@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
     platform::NCCLContextMap *nccl_ctxs)
     : loss_var_name_(loss_var_name),
       places_(places),
@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes) {
@@ -53,28 +53,25 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  skip_scale_loss_ = skip_scale_loss;
+  use_default_grad_scale_ = use_default_grad_scale;
 }

 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
                                                 const OpDesc &op,
-                                                const platform::Place &p,
-                                                const size_t &i) const {
+                                                size_t place_id) const {
+  auto p = places_[place_id];
   auto *op_handle = result->ops_.back().get();
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));

-  auto var_names = op.InputArgumentNames();
-
-  for (auto &each_var_name : var_names) {
-    VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
+  for (auto &each_var_name : op.InputArgumentNames()) {
+    VarHandle *var =
+        CreateOrGetLatestVarHandle(result, each_var_name, p, place_id);
     op_handle->AddInput(var);
   }

-  var_names = op.OutputArgumentNames();
-
-  for (auto &each_var_name : var_names) {
-    CreateOpOutput(result, op_handle, each_var_name, p, i);
+  for (auto &each_var_name : op.OutputArgumentNames()) {
+    CreateOpOutput(result, op_handle, each_var_name, p, place_id);
   }
 }
@@ -84,17 +81,18 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
     return false;
   }

-  auto checker = [&](const std::vector<std::string> opvars,
-                     const std::vector<std::string> sendvars) -> bool {
-    bool is_dist_train_op = false;
+  /**
+   * Check whether any var in opvars contains ".block" and appears in sendvars.
+   */
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &sendvars) -> bool {
     for (auto &var : opvars) {
       if (var.find(".block") != std::string::npos &&
           std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) {
-        is_dist_train_op = true;
-        break;
+        return true;
       }
     }
-    return is_dist_train_op;
+    return false;
   };

   if (op.Type() == "split") {
@@ -117,13 +115,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       places_.size());

   // Find the "send" op first, since "split" ops come before it.
-  OpDesc *send_op = nullptr;
-  for (auto *op : program.Block(0).AllOps()) {
-    if (op->Type() == "send") {
-      send_op = op;
-      break;
-    }
-  }
+  OpDesc *send_op = GetSendOpDesc(program);

   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
@@ -134,7 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     } else if (IsDistTrainOp(*op, send_op)) {
       CreateComputationalOps(&result, *op, 1);
     } else if (IsScaleLossOp(*op)) {
-      if (!skip_scale_loss_) {
+      // The user can customize loss@grad when use_default_grad_scale_ is false.
+      if (use_default_grad_scale_) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;
@@ -142,10 +135,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       CreateComputationalOps(&result, *op, places_.size());
       if (!is_forwarding) {
         // Currently, we assume that once a gradient is generated, it can be
-        // broadcast, and each gradient is only broadcast once. But there are no
-        // other cases, for example, we need to adjust the gradient according to
-        // the input when we get the gradient, which is not considered at
-        // present.
+        // broadcast, and each gradient is only broadcast once.
         for (auto &og : op->OutputArgumentNames()) {
           if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
             InsertNCCLAllReduceOp(&result, og);
@@ -175,6 +165,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   return std::unique_ptr<SSAGraph>(graph);
 }

+OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
+    const ProgramDesc &program) const {
+  for (auto *op : program.Block(0).AllOps()) {
+    if (op->Type() == "send") {
+      return op;
+    }
+  }
+  return nullptr;
+}
+
 void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
     SSAGraph *result, const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
@@ -243,7 +243,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
     result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
-    CreateOpHandleIOs(result, op, p, scope_idx);
+    CreateOpHandleIOs(result, op, scope_idx);
   }
 }
@@ -255,7 +255,7 @@ void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
   result->ops_.emplace_back(new SendOpHandle(op, s, p));
   // Create inputs for output on original place and no ssa output
   // is created for send op.
-  CreateOpHandleIOs(result, op, p, 0);
+  CreateOpHandleIOs(result, op, 0);
 }

 bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
......
@@ -41,14 +41,14 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool skip_scale_loss);
+                          bool use_default_grad_scale);
 #endif

   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;

  private:
   void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
-                         const platform::Place &p, const size_t &i) const;
+                         size_t place_id) const;

  private:
   std::string loss_var_name_;
@@ -59,12 +59,15 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool skip_scale_loss_;
+  bool use_default_grad_scale_;

   bool IsScaleLossOp(const OpDesc &op) const;

   void CreateSendOp(SSAGraph *result, const OpDesc &op) const;

+  /**
+   * Is this operator the end-point operator before/after the send operator?
+   */
   bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const;

   void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
@@ -77,6 +80,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       std::unordered_set<std::string> *og_has_been_broadcast) const;

   void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;

+  /**
+   * Get the send op in the global block of the program;
+   * returns nullptr if not found.
+   */
+  OpDesc *GetSendOpDesc(const ProgramDesc &program) const;
 };
 }  // namespace details
 }  // namespace framework
......
@@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() {
               ->stream();
       memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                    platform::CPUPlace(), &coeff_, sizeof(float), stream);
+      VLOG(1) << place_ << "RUN Scale loss grad op";
     });
 #endif
 }
......
@@ -25,12 +25,22 @@ namespace paddle {
 namespace framework {
 namespace details {

+// An SSA graph used by the parallel executor.
 struct SSAGraph {
+  // All variables on each device.
+  // The outer vector is indexed by device. Each element is a map from a
+  // variable name to that variable's versions: variables sharing a name get
+  // a new version on each write, and the offset in the
+  // `std::vector<std::unique_ptr<VarHandle>>` is the version of the variable.
   std::vector<
       std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
       vars_;
+
   // aux variables to represent dependency. Useful to resolve data hazard.
   std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
+
+  // All operators. NOTE that even though a vector is used here, the operators
+  // are unordered.
   std::vector<std::unique_ptr<OpHandleBase>> ops_;
 };
......
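A small self-contained sketch (mock VarHandle, not Paddle's type) of the versioning scheme the comment above describes: each write to a named variable appends a new handle, and its offset in the vector is its version:

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct VarHandle {
  std::string name;
  int version;
};

int main() {
  using VersionList = std::vector<std::unique_ptr<VarHandle>>;
  // Outer vector indexed by device; here, two devices.
  std::vector<std::unordered_map<std::string, VersionList>> vars(2);

  // Two writes to "w@GRAD" on device 0 produce versions 0 and 1.
  auto& versions = vars[0]["w@GRAD"];
  versions.push_back(std::make_unique<VarHandle>(VarHandle{"w@GRAD", 0}));
  versions.push_back(std::make_unique<VarHandle>(VarHandle{"w@GRAD", 1}));

  // The latest version is simply the last element of the list.
  const VarHandle& latest = *versions.back();  // latest.version == 1
  (void)latest;
  return 0;
}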
@@ -48,6 +48,8 @@ class SSAGraphBuilder {
                                     const platform::Place &place,
                                     size_t place_offset);

+  // Add an output variable (each_var_name, place, place_offset) to op_handle,
+  // which belongs to graph.
   static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
                              const std::string &each_var_name,
                              const platform::Place &place, size_t place_offset);
......
@@ -255,11 +255,11 @@ TEST(LoDTensor, RecordIO) {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
     auto tensors = ReadFromRecordIO(&scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);

     tensors = ReadFromRecordIO(&scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
   }
......
@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool customize_scale_loss)
+    bool use_default_grad_scale)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
@@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
   details::MultiDevSSAGraphBuilder builder(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      customize_scale_loss, member_->nccl_ctxs_.get());
+      use_default_grad_scale, member_->nccl_ctxs_.get());
 #else
   details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                            params, member_->local_scopes_,
-                                           customize_scale_loss);
+                                           use_default_grad_scale);
 #endif
   auto graph = builder.Build(main_program);
......
@@ -40,7 +40,7 @@ class ParallelExecutor {
                    const ProgramDesc& main_program,
                    const std::string& loss_var_name, Scope* scope,
                    const std::vector<Scope*>& local_scopes,
-                   bool allow_op_delay, bool customize_scale_loss);
+                   bool allow_op_delay, bool use_default_grad_scale);

   ~ParallelExecutor();
......
@@ -120,11 +120,11 @@ bool SelectedRows::HasKey(int64_t key) const {
              : true;
 }

-std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
-                                       framework::Tensor* value) const {
+std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
+    std::vector<int64_t> keys, framework::Tensor* value) const {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
-  std::vector<int64_t> non_keys;
+  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
   int64_t value_width = value_->numel() / value_->dims()[0];
   PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                     "output tensor should have the same shape with table "
@@ -133,7 +133,7 @@ std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
   for (size_t i = 0; i < keys.size(); ++i) {
     int64_t index = Index(keys[i]);
     if (index == -1) {
-      non_keys.push_back(keys[i]);
+      non_keys_pair.push_back(std::make_pair(keys[i], static_cast<int64_t>(i)));
     } else {
       framework::VisitDataType(
           framework::ToDataType(value_->type()),
@@ -141,7 +141,7 @@ std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
                          index * value_width, value_width));
     }
   }
-  return non_keys;
+  return non_keys_pair;
 }

 bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
......
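Why return (key, index) pairs instead of just the missing keys? A self-contained sketch (mock table, not Paddle's API) of the intended call pattern: the recorded index lets the caller scatter remotely fetched values back into the right rows of the output:

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  std::unordered_map<int64_t, float> table = {{1, 10.f}, {3, 30.f}};
  std::vector<int64_t> keys = {1, 2, 3};
  std::vector<float> value(keys.size(), 0.f);

  // Mirror of SelectedRows::Get: fill hits, record (missing key, position).
  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
  for (size_t i = 0; i < keys.size(); ++i) {
    auto it = table.find(keys[i]);
    if (it == table.end()) {
      non_keys_pair.emplace_back(keys[i], static_cast<int64_t>(i));
    } else {
      value[i] = it->second;
    }
  }

  // With the index recorded, values fetched elsewhere can be written in place.
  for (auto& kv : non_keys_pair) {
    value[kv.second] = 20.f;  // pretend key 2 was fetched from a remote table
    std::cout << "filled key " << kv.first << " at index " << kv.second << "\n";
  }
  return 0;
}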
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <algorithm>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/lod_tensor.h"
@@ -78,10 +79,11 @@ class SelectedRows {
   /*
    * @brief Get values by the key list.
    *
-   * @return a list of keys which do not exist in the table
+   * @return a list of pairs which contain each non-existent key and its
+   * index in the value
    */
-  std::vector<int64_t> Get(std::vector<int64_t> keys,
-                           framework::Tensor* tensor) const;
+  std::vector<std::pair<int64_t, int64_t>> Get(std::vector<int64_t> keys,
+                                               framework::Tensor* value) const;

   /*
    * @brief Set a key-value pair into the table.
......
@@ -59,7 +59,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }

-TEST_F(SelectedRowsTester, Table) {
+TEST_F(SelectedRowsTester, SparseTable) {
   platform::CPUPlace cpu;
   SelectedRows table;
   // initialize a sparse table
@@ -87,11 +87,11 @@ TEST_F(SelectedRowsTester, SparseTable) {
   framework::Tensor get_value;
   get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
   std::vector<int64_t> keys({non_key, key});
-  auto non_keys = table.Get(keys, &get_value);
+  auto non_key_pairs = table.Get(keys, &get_value);

   ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
-  ASSERT_EQ(non_keys.size(), static_cast<size_t>(1));
-  ASSERT_EQ(non_keys[0], non_key);
+  ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
+  ASSERT_EQ(non_key_pairs[0].first, non_key);
 }

 }  // namespace framework
......
@@ -65,7 +65,7 @@ class TensorRTEngine : public EngineBase {
   // Initialize the inference network, so that TensorRT layers can add to this
   // network.
   void InitNetwork() {
-    infer_builder_.reset(createInferBuilder(logger_));
+    infer_builder_.reset(createInferBuilder(&logger_));
     infer_network_.reset(infer_builder_->createNetwork());
   }
   // After finishing adding ops, freeze this network and create the execution
......
@@ -46,13 +46,13 @@ const int kDataTypeSize[] = {
 // The following two APIs are implemented in TensorRT's header file and cannot
 // be loaded from the dynamic library, so we create our own implementations
 // and directly trigger the methods from the dynamic library.
-static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
 }

-static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }

 // A logger for creating a TensorRT infer builder.
@@ -80,7 +80,7 @@ class NaiveLogger : public nvinfer1::ILogger {
     return *x;
   }

-  virtual ~NaiveLogger() override {}
+  ~NaiveLogger() override {}
 };
......
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <cuda.h>
+#include <cuda_runtime_api.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include "NvInfer.h"
-#include "cuda.h"
-#include "cuda_runtime_api.h"
 #include "paddle/fluid/platform/dynload/tensorrt.h"

 namespace dy = paddle::platform::dynload;
@@ -43,7 +43,7 @@ class Logger : public nvinfer1::ILogger {
 class ScopedWeights {
  public:
-  ScopedWeights(float value) : value_(value) {
+  explicit ScopedWeights(float value) : value_(value) {
     w.type = nvinfer1::DataType::kFLOAT;
     w.values = &value_;
     w.count = 1;
@@ -58,13 +58,13 @@ class ScopedWeights {
 // The following two APIs are implemented in TensorRT's header file and cannot
 // be loaded from the dynamic library, so we create our own implementations
 // and directly trigger the methods from the dynamic library.
-nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
 }

-nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }

 const char* kInputTensor = "input";
@@ -74,7 +74,7 @@ const char* kOutputTensor = "output";
 nvinfer1::IHostMemory* CreateNetwork() {
   Logger logger;
   // Create the engine.
-  nvinfer1::IBuilder* builder = createInferBuilder(logger);
+  nvinfer1::IBuilder* builder = createInferBuilder(&logger);
   ScopedWeights weights(2.);
   ScopedWeights bias(3.);
@@ -103,9 +103,9 @@ nvinfer1::IHostMemory* CreateNetwork() {
   return model;
 }

-void Execute(nvinfer1::IExecutionContext& context, const float* input,
+void Execute(nvinfer1::IExecutionContext* context, const float* input,
              float* output) {
-  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  const nvinfer1::ICudaEngine& engine = context->getEngine();
   // Two binds, input and output
   ASSERT_EQ(engine.getNbBindings(), 2);
   const int input_index = engine.getBindingIndex(kInputTensor);
@@ -119,7 +119,7 @@ void Execute(nvinfer1::IExecutionContext* context, const float* input,
   // Copy the input to the GPU, execute the network, and copy the output back.
   ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
                                cudaMemcpyHostToDevice, stream));
-  context.enqueue(1, buffers, stream, nullptr);
+  context->enqueue(1, buffers, stream, nullptr);
   ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
   cudaStreamSynchronize(stream);
@@ -136,7 +136,7 @@ TEST(TensorrtTest, BasicFunction) {
   // Use the model to create an engine and an execution context.
   Logger logger;
-  nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+  nvinfer1::IRuntime* runtime = createInferRuntime(&logger);
   nvinfer1::ICudaEngine* engine =
       runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
   model->destroy();
@@ -145,7 +145,7 @@ TEST(TensorrtTest, BasicFunction) {
   // Execute the network.
   float input = 1234;
   float output;
-  Execute(*context, &input, &output);
+  Execute(context, &input, &output);
   EXPECT_EQ(output, input * 2 + 3);

   // Destroy the engine.
......
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/reduce.h>
 #include "paddle/fluid/operators/accuracy_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/adagrad_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
......
@@ -87,9 +87,13 @@ class BatchNormOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::ToDataType(ctx.Input<Tensor>("X")->type());
-    // For float or float16 input tensor, the type of the scale, bias, mean,
-    // and var tensors should both be float.
+    // The type of the scale, bias, mean, and var tensors should be float
+    // (for float or float16 input tensors) or double (for double input
+    // tensors).
     auto bn_param_type = framework::proto::VarType::FP32;
+    if (input_data_type == framework::proto::VarType::FP64) {
+      bn_param_type = framework::proto::VarType::FP64;
+    }
     PADDLE_ENFORCE_EQ(bn_param_type,
                       framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
                       "Scale input should be of float type");
@@ -492,8 +496,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
 REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
 REGISTER_OP_CPU_KERNEL(
-    batch_norm,
-    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
+    batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -287,6 +287,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormKernel<plat::CUDADeviceContext, double>,
     ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>);
+    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
@@ -195,10 +195,9 @@ std::string ItemToString(const BeamSearch::Item &item) {
   return stream.str();
 }

-class BeamSearchProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
+class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+  BeamSearchOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     // inputs and outputs stored in proto
     AddInput("pre_ids", "ids in previous step");
@@ -222,20 +221,32 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };

-class BeamSearchInferShape : public framework::InferShapeBase {
+class BeamSearchOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext *context) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
          std::vector<std::string>({"pre_ids", "ids", "scores"})) {
-      PADDLE_ENFORCE(context->HasInput(arg),
-                     "BeamSearch need input argument '%s'", arg);
+      PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
+                     arg);
     }
     for (const std::string &arg :
          std::vector<std::string>({"selected_ids", "selected_scores"})) {
-      PADDLE_ENFORCE(context->HasOutput(arg),
+      PADDLE_ENFORCE(ctx->HasOutput(arg),
                      "BeamSearch need output argument '%s'", arg);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+        platform::CPUPlace());
+    return kt;
+  }
 };

 class BeamSearchInferVarType : public framework::VarTypeInference {
@@ -254,8 +265,13 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
-                  paddle::operators::BeamSearchProtoAndCheckerMaker,
-                  paddle::operators::BeamSearchInferShape,
-                  paddle::operators::BeamSearchInferVarType,
-                  paddle::framework::EmptyGradOpMaker);
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker,
+                  ops::BeamSearchInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    beam_search,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -192,49 +192,29 @@ std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
 std::string ItemToString(const BeamSearch::Item& item);

-class BeamSearchOp : public framework::OperatorBase {
+template <typename DeviceContext, typename T>
+class BeamSearchOpKernel : public framework::OpKernel<T> {
  public:
-  BeamSearchOp(const std::string& type,
-               const framework::VariableNameMap& inputs,
-               const framework::VariableNameMap& outputs,
-               const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  BeamSearchOp(const BeamSearchOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not Implemented");
-  }
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto ids_var = scope.FindVar(Input("ids"));
-    auto scores_var = scope.FindVar(Input("scores"));
-    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* ids_var = context.Input<framework::LoDTensor>("ids");
+    auto* scores_var = context.Input<framework::LoDTensor>("scores");
+    auto* pre_ids_var = context.Input<framework::LoDTensor>("pre_ids");
     PADDLE_ENFORCE_NOT_NULL(ids_var);
     PADDLE_ENFORCE_NOT_NULL(scores_var);
     PADDLE_ENFORCE_NOT_NULL(pre_ids_var);

-    auto& ids = ids_var->Get<framework::LoDTensor>();
-    auto& scores = scores_var->Get<framework::LoDTensor>();
-    auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>();
-    size_t level = Attr<int>("level");
-    size_t beam_size = Attr<int>("beam_size");
-    int end_id = Attr<int>("end_id");
-    BeamSearch alg(ids, scores, level, beam_size, end_id);
-
-    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
-    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
+    size_t level = context.Attr<int>("level");
+    size_t beam_size = context.Attr<int>("beam_size");
+    int end_id = context.Attr<int>("end_id");
+    BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id);
+    auto selected_ids_var =
+        context.Output<framework::LoDTensor>("selected_ids");
+    auto selected_scores_var =
+        context.Output<framework::LoDTensor>("selected_scores");
     PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
     PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
-    auto& selected_ids_tensor =
-        *selected_ids_var->GetMutable<framework::LoDTensor>();
-    auto& selected_scores_tensor =
-        *selected_scores_var->GetMutable<framework::LoDTensor>();
-    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
+    alg(*pre_ids_var, selected_ids_var, selected_scores_var);
   }
 };

 }  // namespace operators
 }  // namespace paddle
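The refactor above moves beam search from a plain OperatorBase to the OperatorWithKernel/OpKernel pattern, so per-dtype kernels can be registered and selected from the input's type. A self-contained sketch (mock registry and hypothetical names, not Paddle's actual machinery) of that dispatch idea:

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <typeindex>
#include <typeinfo>

template <typename T>
void BeamSearchKernelCompute() {
  // Stand-in for BeamSearchOpKernel<DeviceContext, T>::Compute.
  std::cout << "running beam_search kernel for type " << typeid(T).name()
            << "\n";
}

int main() {
  // Stand-in for REGISTER_OP_CPU_KERNEL: map dtype -> kernel functor.
  std::map<std::type_index, std::function<void()>> kernels;
  kernels[typeid(float)] = BeamSearchKernelCompute<float>;
  kernels[typeid(int64_t)] = BeamSearchKernelCompute<int64_t>;

  // Stand-in for GetExpectedKernelType: pick by the "pre_ids" tensor's dtype.
  std::type_index picked = typeid(float);
  kernels.at(picked)();
  return 0;
}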
@@ -10,7 +10,7 @@
 limitations under the License. */

 #include "paddle/fluid/operators/bilinear_interp_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
......
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/box_coder_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
......
@@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
+      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
     }
   }
 };
......
@@ -20,6 +20,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"

+DEFINE_bool(cudnn_algo_use_autotune, true,
+            "Whether to allow the use of an autotuning algorithm for the "
+            "convolution operator. The autotuning algorithm may be "
+            "non-deterministic. If false, the algorithm is deterministic.");
+
 namespace paddle {
 namespace operators {

@@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              handle, cudnn_filter_desc,
-              // dyDesc: Handle to the previously initialized input differential
-              // tensor descriptor.
-              cudnn_output_grad_desc, cudnn_conv_desc,
-              // dxDesc: Handle to the previously initialized output tensor
-              // descriptor.
-              cudnn_input_desc,
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &data_algo));
+      if (FLAGS_cudnn_algo_use_autotune) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                handle, cudnn_filter_desc,
+                // dyDesc: Handle to the previously initialized input
+                // differential tensor descriptor.
+                cudnn_output_grad_desc, cudnn_conv_desc,
+                // dxDesc: Handle to the previously initialized output tensor
+                // descriptor.
+                cudnn_input_desc,
+                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      }
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
     if (filter_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
-              cudnn_filter_desc,
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &filter_algo));
+      if (FLAGS_cudnn_algo_use_autotune) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+                handle, cudnn_input_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_filter_desc,
+                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      }
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
...
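Note: the hunks above gate cuDNN's backward-algorithm heuristics behind a gflags switch, trading speed for reproducibility. A minimal sketch of the same flag-gated selection pattern, with illustrative names only (not the actual Paddle symbols):

#include <gflags/gflags.h>

DEFINE_bool(use_autotune, true,
            "Autotuning may pick faster but non-deterministic kernels.");

enum class Algo { kAutotuned, kDeterministic };

// Force a fixed algorithm when gradients must be bitwise reproducible.
Algo ChooseAlgo() {
  return FLAGS_use_autotune ? Algo::kAutotuned : Algo::kDeterministic;
}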
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_shift_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -82,7 +82,9 @@ class RequestSend final : public RequestBase {
   virtual std::string GetReqName() { return request_->Varname(); }

   virtual void Process() {
-    queue_->Push(std::make_pair(request_->Varname(), request_));
+    std::string var_name = GetReqName();
+    VLOG(3) << "RequestSend " << var_name;
+    queue_->Push(std::make_pair(var_name, request_));

     sendrecv::VoidMessage reply;
     responder_.Finish(reply, ::grpc::Status::OK, this);
@@ -106,7 +108,7 @@ class RequestGet final : public RequestBase {
         responder_(&ctx_),
         scope_(scope),
         queue_(queue) {
-    int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
+    auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
     service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
                                 cq_, this);
   }
@@ -118,6 +120,7 @@ class RequestGet final : public RequestBase {
   virtual void Process() {
     // proc request.
     std::string var_name = request_.varname();
+    VLOG(3) << "RequestGet " << var_name;
     auto* var = scope_->FindVar(var_name);

     ::grpc::ByteBuffer reply;
@@ -176,7 +179,7 @@ class RequestPrefetch final : public RequestBase {
     ::grpc::ByteBuffer reply;

     std::string var_name = request_->OutVarname();
-    VLOG(3) << "prefetch var " << var_name;
+    VLOG(3) << "RequestPrefetch " << var_name;
     auto var_desc = program_->Block(0).FindVar(var_name);
     framework::Scope* local_scope = &scope_->NewScope();
     auto* var = local_scope->FindVar(var_name);
@@ -307,18 +310,20 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
   bool ok = false;
   while (true) {
-    VLOG(3) << "HandleRequest for " << cq_name << " while in";
+    VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
     if (!cq->Next(&tag, &ok)) {
       LOG(INFO) << cq_name << " CompletionQueue shutdown!";
       break;
     }
-    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";
+    VLOG(3) << "HandleRequest for " << cq_name << " get Next";
     PADDLE_ENFORCE(tag);
     if (sync_mode_) {
       // FIXME(typhoonzero): de-couple the barriers with recv_op
       if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
       if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
+      VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
     }

     RequestBase* base = reinterpret_cast<RequestBase*>(tag);
@@ -336,9 +341,9 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
     switch (base->Status()) {
       case PROCESS: {
-        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
         TryToRegisterNewOne();
         base->Process();
+        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
         break;
       }
       case FINISH: {
...
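Note: for readers unfamiliar with async gRPC, here is a self-contained sketch of the completion-queue loop that HandleRequest implements. The tag handling is illustrative only, and the header path assumes a recent grpc++ layout:

#include <grpcpp/grpcpp.h>

void DrainQueue(grpc::CompletionQueue* cq) {
  void* tag = nullptr;
  bool ok = false;
  // Next() blocks until an event arrives and returns false only after the
  // queue has been Shutdown() and fully drained.
  while (cq->Next(&tag, &ok)) {
    if (!ok) continue;  // the RPC behind this tag was cancelled or failed
    // The real server casts `tag` back to a RequestBase* here and advances
    // its Process()/Finish() state machine.
  }
}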
@@ -108,7 +108,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
     EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
   }
   for (size_t i = 0; i < rows2->size(); ++i) {
-    EXPECT_EQ(rows_data2[i], i);
+    EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
   }
   EXPECT_EQ(slr2->height(), 1000);
 }
...
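Note: the cast above silences a signed/unsigned mismatch: rows_data2 holds int64_t values while the loop index is size_t. A minimal sketch of the corrected comparison pattern (hypothetical helper, not the Paddle test itself):

#include <cstdint>
#include <vector>

bool RowsMatchIndices(const std::vector<int64_t>& rows) {
  for (size_t i = 0; i < rows.size(); ++i) {
    // Cast the index instead of mixing signednesses inside the comparison.
    if (rows[i] != static_cast<int64_t>(i)) return false;
  }
  return true;
}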
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/edit_distance_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
...
@@ -22,6 +22,7 @@ limitations under the License. */
 #ifdef __NVCC__
 #include <cuda.h>
 #include <thrust/iterator/iterator_adaptor.h>
+#include "paddle/fluid/platform/cuda_primitives.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif

@@ -333,24 +334,12 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
     }
   }
 }

 #ifdef __NVCC__
-// __shfl_down has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
-  // TODO(zcd): The warp size should be taken from the
+  // NOTE(zcd): The warp size should be taken from the
   // parameters of the GPU but not specified as 32 simply.
   // To make reduceSum more efficient,
   // I use Warp-Level Parallelism and assume the Warp size
@@ -362,7 +351,7 @@ __device__ T reduceSum(T val, int tid, int len) {
   CREATE_SHFL_MASK(mask, tid < len);

   for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += __shfl_down_sync(mask, val, offset);
+    val += platform::__shfl_down_sync(mask, val, offset);

   if (tid < warpSize) shm[tid] = 0;
@@ -378,7 +367,7 @@ __device__ T reduceSum(T val, int tid, int len) {
   if (tid < warpSize) {
     val = shm[tid];
     for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += __shfl_down_sync(mask, val, offset);
+      val += platform::__shfl_down_sync(mask, val, offset);
   }
   return val;
...
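Note: the compatibility shims removed here now live in cuda_primitives.h. For reference, a self-contained sketch of the warp-shuffle reduction pattern that reduceSum relies on, written against the CUDA 9+ intrinsics (function name is illustrative):

__device__ float WarpReduceSum(float val) {
  // After the loop, lane 0 holds the sum of all 32 lane values.
  unsigned mask = __activemask();
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);
  }
  return val;
}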
@@ -45,20 +45,6 @@ static void split(const std::string &str, char sep,
   }
 }

-static void AsyncExecuteBlock(framework::Executor *executor,
-                              framework::ExecutorPrepareContext *prepared,
-                              framework::Scope *scope) {
-  std::future<void> future = framework::Async([&executor, &prepared, &scope]() {
-    try {
-      executor->RunPreparedContext(prepared, scope, false, false);
-    } catch (std::exception &e) {
-      LOG(ERROR) << "run sub program error " << e.what();
-    }
-  });
-  // TODO(qiao) maybe we can remove this
-  future.wait();
-}
-
 static void ParallelExecuteBlocks(
     const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
     const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
@@ -201,14 +187,40 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
   }  // while(true)
 }

+static void AsyncUpdateThread(
+    const std::string &var_name, const bool &exit_flag,
+    const std::shared_ptr<detail::ReceivedQueue> &queue,
+    framework::Executor *executor,
+    framework::ExecutorPrepareContext *prepared) {
+  VLOG(3) << "update thread for " << var_name << " started";
+  while (!exit_flag) {
+    const detail::ReceivedMessage v = queue->Pop();
+    auto recv_var_name = v.first;
+    auto var = v.second->GetVar();
+    if (var == nullptr) {
+      LOG(ERROR) << "Can not find server side var: " << recv_var_name;
+      PADDLE_THROW("Can not find server side var");
+    }
+    auto fs = framework::Async([var_name, &executor, &v, prepared] {
+      try {
+        executor->RunPreparedContext(prepared, v.second->GetMutableLocalScope(),
+                                     false, false);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+    });
+    fs.wait();
+  }
+}
+
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
-                                   framework::ProgramDesc *program,
-                                   framework::Scope *recv_scope,
-                                   framework::BlockDesc *prefetch_block) const {
+                                   framework::ProgramDesc *program) const {
   VLOG(3) << "RunAsyncLoop in";
   // grad name to block id
   std::unordered_map<std::string, int32_t> grad_to_block_id;
   std::unordered_map<int32_t, std::string> id_to_grad;
+  std::unordered_map<std::string, std::shared_ptr<detail::ReceivedQueue>>
+      grad_to_queue;

   auto grad_to_block_id_str =
       Attr<std::vector<std::string>>("grad_to_block_id");
@@ -220,6 +232,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
     int block_id = std::stoi(pieces[1]);
     grad_to_block_id[pieces[0]] = block_id;
+    grad_to_queue[pieces[0]] = std::make_shared<detail::ReceivedQueue>();
     id_to_grad[block_id] = pieces[0];
   }
   size_t num_blocks = program->Size();
@@ -238,8 +251,21 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
   }

-  VLOG(3) << "RunAsyncLoop into while";
   bool exit_flag = false;
+
+  VLOG(3) << "start async optimize threads";
+  std::vector<std::future<void>> fs;
+  for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) {
+    std::string grad_name = iter->first;
+    VLOG(3) << "create async update thread for " << grad_name;
+    fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor,
+                                     &grad_to_queue, &grad_to_prepared_ctx]() {
+      AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name],
+                        executor, grad_to_prepared_ctx[grad_name].get());
+    }));
+  }
+
+  VLOG(3) << "RunAsyncLoop into while";
   while (!exit_flag) {
     const detail::ReceivedMessage v = rpc_service_->Get();
     auto recv_var_name = v.first;
@@ -249,13 +275,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
       break;
     } else {
       VLOG(3) << "received grad: " << recv_var_name;
-      auto var = v.second->GetVar();
-      if (var == nullptr) {
-        LOG(ERROR) << "Can not find server side var: " << recv_var_name;
-        PADDLE_THROW("Can not find server side var");
-      }
-      AsyncExecuteBlock(executor, grad_to_prepared_ctx[recv_var_name].get(),
-                        v.second->GetMutableLocalScope());
+      grad_to_queue[recv_var_name]->Push(v);
     }

     if (exit_flag) {
@@ -304,7 +324,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   if (sync_mode) {
     RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
   } else {
-    RunAsyncLoop(&executor, program, &recv_scope, prefetch_block);
+    RunAsyncLoop(&executor, program);
   }
 }
...
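Note: the rewrite above replaces one shared dispatch loop with a per-gradient blocking queue and consumer thread, so a slow update to one variable no longer stalls updates to the others. A minimal sketch of the blocking-queue primitive this design assumes (illustrative only, not detail::ReceivedQueue itself):

#include <condition_variable>
#include <mutex>
#include <queue>

template <typename T>
class BlockingQueue {
 public:
  void Push(T v) {
    {
      std::lock_guard<std::mutex> guard(mu_);
      q_.push(std::move(v));
    }
    cv_.notify_one();  // wake exactly one waiting consumer
  }

  T Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !q_.empty(); });  // block until data
    T v = std::move(q_.front());
    q_.pop();
    return v;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<T> q_;
};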
@@ -47,9 +47,7 @@ class ListenAndServOp : public framework::OperatorBase {
                    framework::BlockDesc* prefetch_block) const;

   void RunAsyncLoop(framework::Executor* executor,
-                    framework::ProgramDesc* program,
-                    framework::Scope* recv_scope,
-                    framework::BlockDesc* prefetch_block) const;
+                    framework::ProgramDesc* program) const;

   void Stop() override;
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
constexpr int64_t kNoPadding = -1;
class LookupSparseTableInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LookupSparseTableOp should not be null.");
auto shape_w = ctx->GetInputDim("W");
auto shape_ids = ctx->GetInputDim("Ids");
shape_w[0] = shape_ids.size();
ctx->SetOutputDim("Out", shape_w);
}
};
class LookupSparseTableOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto out_var = scope.FindVar(Output("Out"));
auto w_var = scope.FindVar(Input("W"));
auto ids_var = scope.FindVar(Input("Ids"));
unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
float min = Attr<float>("min");
float max = Attr<float>("max");
bool auto_grown_table = Attr<bool>("auto_grown_table");
    PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
                   "The type of Out var should be LoDTensor.");
PADDLE_ENFORCE(w_var->IsType<framework::SelectedRows>(),
"The type of W var should be SelectedRows.");
PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
"The type of Ids var should be LoDTensor.");
auto &ids_t = ids_var->Get<framework::LoDTensor>();
auto out_t = out_var->GetMutable<framework::LoDTensor>();
auto w_t = w_var->GetMutable<framework::SelectedRows>();
std::vector<int64_t> keys;
keys.resize(ids_t.numel());
    for (size_t i = 0; i < static_cast<size_t>(ids_t.numel()); ++i) {
keys[i] = ids_t.data<int64_t>()[i];
}
// TODO(Yancey1989): support CUDA Place for the sparse table
platform::CPUPlace cpu;
auto out_shape = w_t->value().dims();
out_shape[0] = keys.size();
out_t->Resize(out_shape);
out_t->mutable_data(cpu, w_t->value().type());
PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
framework::proto::VarType::FP32,
"The sparse table only support FP32");
auto non_keys_pair = w_t->Get(keys, out_t);
if (!auto_grown_table) {
      PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
                        "some keys do not exist in the sparse table.");
}
auto value_shape = w_t->value().dims();
value_shape[0] = 1;
for (const auto &it : non_keys_pair) {
const auto key = it.first;
const auto index = it.second;
framework::Tensor value;
value.Resize(value_shape);
auto data = value.mutable_data<float>(cpu);
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<float> dist(min, max);
int64_t size = value.numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
w_t->Set(key, value);
memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
cpu, value.data<float>(), value.numel() * sizeof(float));
}
}
};
class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LookupSparseTableOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("W",
"(SelectedRows) The input represents embedding table, "
"which is a learnable parameter.");
AddInput("Ids",
"(LoDTensor) Ids's type should be LoDTensor"
"THe ids to be looked up in W.");
AddOutput("Out",
"(LoDTensor) The lookup results, which have the "
"same type as W.");
AddAttr<int64_t>("padding_idx",
"(int64, default -1) "
"If the value is -1, it makes no effect to lookup. "
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding);
AddAttr<float>("min",
"(float, default -1.0) "
"Minimum value of uniform random")
.SetDefault(-1.0f);
AddAttr<float>("max",
"(float, default 1.0) "
"Maximun value of uniform random")
.SetDefault(1.0f);
AddAttr<int>("seed",
"(int, default 0) "
"Random seed used for generating samples. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.")
.SetDefault(0);
AddAttr<bool>("auto_grown_table",
"(bool default false)"
"Whether create new value if for nonexistent key.")
.SetDefault(true);
AddComment(R"DOC(
Lookup Sparse Table Operator.

This operator performs a lookup on the parameter W and
concatenates the looked-up rows into the output tensor.

The type of Ids (Input) is LoDTensor; Ids contains
the ids to be looked up in W.
If an id is not in the sparse table, this operator returns a
random value and inserts that value into the table, so the next
lookup of the same id is consistent.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(lookup_sparse_table, ops::LookupSparseTableOp,
ops::LookupSparseTableInferShape,
ops::LookupSparseTableOpMaker,
paddle::framework::EmptyGradOpMaker);
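Note: a condensed sketch of the "random value on miss" behaviour the operator DOC describes, written as a hypothetical free function rather than the Paddle API:

#include <cstdint>
#include <random>
#include <unordered_map>
#include <vector>

using Table = std::unordered_map<int64_t, std::vector<float>>;

std::vector<float> LookupOrInit(Table* table, int64_t key, size_t width,
                                unsigned seed) {
  auto it = table->find(key);
  if (it != table->end()) return it->second;  // hit: return the stored row
  // Miss: draw a uniform random row and insert it so the next lookup of the
  // same key returns the same value.
  std::minstd_rand engine(seed);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  std::vector<float> row(width);
  for (auto& v : row) v = dist(engine);
  (*table)[key] = row;
  return row;
}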
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/platform/assert.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/concat.h"
+#include <vector>

 namespace paddle {
 namespace operators {
@@ -70,20 +71,20 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  std::vector<framework::Tensor>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs.size();
+    int num = outputs->size();
     int input_rows = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = outputs->at(0).dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
     int input_cols = 0;

-    std::vector<int64_t> output_cols(outputs.size());
+    std::vector<int64_t> output_cols(outputs->size());
     for (int i = 0; i < num; ++i) {
-      int t_cols = outputs[i].numel() / input_rows;
+      int t_cols = outputs->at(i).numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
@@ -95,7 +96,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs[j].data<T>() + k * col_len;
+        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
         memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
                      sizeof(T) * col_len);
         col_idx += col_len;
...
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
@@ -202,16 +204,16 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  std::vector<framework::Tensor>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int o_num = outputs.size();
+    int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = outputs->at(0).dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }
-    int out_col = outputs[0].numel() / out_row;
+    int out_col = outputs->at(0).numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;
@@ -221,13 +223,13 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs[i].numel() / out_row;
+      int t_col = outputs->at(i).numel() / out_row;
       if (sameShape) {
         if (t_col != out_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs[i].data<T>();
+      outputs_ptr[i] = outputs->at(i).data<T>();
     }

     T** dev_out_gpu_data =
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -56,7 +57,7 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>& outputs);
+                  const int axis, std::vector<framework::Tensor>* outputs);
 };

 }  // namespace math
...
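Note: the signature change across these three files follows the Google C++ style rule that arguments a callee mutates are passed by pointer, so the mutation is visible at the call site. A minimal before/after sketch (hypothetical names):

#include <vector>

void FillByRef(std::vector<int>& out) { out.push_back(1); }   // mutation hidden
void FillByPtr(std::vector<int>* out) { out->push_back(1); }  // explicit

void Caller() {
  std::vector<int> v;
  FillByRef(v);   // reads like a by-value call
  FillByPtr(&v);  // the & makes the write obvious
}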
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
@@ -31,11 +32,11 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
 template <typename T>
 __device__ __forceinline__ T sum_single_warp(T val) {
-  val += __shfl_down(val, 16);
-  val += __shfl_down(val, 8);
-  val += __shfl_down(val, 4);
-  val += __shfl_down(val, 2);
-  val += __shfl_down(val, 1);
+  val += platform::__shfl_down_sync(0, val, 16);
+  val += platform::__shfl_down_sync(0, val, 8);
+  val += platform::__shfl_down_sync(0, val, 4);
+  val += platform::__shfl_down_sync(0, val, 2);
+  val += platform::__shfl_down_sync(0, val, 1);
   return val;
 }
...
@@ -14,7 +14,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -89,14 +89,14 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
   __m256 r_value_reset_gate;
   __m256 r_value_reset_output;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size);

   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_reset_gate = reset_gate[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }

     op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
@@ -104,7 +104,7 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
     update_gate[i] = r_value_update_gate;
     reset_gate[i] = r_value_reset_gate;
-    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
+    (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output;
   }
 #endif
 }
@@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
   __m256 r_value_frame_state;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
   __m256 r_output;
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2);

   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_frame_state = frame_state[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }

     op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
                     r_output, active_node);

     frame_state[i] = r_value_frame_state;
-    ((__m256 *)output_value)[i] = r_output;
+    (reinterpret_cast<__m256 *>(output_value))[i] = r_output;
   }
 #endif
 }
@@ -284,20 +284,22 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
   __m256 r_out_grad;
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
-  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *frame_state_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
+  __m256 *frame_state_grad =
+      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);

   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
     r_frame_state_value = frame_state_value[i];
-    r_out_grad = ((__m256 *)output_grad)[i];
+    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
     }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }

     op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
@@ -307,7 +309,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
@@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
   __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
-  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *reset_gate_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size);
+  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);

   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
@@ -338,13 +341,13 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     r_reset_gate_value = reset_gate_value[i];

     if (prev_out_value && prev_out_grad) {
-      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
+      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
     }
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
     }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }

     op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
@@ -354,7 +357,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     update_gate_grad[i] = r_update_gate_grad;
     reset_gate_grad[i] = r_reset_gate_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
...
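Note: the edits in this file replace C-style casts to __m256* with reinterpret_cast, which linters such as cpplint can enforce; the 32-byte alignment requirement for aligned AVX loads is unchanged. An illustrative snippet of the pattern (hypothetical function):

#include <immintrin.h>

// Assumes `data` is 32-byte aligned and `len` is a multiple of 8.
void DoubleInPlace(float* data, int len) {
  __m256* vec = reinterpret_cast<__m256*>(data);  // was: (__m256 *)data
  const __m256 two = _mm256_set1_ps(2.0f);
  for (int i = 0; i < len / 8; ++i) {
    vec[i] = _mm256_mul_ps(vec[i], two);
  }
}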
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/device_context.h"

 namespace paddle {
...
@@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_state_atv;
   __m256 r_out;

-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);

   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -175,13 +177,13 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }

     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }

     op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
@@ -192,9 +194,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     value_ig[i] = r_value_ig;
     value_fg[i] = r_value_fg;
     value_og[i] = r_value_og;
-    ((__m256 *)value.state_value)[i] = r_state;
-    ((__m256 *)value.state_active_value)[i] = r_state_atv;
-    ((__m256 *)value.output_value)[i] = r_out;
+    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
+    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
+    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
   }
 #endif
 }
@@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_checkFGrad;
   __m256 r_checkOGrad;

-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
-  __m256 *grad_in = (__m256 *)grad.gate_grad;
-  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
-  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
-  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
+  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
+  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);

   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -242,16 +246,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }

-    r_state = ((__m256 *)value.state_value)[i];
-    r_state_atv = ((__m256 *)value.state_active_value)[i];
-    r_output_grad = ((__m256 *)grad.output_grad)[i];
-    r_state_grad = ((__m256 *)grad.state_grad)[i];
+    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
+    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
+    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
+    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }

     op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
@@ -264,15 +268,18 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     grad_ig[i] = r_grad_ig;
     grad_fg[i] = r_grad_fg;
     grad_og[i] = r_grad_og;
-    ((__m256 *)grad.state_grad)[i] = r_state_grad;
+    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;

     if (grad.prev_state_grad)
-      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
+      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
     if (value.prev_state_value) {
-      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
-      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
+      if (grad.check_ig_grad)
+        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
+      if (grad.check_fg_grad)
+        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
     }
-    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
+    if (grad.check_og_grad)
+      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
   }
 #endif
 }
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/device_context.h"

 namespace paddle {
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -23,32 +23,29 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
 }

 TEST(math_function, notrans_mul_trans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);

   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));

-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);

   out_gpu.mutable_data<float>({2, 2}, gpu_place);

-  paddle::operators::math::matmul<CUDADeviceContext, float>(
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);

-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);

   float* out_ptr = out.data<float>();
   context.Wait();
@@ -59,39 +56,38 @@ TEST(math_function, notrans_mul_trans_fp32) {
 }

 TEST(math_function, notrans_mul_trans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;

-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);

   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }

-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);

-  out_gpu.mutable_data<float16>({2, 2}, gpu_place);
+  out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);

-  paddle::operators::math::matmul<CUDADeviceContext, float16>(
-      context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
-      float16(0));
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>(
+      context, input1_gpu, false, input2_gpu, true,
+      paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));

-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);

-  float16* out_ptr = out.data<float16>();
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
@@ -100,32 +96,29 @@ TEST(math_function, notrans_mul_trans_fp16) {
 }

 TEST(math_function, trans_mul_notrans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);

   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));

-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);

   out_gpu.mutable_data<float>({3, 3}, gpu_place);

   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);

-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);

   float* out_ptr = out.data<float>();
   context.Wait();
@@ -141,39 +134,38 @@ TEST(math_function, trans_mul_notrans_fp32) {
 }

 TEST(math_function, trans_mul_notrans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;

-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);

   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }

-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);

-  out_gpu.mutable_data<float16>({3, 3}, gpu_place);
+  out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);

-  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
-      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
-      float16(0));
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>(
+      context, input1_gpu, true, input2_gpu, false,
+      paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));

-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);

-  float16* out_ptr = out.data<float16>();
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
@@ -187,19 +179,16 @@ TEST(math_function, trans_mul_notrans_fp16) {
 }

 TEST(math_function, gemm_notrans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);

   int m = 2;
   int n = 3;
@@ -214,9 +203,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));

-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -224,7 +213,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);

-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);

   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -244,19 +233,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
 }

 TEST(math_function, gemm_notrans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;

-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);

   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -266,26 +252,31 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({3, 4}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float16* a = input1_gpu.data<float16>(); paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
float16* b = input2_gpu.data<float16>(); paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place); paddle::platform::float16* c =
input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), paddle::platform::float16>(
c + 1, 4); context, false, false, m, n, k, paddle::platform::float16(1), a, 3, b + 1,
4, paddle::platform::float16(1), c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
} }
TEST(math_function, gemm_trans_cublas_fp32) { TEST(math_function, gemm_trans_cublas_fp32) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
Tensor input1; paddle::framework::Tensor input1_gpu;
Tensor input2; paddle::framework::Tensor input2_gpu;
Tensor input3; paddle::framework::Tensor input3_gpu;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
int m = 2; int m = 2;
int n = 3; int n = 3;
...@@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
...@@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
context.Wait(); context.Wait();
EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[0], 0);
...@@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) {
} }
TEST(math_function, gemm_trans_cublas_fp16) { TEST(math_function, gemm_trans_cublas_fp16) {
using namespace paddle::framework; paddle::framework::Tensor input1;
using namespace paddle::platform; paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
Tensor input1; paddle::platform::CPUPlace cpu_place;
Tensor input2; paddle::platform::CUDAPlace gpu_place(0);
Tensor input3; paddle::platform::CUDADeviceContext context(gpu_place);
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) {
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); paddle::platform::float16* input1_ptr =
input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place); paddle::platform::float16* input2_ptr =
input2.mutable_data<paddle::platform::float16>({4, 3}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(), fill_fp16_data(input2_ptr, input2.numel(),
{0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place); paddle::platform::float16* input3_ptr =
input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
float16* a = input1_gpu.data<float16>(); paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
float16* b = input2_gpu.data<float16>(); paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place); paddle::platform::float16* c =
input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), paddle::platform::float16>(
c + 1, 4); context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3,
3, paddle::platform::float16(1), c + 1, 4);
TensorCopySync(input3_gpu, cpu_place, &input3); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
context.Wait(); context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0); EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
...@@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) {
template <typename T> template <typename T>
void GemvTest(int m, int n, bool trans) { void GemvTest(int m, int n, bool trans) {
using namespace paddle::framework; paddle::framework::Tensor mat_a;
using namespace paddle::platform; paddle::framework::Tensor vec_b;
paddle::framework::Tensor vec_c;
Tensor mat_a;
Tensor vec_b;
Tensor vec_c;
CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place); T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place); T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place); T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
Tensor g_mat_a; paddle::framework::Tensor g_mat_a;
Tensor g_vec_b; paddle::framework::Tensor g_vec_b;
Tensor g_vec_c; paddle::framework::Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place); T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place); T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place); T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
...@@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) { ...@@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i); data_b[i] = static_cast<T>(i);
} }
TensorCopySync(mat_a, gpu_place, &g_mat_a); paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
TensorCopySync(vec_b, gpu_place, &g_vec_b); paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
paddle::operators::math::gemv<CUDADeviceContext, T>( paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a, context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c); g_data_b, 0., g_data_c);
TensorCopySync(g_vec_c, cpu_place, &vec_c); paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
if (!trans) { if (!trans) {
for (int i = 0; i < m; ++i) { for (int i = 0; i < m; ++i) {
......
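Reviewer note: these tests lean on a fill_fp16_data helper that is defined outside the hunks shown here. A minimal sketch of what such a helper presumably does follows; the name and signature are read off the call sites above, but the body and the float16.h include path are assumptions, not the committed code.

#include <vector>
#include "paddle/fluid/platform/float16.h"

// Assumed helper: convert float literals into a float16 buffer element by
// element, so the fp16 tests can state their fixtures as plain floats.
void fill_fp16_data(paddle::platform::float16* in_ptr, int64_t size,
                    const std::vector<float>& data) {
  for (int64_t i = 0; i < size; ++i) {
    in_ptr[i] = paddle::platform::float16(data[i]);
  }
}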
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
namespace operators {
...
@@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/pooling.h"
+#include <algorithm>
+#include <vector>

namespace paddle {
namespace operators {
@@ -27,9 +28,10 @@ template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -63,11 +65,11 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
        T ele = pool_process.initial();
        for (int h = hstart; h < hend; ++h) {
          for (int w = wstart; w < wend; ++w) {
-            pool_process.compute(ele, input_data[h * input_width + w]);
+            pool_process.compute(input_data[h * input_width + w], &ele);
          }
        }
        int pool_size = (hend - hstart) * (wend - wstart);
-        pool_process.finalize(ele, (static_cast<T>(pool_size)));
+        pool_process.finalize(static_cast<T>(pool_size), &ele);
        output_data[ph * output_width + pw] = ele;
      }
    }
@@ -86,13 +88,12 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <typename PoolProcess, class T>
class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, PoolProcess pool_grad_process,
+      framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -131,8 +132,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                input_data[h * input_width + w],
                output_data[ph * output_width + pw],
                output_grad_data[ph * output_width + pw],
-                input_grad_data[h * input_width + w],
-                static_cast<T>(scale));
+                static_cast<T>(scale),
+                input_grad_data + h * input_width + w);
          }
        }
      }
@@ -154,12 +155,11 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <class T>
class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
 public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -246,9 +246,10 @@ template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -293,14 +294,14 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  pool_process.compute(
-                      ele,
-                      input_data[(d * input_height + h) * input_width + w]);
+                      input_data[(d * input_height + h) * input_width + w],
+                      &ele);
                }
              }
            }
            int pool_size =
                (dend - dstart) * (hend - hstart) * (wend - wstart);
-            pool_process.finalize(ele, static_cast<T>(pool_size));
+            pool_process.finalize(static_cast<T>(pool_size), &ele);
            output_data[output_idx] = ele;
          }
        }
@@ -320,13 +321,12 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, PoolProcess pool_grad_process,
+      framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -379,8 +379,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                    (pd * output_height + ph) * output_width + pw;
                pool_grad_process.compute(
                    input_data[input_idx], output_data[output_idx],
-                    output_grad_data[output_idx],
-                    input_grad_data[input_idx], static_cast<T>(scale));
+                    output_grad_data[output_idx], static_cast<T>(scale),
+                    input_grad_data + input_idx);
              }
            }
          }
@@ -404,12 +404,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <class T>
class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
 public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -510,9 +509,10 @@ template <typename T1, typename T2>
class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -576,8 +576,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_height = input_grad->dims()[2];
@@ -628,9 +629,10 @@ template <typename T1, typename T2>
class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -708,8 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_depth = input_grad->dims()[2];
...
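Reviewer note: switching ksize, strides, and paddings from std::vector<int>& to const std::vector<int>& is more than style; non-const references cannot bind to temporaries, so the old signatures forced callers to materialize named vectors they never mutated. A minimal sketch of a call that only compiles with the new const-ref signatures (cpu_ctx, input, and output are hypothetical names, and the tensors are assumed to be set up elsewhere):

// Braced-init-list arguments create std::vector<int> temporaries, which
// bind to const references only.
paddle::operators::math::Pool2dFunctor<
    paddle::platform::CPUDeviceContext,
    paddle::operators::math::MaxPool<float>, float>
    pool2d_forward;
pool2d_forward(cpu_ctx, input, /*ksize=*/{2, 2}, /*strides=*/{2, 2},
               /*paddings=*/{0, 0},
               paddle::operators::math::MaxPool<float>(), &output);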
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

+#include <algorithm>
+#include <vector>
#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
namespace operators {
@@ -47,11 +49,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
    T ele = pool_process.initial();
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
-        pool_process.compute(ele, input_data[h * input_width + w]);
+        pool_process.compute(input_data[h * input_width + w], &ele);
      }
    }
    int pool_size = (hend - hstart) * (wend - wstart);
-    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    pool_process.finalize(static_cast<T>(pool_size), &ele);
    output_data[index] = ele;
  }
}
@@ -96,8 +98,8 @@ __global__ void KernelPool2DGrad(
        int pool_size = (hend - hstart) * (wend - wstart);
        int output_sub_idx = ph * output_width + pw;
        pool_process.compute(input, output_data[output_sub_idx],
-                             output_grad[output_sub_idx], gradient,
-                             static_cast<T>(1.0 / pool_size));
+                             output_grad[output_sub_idx],
+                             static_cast<T>(1.0 / pool_size), &gradient);
      }
    }
    input_grad[index] = gradient;
@@ -158,9 +160,10 @@ template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -201,9 +204,11 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* input_grad) {
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -246,8 +251,10 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
@@ -340,12 +347,12 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data,
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          pool_process.compute(
-              ele, input_data[(d * input_height + h) * input_width + w]);
+              input_data[(d * input_height + h) * input_width + w], &ele);
        }
      }
    }
    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-    pool_process.finalize(ele, static_cast<T>(pool_size));
+    pool_process.finalize(static_cast<T>(pool_size), &ele);
    output_data[index] = ele;
  }
}
@@ -405,8 +412,8 @@ __global__ void KernelPool3DGrad(
          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
          pool_process.compute(input, output_data[output_sub_idx],
-                               output_grad[output_sub_idx], gradient,
-                               static_cast<T>(1.0 / pool_size));
+                               output_grad[output_sub_idx],
+                               static_cast<T>(1.0 / pool_size), &gradient);
        }
      }
    }
@@ -474,9 +481,10 @@ template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
@@ -525,9 +533,11 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* input_grad) {
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
@@ -578,8 +588,10 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
@@ -736,9 +748,10 @@ template <typename T1, typename T2>
class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -779,8 +792,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_channels = input_grad->dims()[1];
@@ -937,9 +951,10 @@ template <typename T1, typename T2>
class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
@@ -987,8 +1002,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_channels = input_grad->dims()[1];
...
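Reviewer note: on the backward path the kernels now pass the scale before the output pointer. For average pooling the scale is 1.0 / pool_size, so every input position in a window receives an equal share of the upstream gradient. A small illustration of that rule using the functor from pooling.h below; the values are invented, and this assumes a CPU-only build where the DEVICE qualifier expands to nothing so the functor is callable from host code.

paddle::operators::math::AvgPoolGrad<float> avg_grad;
float dx = 0.0f;
const int pool_size = 4;  // e.g. a 2x2 pooling window
// AvgPoolGrad ignores x and y; only dy and scale contribute to *dx.
avg_grad.compute(/*x=*/0.0f, /*y=*/0.0f, /*dy=*/8.0f,
                 /*scale=*/1.0f / pool_size, &dx);
// dx is now 2.0f: the window's gradient of 8 split evenly across four inputs.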
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
+#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
@@ -23,8 +24,8 @@ namespace operators {
namespace math {

#define FLT_MAX \
-  __FLT_MAX__  // It might need to be placed in another file, but I'm still
-               // wondering where to put it.
+  __FLT_MAX__  // TODO(zcd) :It might need to be placed in another file, but I'm
+               // still wondering where to put it.

/*
 * \brief Extracting simple operations from pooling.
@@ -40,33 +41,33 @@ template <class T>
class MaxPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
-  DEVICE inline void finalize(T& y, const T& pool_field) {}
+  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  DEVICE inline void finalize(const T& pool_field, T* y) {}
};

template <class T>
class AvgPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(0); }
-  DEVICE inline void compute(T& y, const T& x) { y += x; }
-  DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; }
+  DEVICE inline void compute(const T& x, T* y) { *y += x; }
+  DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; }
};

template <class T>
class MaxPoolGrad {
 public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
-                             T scale) {
-    dx += dy * (x == y);
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
+                             T* dx) {
+    *dx += dy * (x == y);
  }
};

template <class T>
class AvgPoolGrad {
 public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
-                             T scale) {
-    dx += (scale * dy);
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
+                             T* dx) {
+    *dx += (scale * dy);
  }
};
@@ -88,8 +89,9 @@ template <typename DeviceContext, typename PoolProcess, typename T>
class Pool2dFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
                  framework::Tensor* output);
};
@@ -98,9 +100,11 @@ class Pool2dGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute, framework::Tensor* input_grad);
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* input_grad);
};

template <typename DeviceContext, class T>
@@ -108,8 +112,10 @@ class MaxPool2dGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad);
};
@@ -117,8 +123,9 @@ template <typename DeviceContext, typename PoolProcess, typename T>
class Pool3dFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
                  framework::Tensor* output);
};
@@ -127,9 +134,11 @@ class Pool3dGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute, framework::Tensor* input_grad);
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* input_grad);
};

template <typename DeviceContext, class T>
@@ -137,8 +146,10 @@ class MaxPool3dGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad);
};
@@ -153,8 +164,9 @@ template <typename DeviceContext, typename T1, typename T2>
class MaxPool2dWithIndexFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, framework::Tensor* output,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
                  framework::Tensor* mask);
};
@@ -163,8 +175,9 @@ class MaxPool2dWithIndexGradFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad);
};
@@ -172,8 +185,9 @@ template <typename DeviceContext, typename T1, typename T2>
class MaxPool3dWithIndexFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, framework::Tensor* output,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
                  framework::Tensor* mask);
};
@@ -182,8 +196,9 @@ class MaxPool3dWithIndexGradFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad);
};
...
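Reviewer note: the rewritten primitives keep the same three-step protocol; only the mutable state now travels through an explicit pointer instead of a non-const reference. A minimal sketch of how one pooling window is reduced with the classes above (buf and n are hypothetical, with the same caveat as before about calling DEVICE-qualified functions from host code):

paddle::operators::math::AvgPool<float> avg;
float ele = avg.initial();                  // 0 for average pooling
for (int i = 0; i < n; ++i) {
  avg.compute(buf[i], &ele);                // accumulate: *y += x
}
avg.finalize(static_cast<float>(n), &ele);  // divide by the pool size
// For MaxPool<float>: initial() is -FLT_MAX, compute() keeps the running
// maximum, and finalize() is a no-op.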
@@ -17,7 +17,7 @@ limitations under the License. */

#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
namespace operators {
...
@@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

+#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"

TEST(selected_rows_functor, gpu_add) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
  int64_t height = 10;
  int64_t row_numel = 10;

  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
  auto* in1_value = selected_rows1->mutable_value();
  in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
  functor(ctx, in1_value, 1.0);

  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
  auto* in2_value = selected_rows2->mutable_value();
  in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
  functor(ctx, in2_value, 2.0);

-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
  auto* out_value = output->mutable_value();

-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);

-  SelectedRowsAdd<CUDADeviceContext, float> add_functor;
+  paddle::operators::math::SelectedRowsAdd<paddle::platform::CUDADeviceContext,
+                                           float>
+      add_functor;
  add_functor(ctx, *selected_rows1, *selected_rows2, output.get());

  auto out_height = output->height();
@@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) {
  EXPECT_EQ(out_rows[5], 7);
  EXPECT_EQ(out_rows[6], 9);

-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
  ctx.Wait();

  auto* out_cpu_data = out_cpu.data<float>();
@@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) {
  EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
  EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);

-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
  functor(ctx, tensor1.get(), 3.0);

-  std::unique_ptr<Tensor> tensor2{new Tensor()};
-  tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor2{
+      new paddle::framework::Tensor()};
+  tensor2->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);

-  SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor;
+  paddle::operators::math::SelectedRowsAddTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_tensor_functor;
  add_tensor_functor(ctx, *output, *tensor1, tensor2.get());

-  Tensor tensor2_cpu;
-  TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
+  paddle::framework::Tensor tensor2_cpu;
+  paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
  ctx.Wait();

  auto* tensor2_cpu_data = tensor2_cpu.data<float>();
@@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) {
}

TEST(selected_rows_functor, gpu_add_to) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
  int64_t height = 10;
  int64_t row_numel = 10;

  std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
  auto* in1_value = selected_rows1->mutable_value();
  in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
  functor(ctx, in1_value, 1.0);

  std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
  auto* in2_value = selected_rows2->mutable_value();
  in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
  functor(ctx, in2_value, 2.0);

-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
  output->set_height(height);
  auto* out_value = output->mutable_value();

-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);

-  SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor;
+  paddle::operators::math::SelectedRowsAddTo<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_functor;
  add_to_functor(ctx, *selected_rows1, 0, output.get());
  add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
@@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) {
EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[5], 7);
EXPECT_EQ(out_rows[6], 9); EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu; paddle::framework::Tensor out_cpu;
TensorCopy(*out_value, cpu_place, ctx, &out_cpu); paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait(); ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>(); auto* out_cpu_data = out_cpu.data<float>();
...@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) { ...@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) {
EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
std::unique_ptr<Tensor> tensor1{new Tensor()}; std::unique_ptr<paddle::framework::Tensor> tensor1{
tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place); new paddle::framework::Tensor()};
tensor1->mutable_data<float>(
paddle::framework::make_ddim({height, row_numel}), gpu_place);
functor(ctx, tensor1.get(), 3.0); functor(ctx, tensor1.get(), 3.0);
SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor; paddle::operators::math::SelectedRowsAddToTensor<
paddle::platform::CUDADeviceContext, float>
add_to_tensor_functor;
add_to_tensor_functor(ctx, *output, tensor1.get()); add_to_tensor_functor(ctx, *output, tensor1.get());
Tensor tensor1_cpu; paddle::framework::Tensor tensor1_cpu;
TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu); paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
ctx.Wait(); ctx.Wait();
auto* tensor1_cpu_data = tensor1_cpu.data<float>(); auto* tensor1_cpu_data = tensor1_cpu.data<float>();
......
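The hunks above replace the tests' file-level using namespace directives with fully qualified names. A minimal sketch of the kind of name collision this style avoids when several namespaces each define a same-named type (the namespaces and types below are illustrative, not Paddle's):

#include <iostream>

namespace framework { struct Tensor { int id = 1; }; }
namespace platform  { struct Tensor { int id = 2; }; }

int main() {
  using namespace framework;
  using namespace platform;
  // Tensor t;           // error: ambiguous between the two Tensor types
  framework::Tensor t;   // qualifying the name resolves it, as the tests now do
  std::cout << t.id << "\n";
  return 0;
}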
@@ -22,7 +22,7 @@ template <typename T>
 class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& seq, framework::Tensor& padding,
+                  const framework::LoDTensor& seq, framework::Tensor* padding,
                   bool norm_by_times) {
     auto lod = seq.lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
@@ -37,7 +37,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                       "The first dimension of LoDTensor seq should be "
                       "equal to the sum of all sequences's length.");

-    auto padding_dims = padding.dims();
+    auto padding_dims = padding->dims();
     PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
                       "The input padding should be a 3-D Tensor of shape "
                       "[max_sequence_length, num_sequences, sequence_width].");
@@ -58,7 +58,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                       "width of sequence in LoDTensor seq.");

     const T* seq_data = seq.data<T>();
-    T* padding_data = padding.data<T>();
+    T* padding_data = padding->data<T>();
     for (int64_t i = 0; i < max_sequence_length; ++i) {
       for (int64_t j = 0; j < num_sequences; ++j) {
         int64_t start_pos = abs_offset_lod[level][j];
@@ -84,16 +84,16 @@ template <typename T>
 class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  framework::LoDTensor& seq, const framework::Tensor& padding,
+                  framework::LoDTensor* seq, const framework::Tensor& padding,
                   bool norm_by_times) {
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
                       "The LoD of LoDTensor seq should not be null.");
     const size_t level = 0;
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);

-    auto seq_dims = seq.dims();
+    auto seq_dims = seq->dims();
     PADDLE_ENFORCE_EQ(seq_dims[0],
                       static_cast<int64_t>(abs_offset_lod[level].back()),
                       "The first dimension of LoDTensor seq should be "
@@ -114,13 +114,13 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                       "The second dimension of Tensor padding should be "
                       "the number of sequences in LoDTensor seq.");

-    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq->numel() / seq_dims[0];
     PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                       "The third dimension of Tensor padding should be the "
                       "width of sequence in LoDTensor seq.");

     const T* padding_data = padding.data<T>();
-    T* seq_data = seq.data<T>();
+    T* seq_data = seq->data<T>();
     for (int64_t i = 0; i < num_sequences; ++i) {
       int64_t start_pos = abs_offset_lod[level][i];
       int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
 #include "paddle/fluid/operators/math/sequence_padding.h"

 namespace paddle {
@@ -61,7 +62,7 @@ template <typename T>
 class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq, framework::Tensor& padding,
+                  const framework::LoDTensor& seq, framework::Tensor* padding,
                   bool norm_by_times) {
     auto lod = seq.lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
@@ -76,7 +77,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "The first dimension of LoDTensor seq should be "
                       "equal to the sum of all sequences's length.");

-    auto padding_dims = padding.dims();
+    auto padding_dims = padding->dims();
     PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
                       "The input padding should be a 3-D Tensor of shape "
                       "[max_sequence_length, num_sequences, sequence_width].");
@@ -97,8 +98,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "width of sequence in LoDTensor seq.");

     if (!norm_by_times && num_sequences == 1UL) {
-      TensorCopy(seq, context.GetPlace(), context, &padding);
-      padding.Resize(padding_dims);
+      TensorCopy(seq, context.GetPlace(), context, padding);
+      padding->Resize(padding_dims);
       return;
     }
@@ -117,7 +118,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(grid_dim_x, grid_dim_y);

     const T* seq_data = seq.data<T>();
-    T* padding_data = padding.data<T>();
+    T* padding_data = padding->data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
           padding_data, const_cast<T*>(seq_data),
@@ -136,16 +137,16 @@ template <typename T>
 class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const framework::Tensor& padding,
+                  framework::LoDTensor* seq, const framework::Tensor& padding,
                   bool norm_by_times) {
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
                       "The lod of LoDTensor seq should not be null.");
     const size_t level = 0;
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);

-    auto seq_dims = seq.dims();
+    auto seq_dims = seq->dims();
     PADDLE_ENFORCE_EQ(seq_dims[0],
                       static_cast<int64_t>(abs_offset_lod[level].back()),
                       "The first dimension of LoDTensor seq should be "
@@ -166,14 +167,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "The second dimension of Tensor padding should be "
                       "the number of sequences in LoDTensor seq.");

-    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq->numel() / seq_dims[0];
     PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                       "The third dimension of Tensor padding should be the "
                       "width of sequence in LoDTensor seq.");

     if (!norm_by_times && num_sequences == 1UL) {
-      TensorCopy(padding, context.GetPlace(), context, &seq);
-      seq.Resize(seq_dims);
+      TensorCopy(padding, context.GetPlace(), context, seq);
+      seq->Resize(seq_dims);
       return;
     }
@@ -192,7 +193,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(grid_dim_x, grid_dim_y);

     const T* padding_data = padding.data<T>();
-    T* seq_data = seq.data<T>();
+    T* seq_data = seq->data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
           const_cast<T*>(padding_data), seq_data,
...
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <algorithm>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -64,13 +65,13 @@ template <typename DeviceContext, typename T>
 class PaddingLoDTensorFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
-                  framework::Tensor& padding, bool norm_by_times);
+                  framework::Tensor* padding, bool norm_by_times);
 };

 template <typename DeviceContext, typename T>
 class UnpaddingLoDTensorFunctor {
  public:
-  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
+  void operator()(const DeviceContext& context, framework::LoDTensor* seq,
                   const framework::Tensor& padding, bool norm_by_times);
 };
...
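The signature change above adopts the convention of taking read-only inputs by const reference and mutable outputs by pointer, so mutation is explicit at every call site. A minimal sketch of the convention with illustrative names (not the Paddle API):

#include <vector>

// Inputs by const reference, outputs by pointer: the caller must write
// PadRow(seq, &padding), which makes the mutation visible at the call site.
void PadRow(const std::vector<float>& seq, std::vector<float>* padding) {
  padding->assign(seq.begin(), seq.end());
  padding->resize(8, 0.0f);  // pad to a fixed length of 8 with zeros
}

int main() {
  std::vector<float> seq{1.0f, 2.0f, 3.0f};
  std::vector<float> padding;
  PadRow(seq, &padding);  // padding == {1, 2, 3, 0, 0, 0, 0, 0}
  return 0;
}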
@@ -54,12 +54,12 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
                              static_cast<int64_t>(sequence_width)});
     padding.mutable_data<T>(padding_dims, *place);
     paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-        *context, seq, padding, false);
+        *context, seq, &padding, false);

     seq_back.set_lod(lod);
     seq_back.mutable_data<T>(seq_dims, *place);
     paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        *context, seq_back, padding, false);
+        *context, &seq_back, padding, false);

     if (paddle::platform::is_cpu_place(*place)) {
       cpu_seq_back = seq_back;
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_pooling.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/unpooling.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -15,7 +15,7 @@ limitations under the License. */

 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/vol2col.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -204,6 +204,8 @@ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -18,6 +18,8 @@ limitations under the License. */
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

 REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                        ops::MulKernel<plat::CUDADeviceContext, double>,
                         ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(mul_grad,
-                        ops::MulGradKernel<plat::CUDADeviceContext, float>);
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>,
+                        ops::MulGradKernel<plat::CUDADeviceContext, double>);
@@ -13,7 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/operators/one_hot_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/roi_pool_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -14,7 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/row_conv_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
@@ -220,7 +220,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
     for (int offset = 16; offset > 0;
          offset = offset / 2) {  // blockDim.x is 32.
-      val += __shfl_down(val, offset);
+      val += platform::__shfl_down_sync(0, val, offset);
     }
     __syncthreads();
@@ -276,7 +276,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
     for (int offset = 16; offset > 0;
          offset = offset / 2) {  // blockDim.x is 32.
-      val += __shfl_down(val, offset);
+      val += platform::__shfl_down_sync(0, val, offset);
     }
     __syncthreads();
...
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"

 USE_NO_KERNEL_OP(save);
 USE_NO_KERNEL_OP(load);
@@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) {
     }
   }
 }
TEST(SaveLoadFP16Op, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({3, 10});
float* expect = tensor->mutable_data<float>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<float>(paddle::platform::float16(i));
}
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("tensor.save")});
attrs.insert({"save_as_fp16", true});
auto save_op = paddle::framework::OpRegistry::CreateOp(
"save", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, place);
auto load_var = scope.Var("out_var");
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp(
"load", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, place);
paddle::platform::float16* actual = target->data<paddle::platform::float16>();
for (int64_t i = 0; i < tensor->numel(); ++i) {
EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
}
}
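A note on why the bit-exact EXPECT_EQ in the new SaveLoadFP16Op test is safe: IEEE half precision carries an 11-bit significand (10 stored bits plus the implicit leading bit), so every integer $i$ with

$$|i| \le 2^{11} = 2048$$

is exactly representable in float16. The test fills the tensor with $i \in \{0, \dots, 29\}$, so the float32 -> float16 -> float32 round trip is lossless and exact equality is a sound assertion.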
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <numeric>

 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
+    auto save_as_fp16 = Attr<bool>("save_as_fp16");

     if (FileExists(filename) && !overwrite) {
       PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
@@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);

-    framework::SerializeToStream(fout, tensor, dev_ctx);
+    auto in_dtype = framework::ToDataType(tensor.type());
+    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor out;
+      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+      framework::SerializeToStream(fout, out, dev_ctx);
+    } else {
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
   }
 };
@@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk.
                   "(boolean, default true)"
                   "Overwrite the output file if exist")
         .SetDefault(true);
+    AddAttr<bool>("save_as_fp16",
+                  "(boolean, default false)"
+                  "If true, the tensor will be converted to float16 data "
+                  "type and then saved. Otherwise, the tensor will be "
+                  "directly saved without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string)"
                          "The \"file_path\" where the variable will be saved.")
...
@@ -113,7 +113,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
   op->SetAttrMap(attrs);
 }

-void StartServerNet(bool is_sparse) {
+void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
   f::Scope scope;
   p::CPUPlace place;
   if (is_sparse) {
@@ -121,7 +121,6 @@ void StartServerNet(bool is_sparse) {
   } else {
     InitTensorsInScope(place, &scope);
   }
   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
   const auto &root_block = program.Block(0);
@@ -129,7 +128,6 @@ void StartServerNet(bool is_sparse) {
   auto *prefetch_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensors, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
   f::AttributeMap attrs;
   attrs.insert({"endpoint", std::string("127.0.0.1:0")});
   attrs.insert({"Fanin", 1});
@@ -141,12 +139,16 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"sync_mode", true});
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
+  *initialized = true;
   listen_and_serv_op->Run(scope, place);
   LOG(INFO) << "server exit";
 }

 TEST(SendRecvOp, CPUDense) {
-  std::thread server_thread(StartServerNet, false);
+  std::atomic<bool> initialized{false};
+  std::thread server_thread(StartServerNet, false, &initialized);
+  while (!initialized) {
+  }
   sleep(5);  // wait server to start
   // local net
   f::Scope scope;
@@ -156,9 +158,11 @@ TEST(SendRecvOp, CPUDense) {
   scope.Var("RPC_CLIENT_VAR");

   f::AttributeMap attrs;
-  selected_port = static_cast<paddle::operators::ListenAndServOp *>(
-                      listen_and_serv_op.get())
-                      ->GetSelectedPort();
+  auto *listen_and_serv_op_ptr =
+      static_cast<paddle::operators::ListenAndServOp *>(
+          listen_and_serv_op.get());
+  ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
+  selected_port = listen_and_serv_op_ptr->GetSelectedPort();
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
@@ -184,8 +188,12 @@ TEST(SendRecvOp, CPUDense) {
 }

 TEST(SendRecvOp, CPUSparse) {
-  std::thread server_thread(StartServerNet, true);
-  sleep(3);  // wait server to start
+  std::atomic<bool> initialized;
+  initialized = false;
+  std::thread server_thread(StartServerNet, true, &initialized);
+  while (!initialized) {
+  }
+  sleep(5);  // wait server to start
   // local net
   f::Scope scope;
   p::CPUPlace place;
@@ -193,9 +201,11 @@ TEST(SendRecvOp, CPUSparse) {
   InitSelectedRowsInScope(place, &scope);
   scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
-  selected_port = static_cast<paddle::operators::ListenAndServOp *>(
-                      listen_and_serv_op.get())
-                      ->GetSelectedPort();
+  auto *listen_and_serv_op_ptr =
+      static_cast<paddle::operators::ListenAndServOp *>(
+          listen_and_serv_op.get());
+  ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
+  selected_port = listen_and_serv_op_ptr->GetSelectedPort();
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
...
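The test changes above replace a bare sleep with an explicit readiness handshake. A self-contained sketch of the pattern, with no Paddle types involved:

#include <atomic>
#include <iostream>
#include <thread>

// The server thread flips an atomic flag once its setup is done; the client
// spins until the flag is set before talking to it. The tests above still
// sleep() afterwards because the blocking Run() needs extra time to bind its
// port after construction.
int main() {
  std::atomic<bool> initialized{false};
  std::thread server([&initialized] {
    // ... construct the server (CreateOp etc.) ...
    initialized = true;
    // ... blocking serve loop would run here ...
  });
  while (!initialized) {
  }
  std::cout << "server constructed, safe to query it\n";
  server.join();
  return 0;
}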
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include "paddle/fluid/operators/sequence_erase_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -14,7 +14,7 @@ limitations under the License. */

 #include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -48,6 +48,24 @@ class SGDOp : public framework::OperatorWithKernel {
   }
 };
class SGDOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto input_var = op_desc.Input("Param")[0];
for (auto& out_var : op_desc.Output("ParamOut")) {
if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::SELECTED_ROWS);
} else {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::LOD_TENSOR);
}
}
}
};
 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -74,5 +92,6 @@ $$param\_out = param - learning\_rate * grad$$
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker);
+REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker,
+                  paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType);
 REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
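SGDOpInferVarType, registered above, boils down to a single propagation rule: ParamOut keeps Param's variable type, so sparse parameters stay sparse. A distilled, self-contained version of that rule (the enum is illustrative, not Paddle's proto definition):

#include <cassert>

enum class VarType { LOD_TENSOR, SELECTED_ROWS };

// ParamOut mirrors Param's variable type: SELECTED_ROWS stays SELECTED_ROWS,
// anything else is treated as a dense LOD_TENSOR.
VarType InferParamOutType(VarType param_type) {
  return param_type == VarType::SELECTED_ROWS ? VarType::SELECTED_ROWS
                                              : VarType::LOD_TENSOR;
}

int main() {
  assert(InferParamOutType(VarType::SELECTED_ROWS) == VarType::SELECTED_ROWS);
  assert(InferParamOutType(VarType::LOD_TENSOR) == VarType::LOD_TENSOR);
  return 0;
}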
@@ -14,7 +14,7 @@ limitations under the License. */

 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sgd_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
...
@@ -116,11 +116,31 @@ uniform distribution.
         .SetDefault(framework::proto::VarType::FP32);
   }
 };
class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("Out").front();
if (block->FindRecursiveOrCreateVar(out_var_name).GetType() ==
framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var_name)
.SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
block->FindRecursiveOrCreateVar(out_var_name)
.SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp,
-                             paddle::operators::UniformRandomOpMaker);
+REGISTER_OPERATOR(uniform_random, paddle::operators::UniformRandomOp,
+                  paddle::operators::UniformRandomOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::UniformRandomOpVarTypeInference);

 REGISTER_OP_CPU_KERNEL(uniform_random,
                        paddle::operators::CPUUniformRandomKernel<float>,
                        paddle::operators::CPUUniformRandomKernel<double>);
...
@@ -162,7 +162,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
                                 static_cast<int64_t>(sequence_width)});
     warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
     math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *logits, warpctc_logits,
+        ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
         false);
     const T* warpctc_logits_data = warpctc_logits.data<T>();
@@ -217,7 +217,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
     math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *logits_grad,
+        ctx.template device_context<DeviceContext>(), logits_grad,
         *warpctc_grad, norm_by_times);

     const T* loss_grad_data = loss_grad->data<T>();
...
@@ -66,5 +66,18 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 }
 #endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
 }  // namespace platform
 }  // namespace paddle
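For reference, here is a minimal, self-contained kernel using the CUDA 9 sync-shuffle path that the compatibility shim above selects. This is plain CUDA, independent of the Paddle wrappers:

#include <cuda_runtime.h>
#include <cstdio>

// Warp-level sum over 32 lanes with __shfl_down_sync; lane 0 ends up with
// the total. __ballot_sync builds the participation mask (full warp here).
__global__ void WarpSum(const float* in, float* out) {
  float val = in[threadIdx.x];
  unsigned mask = __ballot_sync(0xFFFFFFFFu, true);
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);
  }
  if (threadIdx.x == 0) *out = val;
}

int main() {
  float h_in[32], *d_in, *d_out, h_out = 0.f;
  for (int i = 0; i < 32; ++i) h_in[i] = 1.f;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  WarpSum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("warp sum = %f\n", h_out);  // expect 32.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}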
@@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle.
              const std::unordered_set<std::string> &bcast_vars,
              const ProgramDesc &main_program, const std::string &loss_var_name,
              Scope *scope, std::vector<Scope *> &local_scopes,
-             bool allow_op_delay, bool customize_loss_grad) {
-            new (&self) ParallelExecutor(num_threads, use_event, places,
-                                         params, bcast_vars, main_program,
-                                         loss_var_name, scope, local_scopes,
-                                         allow_op_delay, customize_loss_grad);
+             bool allow_op_delay, bool use_default_grad_scale) {
+            new (&self) ParallelExecutor(
+                num_threads, use_event, places, params, bcast_vars,
+                main_program, loss_var_name, scope, local_scopes,
+                allow_op_delay, use_default_grad_scale);
           })
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
...
@@ -107,7 +107,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
     return self.data<T>()[offset];
   } else {
     std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
+    framework::TensorCopySync(self, platform::CPUPlace(), dst.get());
     return dst->data<T>()[offset];
   }
 }
@@ -117,9 +117,9 @@ template <typename T>
 void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
   if (platform::is_gpu_place(self->place())) {
     std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-    framework::TensorCopy(*self, platform::CPUPlace(), dst.get());
+    framework::TensorCopySync(*self, platform::CPUPlace(), dst.get());
     dst->data<T>()[offset] = elem;
-    framework::TensorCopy(*dst.get(), self->place(), self);
+    framework::TensorCopySync(*dst.get(), self->place(), self);
   } else if (platform::is_cpu_place(self->place())) {
     self->data<T>()[offset] = elem;
...
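The switch to TensorCopySync above matters because a device-to-host copy issued on a stream is asynchronous: dereferencing the host buffer before the stream drains can read stale data. A plain-CUDA sketch of the problem being avoided (illustrative, not the Paddle implementation):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  float h = 0.f, *d;
  float v = 42.f;
  cudaMalloc(&d, sizeof(float));
  cudaMemcpy(d, &v, sizeof(float), cudaMemcpyHostToDevice);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Async device-to-host copy: reading h immediately after this call would
  // race with the transfer...
  cudaMemcpyAsync(&h, d, sizeof(float), cudaMemcpyDeviceToHost, stream);
  // ...so synchronize first, which is exactly what the *Sync variant
  // guarantees before handing data back to Python.
  cudaStreamSynchronize(stream);
  std::printf("%f\n", h);  // 42.0
  cudaFree(d);
  cudaStreamDestroy(stream);
  return 0;
}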
@@ -155,7 +155,7 @@ EOF
 function gen_dockerfile() {
   # Set BASE_IMAGE according to env variables
   if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04"
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
   else
     BASE_IMAGE="ubuntu:16.04"
   fi
...
@@ -208,8 +208,8 @@ EOF
       --platform=android-$ANDROID_API \
       --install-dir=$ANDROID_STANDALONE_TOOLCHAIN

-  BUILD_ROOT=${PADDLE_ROOT}/build
-  DEST_ROOT={PADDLE_ROOT}/install
+  BUILD_ROOT=${PADDLE_ROOT}/build_android
+  DEST_ROOT=${PADDLE_ROOT}/install_android
   mkdir -p $BUILD_ROOT
   cd $BUILD_ROOT
@@ -349,13 +349,18 @@ function gen_docs() {
     ========================================
 EOF
     cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
         -DWITH_DOC=ON \
         -DWITH_GPU=OFF \
-        -DWITH_AVX=${WITH_AVX:-ON} \
-        -DWITH_SWIG_PY=ON \
+        -DWITH_MKL=OFF \
         -DWITH_STYLE_CHECK=OFF

     make -j `nproc` paddle_docs paddle_apis

+    # check websites for broken links
+    linkchecker doc/v2/en/html/index.html
+    linkchecker doc/v2/cn/html/index.html
+    linkchecker doc/v2/api/en/html/index.html
 }

 function gen_html() {
...
@@ -28,11 +28,16 @@ function start_build_docker() {
     docker rm -f "${CONTAINER_ID}" 1>/dev/null
   fi

+  apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
   DOCKER_ENV=$(cat <<EOL
     -e FLAGS_fraction_of_gpu_memory_to_use=0.15 \
     -e CTEST_OUTPUT_ON_FAILURE=1 \
     -e CTEST_PARALLEL_LEVEL=5 \
+    -e APT_MIRROR=${apt_mirror} \
     -e WITH_GPU=ON \
+    -e CUDA_ARCH_NAME=Auto \
+    -e WITH_AVX=ON \
+    -e WITH_GOLANG=OFF \
     -e WITH_TESTING=ON \
     -e WITH_C_API=OFF \
     -e WITH_COVERAGE=ON \
@@ -42,18 +47,23 @@ function start_build_docker() {
     -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
     -e CUDA_VISIBLE_DEVICES=0,1 \
     -e WITH_DISTRIBUTE=ON \
+    -e WITH_FLUID_ONLY=ON \
     -e RUN_TEST=ON
EOL
)
+  DOCKER_CMD="nvidia-docker"
+  if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then
+    DOCKER_CMD="docker"
+  fi
   set -x
-  nvidia-docker run -it \
-      -d \
+  ${DOCKER_CMD} run -it \
       --name $CONTAINER_ID \
       ${DOCKER_ENV} \
       -v $PADDLE_ROOT:/paddle \
       -w /paddle \
       $IMG \
-      /bin/bash
+      paddle/scripts/paddle_build.sh $@
   set +x
 }
@@ -67,24 +77,7 @@ function main() {
     VERSION="latest-dev-android"
   fi
   IMG=${DOCKER_REPO}:${VERSION}
+  start_build_docker $@
-
-  case $1 in
-    start)
-      start_build_docker
-      ;;
-    build_android)
-      start_build_docker
-      docker exec ${CONTAINER_ID} bash -c "./paddle/scripts/paddle_build.sh $@"
-      ;;
-    *)
-      if container_running "${CONTAINER_ID}"; then
-        docker exec ${CONTAINER_ID} bash -c "./paddle/scripts/paddle_build.sh $@"
-      else
-        echo "Please start container first, with command:"
-        echo "$0 start"
-      fi
-      ;;
-  esac
 }

 main $@
@@ -20,6 +20,16 @@ from framework import *
 import executor
 from executor import *
import trainer
from trainer import Trainer
from trainer import Event
import inferencer
from inferencer import Inferencer
import params
from params import Params
 import io
 import evaluator
 import initializer
@@ -47,7 +57,8 @@ from parallel_executor import ParallelExecutor

 Tensor = LoDTensor

-__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
+__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
+    trainer.__all__ + inferencer.__all__ + params.__all__ + [
     'io',
     'initializer',
     'layers',
@@ -111,7 +122,9 @@ def __bootstrap__():
         'eager_delete_scope'
     ]
     if core.is_compiled_with_cuda():
-        read_env_flags += ['fraction_of_gpu_memory_to_use']
+        read_env_flags += [
+            'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune'
+        ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
...
@@ -661,7 +661,7 @@ class DistributeTranspiler:
             shape=trainer_out.shape,
             dtype=trainer_out.dtype)
         prefetch_block.append_op(
-            type=LOOKUP_TABLE_TYPE,
+            type="lookup_sparse_table",
             inputs={'Ids': pserver_ids,
                     "W": table_var},
             outputs={"Out": pserver_out},
@@ -685,9 +685,14 @@ class DistributeTranspiler:
         # STEP: create table optimize block
         # create table param and grad var in pserver program
-        param_var = _clone_var(
-            pserver_program.global_block(),
-            self.origin_program.global_block().vars[self.table_name])
+        origin_param_var = self.origin_program.global_block().vars[
+            self.table_name]
+        param_var = pserver_program.global_block().create_var(
+            name=origin_param_var.name,
+            shape=origin_param_var.shape,
+            dtype=origin_param_var.dtype,
+            type=core.VarDesc.VarType.SELECTED_ROWS,
+            persistable=True)
         grad_var = _clone_var(
             pserver_program.global_block(),
             self.origin_program.global_block().vars[framework.grad_var_name(
...
@@ -658,10 +658,10 @@ class Operator(object):
 class Block(object):
     def __init__(self, program, idx):
         self.desc = program.desc.block(idx)
-        self.vars = dict()  # var_name --> var
+        self.vars = collections.OrderedDict()  # var_name --> var
         self.ops = list()  # operator list
         self.program = program
-        self.removed_vars = dict()
+        self.removed_vars = collections.OrderedDict()

     def __str__(self):
         return self.to_string(True)
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['Inferencer', ]
class Inferencer(object):
def __init__(self, network_func, params, place=None):
# 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in test_word2vec.py
# 2. move the default_main_program to self.program.
# 3. run the default_startup program.
self.params = params
self.place = place
def infer(self, inputs):
# run self.program
pass
@@ -168,7 +168,9 @@ class ListenAndServ(object):
                 'endpoint': self.endpoint,
                 'Fanin': self.fan_in,
                 'OptimizeBlock': current_block,
-                'PrefetchBlock': empty_block
+                'PrefetchBlock': empty_block,
+                'sync_mode': True,  # async is not supported in layers yet
+                'grad_to_block_id': [""]
             })
...
@@ -193,10 +193,7 @@ def assign(input, output):
     helper = LayerHelper('assign', **locals())
     if isinstance(input, Variable):
         helper.append_op(
-            type='scale',
-            inputs={'X': [input]},
-            outputs={'Out': [output]},
-            attrs={'scale': 1.0})
+            type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
     elif isinstance(input, numpy.ndarray):
         dtype = convert_np_dtype_to_dtype_(input.dtype)
         if dtype == VarDesc.VarType.FP32:
...
@@ -30,7 +30,7 @@ class ParallelExecutor(object):
                  num_threads=None,
                  allow_op_delay=False,
                  share_vars_from=None,
-                 customize_loss_grad=False):
+                 use_default_grad_scale=True):
         """
         ParallelExecutor can run program in parallel.
@@ -46,6 +46,11 @@ class ParallelExecutor(object):
                 improve performance in some cases, default False.
             share_vars_from(ParallelExecutor, default None): If provided,
                 it will share variables from the specified ParallelExecutor.
+            use_default_grad_scale(bool, default True): If set True, a default
+                scale value equal to `1./device_count` would be multiplied to
+                gradients of each device and scaled gradients would be
+                aggregated. Otherwise, a customized scale value should be fed
+                to the network.

         Returns:
             A ParallelExecutor object.
@@ -124,7 +129,7 @@ class ParallelExecutor(object):
             scope,
             local_scopes,
             allow_op_delay,
-            customize_loss_grad)
+            use_default_grad_scale)
         self.scope = scope

     def run(self, fetch_list, feed=None, feed_dict=None):
...
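A quick check of the use_default_grad_scale semantics documented above: with $N$ devices and per-device gradients $g_i$, seeding each device's backward pass with the scale $1/N$ gives the aggregate

$$g = \sum_{i=1}^{N} \frac{1}{N} g_i = \frac{1}{N} \sum_{i=1}^{N} g_i,$$

i.e. the averaged gradient, matching what a single device would compute on the combined batch. Passing use_default_grad_scale=False leaves that scaling to the user's network instead.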
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import core
__all__ = ['Params', ]
class Params(object):
def __init__(self, path=None):
self.scope = core.Scope()
if path:
self._load(path)
def _load(self, path):
# reference: load_persistables in io.py
pass
def save(self, path):
# reference: save_persistables in io.py
pass
def add_params(self, scope):
# take the keys from the scope,
# if not already exists in self.scope,
# add the key and value into self.scope.
pass
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
def resnet_cifar10(input, depth=32):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
def shortcut(input, ch_in, ch_out, stride):
if ch_in != ch_out:
return conv_bn_layer(input, ch_out, 1, stride, 0, None)
else:
return input
def basicblock(input, ch_in, ch_out, stride):
tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
short = shortcut(input, ch_in, ch_out, stride)
return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
def layer_warp(block_func, input, ch_in, ch_out, count, stride):
tmp = block_func(input, ch_in, ch_out, stride)
for i in range(1, count):
tmp = block_func(tmp, ch_out, ch_out, 1)
return tmp
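    # A CIFAR-10 ResNet has depth = 6n + 2 weight layers: three stages of n
    # basic blocks (two 3x3 convs each), plus the initial conv and the
    # classifier fc added in inference_network.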
assert (depth - 2) % 6 == 0
    n = (depth - 2) // 6  # integer division so range(1, count) works on Python 3
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
return pool
def inference_network():
classdim = 10
data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net = resnet_cifar10(images, 32)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
return predict
def train_network():
predict = inference_network()
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
return avg_cost, accuracy
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost, accuracy = trainer.test(reader=test_reader)
                print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
                    event.batch_id + 1, avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
trainer.params.save(save_path)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_network,
optimizer=fluid.optimizer.Adam(learning_rate=0.001),
place=place,
event_handler=event_handler)
trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
def infer(use_cuda, save_path):
params = fluid.Params(save_path)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(inference_network, params, place=place)
# The input's dimension of conv should be 4-D or 5-D.
    # Use normalized image pixels as input data, which should be in the range
# [0, 1.0].
tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
results = inferencer.infer({'pixel': tensor_img})
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_resnet.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
def vgg16_bn_drop(input):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
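    # Five blocks with 2+2+3+3+3 = 13 conv layers; together with fc1, fc2
    # below and the classifier fc in inference_network, this is the VGG-16
    # layout.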
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
def inference_network():
classdim = 10
data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
return predict
def train_network():
predict = inference_network()
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
return avg_cost, accuracy
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost, accuracy = trainer.test(reader=test_reader)
                print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
                    event.batch_id + 1, avg_cost, accuracy))
if accuracy > 0.01: # Low threshold for speeding up CI
trainer.params.save(save_path)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_network,
optimizer=fluid.optimizer.Adam(learning_rate=0.001),
place=place,
event_handler=event_handler)
trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
def infer(use_cuda, save_path):
params = fluid.Params(save_path)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(inference_network, params, place=place)
# The input's dimension of conv should be 4-D or 5-D.
    # Use normalized image pixels as input data, which should be in the range
# [0, 1.0].
tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
results = inferencer.infer({'pixel': tensor_img})
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_vgg.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
...@@ -275,10 +275,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
class TestBatchNormOpTraining(unittest.TestCase):
    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
        np.allclose(np.array(tensor), np_array, atol=atol)
        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, data_layout, shape): def test_with_place(place, data_layout, shape):
......
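For reference, a standalone sketch of the assertion idiom the cleaned-up helper relies on (plain NumPy and unittest, independent of this test file):

import unittest

import numpy as np


class ClosenessExample(unittest.TestCase):
    def test_close(self):
        a = np.array([1.0, 2.0, 3.0])
        b = a + 5e-5  # well inside the atol=1e-4 tolerance used above
        self.assertTrue(np.allclose(a, b, atol=1e-4), "arrays diverged")


if __name__ == '__main__':
    unittest.main()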
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
def output_hist(out):
hist, _ = np.histogram(out, range=(-5, 10))
hist = hist.astype("float32")
hist /= float(out.size)
prob = 0.1 * np.ones((10))
return hist, prob
class TestLookupSparseTable(OpTest):
def check_with_place(self, place):
scope = core.Scope()
# create and initialize Id Variable
ids = scope.var("Ids").get_tensor()
ids_array = np.array([0, 2, 3, 5, 100]).astype("int64")
ids.set(ids_array, place)
# create and initialize W Variable
rows = [0, 1, 2, 3, 4, 5, 6]
row_numel = 10000
w_selected_rows = scope.var('W').get_selected_rows()
w_selected_rows.set_height(len(rows))
w_selected_rows.set_rows(rows)
w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
# create Out Variable
out_tensor = scope.var('Out').get_tensor()
# create and run lookup_table operator
lookup_table = Operator(
"lookup_sparse_table",
W='W',
Ids='Ids',
Out='Out',
min=-5.0,
max=10.0,
seed=10)
lookup_table.run(scope, place)
# get result from Out
result_array = np.array(out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array[:-2]):
assert (row == result_array[idx]).all()
# check the random value
hist, prob = output_hist(result_array[-1])
self.assertTrue(
np.allclose(
hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
def test_w_is_selected_rows(self):
places = [core.CPUPlace()]
# currently only support CPU
for place in places:
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
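A quick standalone check of the output_hist logic above (plain NumPy; the range mirrors the operator's min=-5.0, max=10.0 attributes):

import numpy as np

sample = np.random.uniform(-5, 10, size=10000).astype("float32")
hist, _ = np.histogram(sample, range=(-5, 10))  # 10 equal-width bins
hist = hist.astype("float32") / sample.size
# A uniform sample puts roughly 10% of the draws in each bin.
print(np.allclose(hist, 0.1 * np.ones(10), rtol=0, atol=0.01))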
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
'Event',
'Trainer',
]
class Event(object):
BEGIN_EPOCH = 0
END_EPOCH = 1
BEGIN_STEP = 2
END_STEP = 3
def __init__(self):
self.step = 0
self.epoch = 0
self.type = Event.BEGIN_EPOCH
class Trainer(object):
def __init__(self, network_func, optimizer, params=None, place=None):
# 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in
# test_word2vec.py
# 2. move the default_main_program to self.program and run the
# default_startup program on an empty core.Scope()
        # 3. call self.params.add_params with the initialized scope; it
        #    will add the new vars of the initialized scope into
        #    self.params.
self.network_func = network_func
self.optimizer = optimizer
self.params = params
self.place = place
# TODO(helin): support distributed training
def train(self, reader, num_epochs, event_handler):
pass
def test(self, reader):
pass
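A sketch of how this skeleton is meant to be driven, mirroring the example scripts earlier in this commit; train_network, train_reader, and the fluid import are assumed from those examples, and train() itself is still a stub:

def event_handler(event):
    if event.type == Event.END_STEP:
        print("epoch %d, step %d" % (event.epoch, event.step))

trainer = Trainer(
    network_func=train_network,
    optimizer=fluid.optimizer.Adam(learning_rate=0.001),
    params=Params())
trainer.train(reader=train_reader, num_epochs=1, event_handler=event_handler)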
...@@ -8,3 +8,4 @@ scipy>=0.19.0
Pillow
nltk>=3.2.2
graphviz
LinkChecker