diff --git a/.travis.yml b/.travis.yml index 929c847bd36d64e79a199b2634ebf68c3225429b..fe4eb2d1570fb3dbb38128400e6f438b99236c0d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ services: os: - linux env: - - JOB=build_doc + - JOB=doc - JOB=check_style - JOB=build_android addons: @@ -36,21 +36,18 @@ addons: - ccache ssh_known_hosts: 13.229.163.131 before_install: - - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python - # protobuf version. - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt - - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker + - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | # 43min timeout - if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android; - else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi; - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi; + if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi; + if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; - | - if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; + if [[ "$JOB" != "doc" ]]; then exit 0; fi; + # For document only if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh diff --git a/AUTHORS.md b/AUTHORS.md index 9c6821d9f8681c5907c2fc9938fdb62ba64b9a92..4ee05420982d13f686cf13e8957ce41dfcdd2cb8 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -2,12 +2,14 @@ |---|---| | abhinavarora | Abhinav Arora | | backyes | Yan-Fei Wang | +| baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | -| JiayiFeng | Jia-Yi Feng | | chengxiaohua1105 | Xiao-Hua Cheng | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxysteven | Xing-Yi Cheng | | dzhwinter | Zhi-Hong Dong | +| dragonwarrior | Long Wang | +| dyning | Yuning Du | | emailweixu | Wei Xu | | gangliao | Gang Liao | | gongweibao | Wei-Bao Gong | @@ -16,6 +18,9 @@ | hedaoyuan | Dao-Yuan He | | helinwang | He-Lin Wang | | jacquesqiao | Long-Fei Qiao | +| jczaja | Jacek Czaja | +| JiayiFeng | Jia-Yi Feng | +| kbinias | Krzysztof Binias | | kuke | Yi-Bing Liu | | lcy-seso | Ying Cao | | lipeng-unisound | Peng Li | @@ -24,15 +29,20 @@ | llxxxll | Yong-Feng Liu | | luotao01 | Tao Luo | | lzhao4ever | Liang Zhao | +| mozga-intel | Mateusz Ozga | | NHZlX | Zhao-Long Xing | +| Noplz | Yuan Gao | | pakchoi | Chuan-Jiang Song | +| panyx0718 | Xin Pan | | pengli09 | Peng Li | | pkuyym | Ya-Ming Yang | +| pzelazko-intel | Pawel Zelazko | | QiJune | Jun Qi | | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | Superjom | Chun-Wei Yan | | tianbingsz | Tian-Bing Xu | +| tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | | wanghaoshuang | Hao-Shuang Wang | | wangyang59 | Yang Wang | diff --git a/Dockerfile b/Dockerfile index c257dbfc2987323f8ed2a24dfffa8b3c15e09399..164fe84904947bfc3cf71132b5fba04744460b26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 
+1,6 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment - -# When you modify it, please be aware of cudnn-runtime version +# When you modify it, please be aware of cudnn-runtime version # and libcudnn.so.x in paddle/scripts/docker/build.sh FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors @@ -24,7 +23,7 @@ ENV HOME /root COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ - apt-get install -y \ + apt-get install -y --allow-downgrades \ git python-pip python-dev openssh-server bison \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ @@ -33,7 +32,7 @@ RUN apt-get update && \ automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools libtool && \ + net-tools libtool ccache && \ apt-get clean -y # Install Go and glide diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 7edc8637727e300539a46bc3941ace87c87903b8..b520c03a836a9e3f263ba050f151877ffe0d071d 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) list(APPEND CUDA_NVCC_FLAGS "-std=c++11") list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +# in cuda9, suppress cuda warning on eigen +list(APPEND CUDA_NVCC_FLAGS "-w") # Set :expt-relaxed-constexpr to suppress Eigen warnings list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 73d70c34dce8bedd9e62519c207e5be3dcf7dba3..edc93c2773f46ec9e0bf898557c55c93274e6a01 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -22,7 +22,9 @@ else() extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 + # eigen on cuda9.1 missing header of math_funtions.hpp + # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen + GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a631ad14b18310598f7eea3a51839d61a9e456ff..07e1137e16afc1e4e9ab9640e1ccaea8008a0cd2 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -38,8 +38,7 @@ ENDIF() ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" - GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 + GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md index 340bc302d57429a9bf10a9d23ed9b0cdc7a2a568..28ad6495d97515442eb8af2050158829814acd33 100644 --- a/doc/fluid/design/algorithm/parameter_average.md +++ b/doc/fluid/design/algorithm/parameter_average.md @@ -1,7 +1,7 @@ # Averaging Parameter in PaddlePaddle ## Why Averaging -In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. 
Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can. +In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible. Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. @@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti ### How to perform Parameter Averaging in PaddlePaddle Parameter Averaging in PaddlePaddle works in the following way during training : -1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer +1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer 2. The optimizer itself is responsible for updating the parameters. 3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: - 1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches. - 2. However, saving all the N instances of the parameters in memory is not feasible. + 1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches. + 2. However, saving all N instances of the parameters in memory is not feasible. 3. Therefore, an approximation algorithm is used. Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. -During the testing/ saving the model phase, we perform the following steps: +During the testing/saving the model phase, we perform the following steps: 1. Perform the delayed operations. 2. Save current values of the parameters to a temporary variable. 3. Replace the values of the parameters with the averaged values. diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 6c4f09dacb47c431db2e2610a3e61390a82dcba0..b979aa7723ea77fcae98a5be41d9b9dc488a68c3 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream; << "CUDA error: " << hl_get_device_error_string((size_t)err); \ } +// __shfl has been deprecated as of CUDA 9.0. 
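(Aside, not part of the patch: the deprecation note above is handled by the `__shfl_sync` shim and `CREATE_SHFL_MASK` macro added in the hunk that follows. A minimal sketch of the intended usage pattern, mirroring the warp-reduction code in elementwise_op_function.h and assuming either CUDA 9.0+ or the `__shfl_down_sync` wrapper that later hunks reference from cuda_primitives.h:)

```cuda
// Illustrative warp-sum helper using the mask convention from this patch.
// CREATE_SHFL_MASK expands to an empty mask before CUDA 9.0 (where the legacy
// __shfl* intrinsics ignore it) and to a __ballot_sync mask on CUDA 9.0+.
__device__ float warp_reduce_sum(float val, int tid, int len) {
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, tid < len);
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);  // shuffle partial sums down the warp
  }
  return val;  // for a full warp, lane 0 now holds the warp-wide sum
}
```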
+#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T +__shfl_sync(unsigned, T val, int src_line, int width) { + return __shfl(val, src_line, width); +} + +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + #endif /* __NVCC__ */ #endif /* HL_BASE_H_ */ diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index 21c0c26b6ef0420b1a719736a66eeb8114ed9680..e30fcddffdf99417a4b9b811a0b0cb0a12e79b99 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue, } __device__ __forceinline__ void transpose_32x32(real a[], const int idx) { - int addr = idx % 32; + const int warp_size = 32; + int addr = idx % warp_size; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, addr < warp_size); #pragma unroll for (int k = 1; k < 32; k++) { - // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32); - addr = __shfl(addr, (idx + 1) % 32, 32); - a[k] = __shfl(a[k], addr, 32); + // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32); + addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32); + a[k] = __shfl_sync(mask, a[k], addr, 32); } #pragma unroll @@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) { } addr = (32 - idx) % 32; + CREATE_SHFL_MASK(mask, idx % 32 < warp_size); #pragma unroll for (int k = 0; k < 32; k++) { - a[k] = __shfl(a[k], addr, 32); - addr = __shfl(addr, (idx + 31) % 32, 32); + a[k] = __shfl_sync(mask, a[k], addr, 32); + addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32); } } diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index fea8712a773b1524022f4bba626cf5044edebef6..59ba552f560dab904d4983e0778ff57be9477c3e 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK, if (--beamSize == 0) break; __syncthreads(); + unsigned mask = 0u; + // CREATE_SHFL_MASK(mask, tid < len); + if (tid == maxId[0]) { if (beam < maxLength) { shTopK[tid] = topK[beam]; } } if (maxId[0] / 32 == warp) { - if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break; + if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break; } } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 3413467b149539bcff42d78a9a6fe315d6558bb4..daba9bf2dd861d6366723ed71eada66bc67c78f2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, bool skip_scale_loss, + const std::vector &local_scopes, bool use_default_grad_scale, platform::NCCLContextMap *nccl_ctxs) : loss_var_name_(loss_var_name), places_(places), @@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, bool skip_scale_loss) + const std::vector &local_scopes, bool use_default_grad_scale) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes) { @@ -53,28 +53,25 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( 
for (auto &p : params) { grad_names_.insert(GradVarName(p)); } - skip_scale_loss_ = skip_scale_loss; + use_default_grad_scale_ = use_default_grad_scale; } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - const platform::Place &p, - const size_t &i) const { + size_t place_id) const { + auto p = places_[place_id]; auto *op_handle = result->ops_.back().get(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); - auto var_names = op.InputArgumentNames(); - - for (auto &each_var_name : var_names) { - VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i); + for (auto &each_var_name : op.InputArgumentNames()) { + VarHandle *var = + CreateOrGetLatestVarHandle(result, each_var_name, p, place_id); op_handle->AddInput(var); } - var_names = op.OutputArgumentNames(); - - for (auto &each_var_name : var_names) { - CreateOpOutput(result, op_handle, each_var_name, p, i); + for (auto &each_var_name : op.OutputArgumentNames()) { + CreateOpOutput(result, op_handle, each_var_name, p, place_id); } } @@ -84,17 +81,18 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op, return false; } - auto checker = [&](const std::vector opvars, - const std::vector sendvars) -> bool { - bool is_dist_train_op = false; + /** + * Check any of opvars contains `.block` and in sendvars + */ + auto checker = [](const std::vector &opvars, + const std::vector &sendvars) -> bool { for (auto &var : opvars) { if (var.find(".block") != std::string::npos && std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) { - is_dist_train_op = true; - break; + return true; } } - return is_dist_train_op; + return false; }; if (op.Type() == "split") { @@ -117,13 +115,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( places_.size()); // Find "send" op first for split is in front of send. - OpDesc *send_op = nullptr; - for (auto *op : program.Block(0).AllOps()) { - if (op->Type() == "send") { - send_op = op; - break; - } - } + OpDesc *send_op = GetSendOpDesc(program); bool is_forwarding = true; for (auto *op : program.Block(0).AllOps()) { @@ -134,7 +126,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } else if (IsDistTrainOp(*op, send_op)) { CreateComputationalOps(&result, *op, 1); } else if (IsScaleLossOp(*op)) { - if (!skip_scale_loss_) { + // user can customize loss@grad if not use_default_grad_scale_ + if (use_default_grad_scale_) { CreateScaleLossGradOp(&result); } is_forwarding = false; @@ -142,10 +135,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( CreateComputationalOps(&result, *op, places_.size()); if (!is_forwarding) { // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. But there are no - // other cases, for example, we need to adjust the gradient according to - // the input when we get the gradient, which is not considered at - // present. + // broadcast, and each gradient is only broadcast once. 
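(Aside, not part of the patch: the rename from `skip_scale_loss_` to `use_default_grad_scale_` above also inverts the boolean's meaning, since the guard changes from `if (!skip_scale_loss_)` to `if (use_default_grad_scale_)`. A hypothetical two-line sketch of the equivalence for callers:)

```cuda
// Hypothetical caller adjustment: the old and new parameters are opposites.
bool skip_scale_loss = false;                     // old: false => insert the default 1/N loss@grad scale op
bool use_default_grad_scale = !skip_scale_loss;   // new: true  => insert the default scale op
```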
for (auto &og : op->OutputArgumentNames()) { if (IsParameterGradientOnce(og, &og_has_been_broadcast)) { InsertNCCLAllReduceOp(&result, og); @@ -175,6 +165,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( return std::unique_ptr(graph); } +OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc( + const ProgramDesc &program) const { + for (auto *op : program.Block(0).AllOps()) { + if (op->Type() == "send") { + return op; + } + } + return nullptr; +} + void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( SSAGraph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA @@ -243,7 +243,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result, auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; result->ops_.emplace_back(new ComputationOpHandle(op, s, p)); - CreateOpHandleIOs(result, op, p, scope_idx); + CreateOpHandleIOs(result, op, scope_idx); } } @@ -255,7 +255,7 @@ void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result, result->ops_.emplace_back(new SendOpHandle(op, s, p)); // Create inputs for output on original place and no ssa output // is created for send op. - CreateOpHandleIOs(result, op, p, 0); + CreateOpHandleIOs(result, op, 0); } bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index dc3da70eda2abaa1a312c25aedf94fa7e427c78a..bad47458ef4cd1cd42e902341e8be66da5c210ed 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -41,14 +41,14 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - bool skip_scale_loss); + bool use_default_grad_scale); #endif std::unique_ptr Build(const ProgramDesc &program) const override; private: void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - const platform::Place &p, const size_t &i) const; + size_t place_id) const; private: std::string loss_var_name_; @@ -59,12 +59,15 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; #endif - bool skip_scale_loss_; + bool use_default_grad_scale_; bool IsScaleLossOp(const OpDesc &op) const; void CreateSendOp(SSAGraph *result, const OpDesc &op) const; + /** + * Is this operator as the end-point operator before/after send operator. + */ bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const; void CreateComputationalOps(SSAGraph *result, const OpDesc &op, @@ -77,6 +80,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { std::unordered_set *og_has_been_broadcast) const; void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; + + /** + * Get send op in the global block of program. + * nullptr if not found. 
+ */ + OpDesc *GetSendOpDesc(const ProgramDesc &program) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 7a65ee62c9bfc0dad2ebee3be21de825fa405d73..1cd3113030086104e7fc5c4ba3364a5ff027632b 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); + VLOG(1) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h index 72684e7f97f1324d0efba960903cf9f2acb618a4..e996a00c162186e47e77d007503ac67caa9f8024 100644 --- a/paddle/fluid/framework/details/ssa_graph.h +++ b/paddle/fluid/framework/details/ssa_graph.h @@ -25,12 +25,22 @@ namespace paddle { namespace framework { namespace details { +// A SSA graph used by parallel executor. struct SSAGraph { + // all variable in each devices. + // The outside vector is the device vector. Each element of this vector is a + // map from variable name to variables. The variables, who have the same name, + // will have a different version. The offset in the + // `std::vector>` is the version of varaibles. std::vector< std::unordered_map>>> vars_; + // aux variables to represent dependency. Useful to resolve data hazard. std::unordered_set> dep_vars_; + + // all operators. NOTE that even we use a vector here, the operators is + // unordered. std::vector> ops_; }; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index be1f0460e45402806b18835f054a7195df1374cc..64e5d93081eb76c56898bbeb530e37364619fdbb 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -48,6 +48,8 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); + // Add an output variable (each_var_name, place, place_offset) to op_handle, + // which belongs to graph static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, const std::string &each_var_name, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 97ab98f09b1a902a942d9667bc7716a28b98d54c..77e5ec4c7dd14b7ebb6d606b8c401ee714259d40 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -255,11 +255,11 @@ TEST(LoDTensor, RecordIO) { std::unique_ptr stream_ptr(stream); recordio::Scanner scanner(std::move(stream_ptr)); auto tensors = ReadFromRecordIO(&scanner, ctx); - ASSERT_EQ(tensors.size(), 2); + ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); tensors = ReadFromRecordIO(&scanner, ctx); - ASSERT_EQ(tensors.size(), 2); + ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index de644e851999920251c762a75c050e8182e950c6..4712efeff68cf98a50b610acf78bb52d7519c520 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set 
&bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, bool allow_op_delay, - bool customize_scale_loss) + bool use_default_grad_scale) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; @@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA details::MultiDevSSAGraphBuilder builder( member_->places_, loss_var_name, params, member_->local_scopes_, - customize_scale_loss, member_->nccl_ctxs_.get()); + use_default_grad_scale, member_->nccl_ctxs_.get()); #else details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, member_->local_scopes_, - customize_scale_loss); + use_default_grad_scale); #endif auto graph = builder.Build(main_program); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 49da123d98181c3d3abcdd64d14c5583142eba58..ecd107d81f8f5bf5d8b899d0c07797114a7ab767 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -40,7 +40,7 @@ class ParallelExecutor { const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope, const std::vector& local_scopes, - bool allow_op_delay, bool customize_scale_loss); + bool allow_op_delay, bool use_default_grad_scale); ~ParallelExecutor(); diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 794e7f743413b068119afd5df232bfc2bb91a8c7..56cf6693caf4529d6e157e6e9a0d5c27d05ee0c3 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -120,11 +120,11 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -std::vector SelectedRows::Get(std::vector keys, - framework::Tensor* value) const { +std::vector> SelectedRows::Get( + std::vector keys, framework::Tensor* value) const { PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); - std::vector non_keys; + std::vector> non_keys_pair; int64_t value_width = value_->numel() / value_->dims()[0]; PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], "output tensor should have the same shape with table " @@ -133,7 +133,7 @@ std::vector SelectedRows::Get(std::vector keys, for (size_t i = 0; i < keys.size(); ++i) { int64_t index = Index(keys[i]); if (index == -1) { - non_keys.push_back(keys[i]); + non_keys_pair.push_back(std::make_pair(keys[i], static_cast(i))); } else { framework::VisitDataType( framework::ToDataType(value_->type()), @@ -141,7 +141,7 @@ std::vector SelectedRows::Get(std::vector keys, index * value_width, value_width)); } } - return non_keys; + return non_keys_pair; } bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index d6c9507b1681855e759a4b1b9d3dddf6fcb2fc13..c27c927ee751c4392840bfb71f4814991b23a8c9 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -78,10 +79,11 @@ class SelectedRows { /* * @brief Get value by the key list, if the * - * @return a list of keys which does not exists in table + * @return a list of pair which contains the non-exists key and the index in + * the value */ - std::vector Get(std::vector keys, - framework::Tensor* tensor) const; + std::vector> Get(std::vector keys, + framework::Tensor* value) const; /* * @brief Set a key-value pair into the table. diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 39fe6d92940606084c28eec1a4d6486cb58844ce..eefcaa5672c5a3debf162f5c8eda653408dcf221 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -59,7 +59,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); } -TEST_F(SelectedRowsTester, Table) { +TEST_F(SelectedRowsTester, SparseTable) { platform::CPUPlace cpu; SelectedRows table; // initialize a sparse table @@ -87,11 +87,11 @@ TEST_F(SelectedRowsTester, Table) { framework::Tensor get_value; get_value.mutable_data(framework::make_ddim({2, 100}), cpu); std::vector keys({non_key, key}); - auto non_keys = table.Get(keys, &get_value); + auto non_key_pairs = table.Get(keys, &get_value); ASSERT_EQ(get_value.data()[100], static_cast(10)); - ASSERT_EQ(non_keys.size(), static_cast(1)); - ASSERT_EQ(non_keys[0], non_key); + ASSERT_EQ(non_key_pairs.size(), static_cast(1)); + ASSERT_EQ(non_key_pairs[0].first, non_key); } } // namespace framework diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 82d8c3df4ece7e56a72b650f8ea58f3953af3b64..d6d4c2f8a2ced08ed8481e92e131f6e2bed9ec05 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -65,7 +65,7 @@ class TensorRTEngine : public EngineBase { // Initialize the inference network, so that TensorRT layers can add to this // network. void InitNetwork() { - infer_builder_.reset(createInferBuilder(logger_)); + infer_builder_.reset(createInferBuilder(&logger_)); infer_network_.reset(infer_builder_->createNetwork()); } // After finishing adding ops, freeze this network and creates the executation diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 796283d325ceb84c733eff5c119b808300bca069..2b402cce60762d774cd7b371e448b2b88794b6a8 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -46,13 +46,13 @@ const int kDataTypeSize[] = { // The following two API are implemented in TensorRT's header file, cannot load // from the dynamic library. So create our own implementation and directly // trigger the method from the dynamic library. 
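(Aside, not part of the patch, referring back to the `SelectedRows::Get` change earlier in this diff: it now returns `(missing_key, output_row_index)` pairs instead of bare missing keys. A hedged sketch of how a caller consumes the new return value, reusing names from selected_rows_test.cc:)

```cuda
// Sketch only: `table`, `key`, `non_key`, and `cpu` are assumed from the test
// above. Get() fills the rows it finds into `value` and reports the rest as
// (key, row index) pairs so the caller knows which output rows to initialize.
std::vector<int64_t> keys({non_key, key});
framework::Tensor value;
value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
auto non_key_pairs = table.Get(keys, &value);
for (const auto &kv : non_key_pairs) {
  VLOG(3) << "key " << kv.first << " missing; output row " << kv.second
          << " must be filled by the caller";
}
```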
-static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) { +static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION)); } -static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) { +static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); } // A logger for create TensorRT infer builder. @@ -80,7 +80,7 @@ class NaiveLogger : public nvinfer1::ILogger { return *x; } - virtual ~NaiveLogger() override {} + ~NaiveLogger() override {} }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index aed5b5e1a22cbed1256d4f28d0a8a4c29c6cc744..a07537985738ab0ad4092b794f3b62ba53dfa866 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include #include #include "NvInfer.h" -#include "cuda.h" -#include "cuda_runtime_api.h" #include "paddle/fluid/platform/dynload/tensorrt.h" namespace dy = paddle::platform::dynload; @@ -43,7 +43,7 @@ class Logger : public nvinfer1::ILogger { class ScopedWeights { public: - ScopedWeights(float value) : value_(value) { + explicit ScopedWeights(float value) : value_(value) { w.type = nvinfer1::DataType::kFLOAT; w.values = &value_; w.count = 1; @@ -58,13 +58,13 @@ class ScopedWeights { // The following two API are implemented in TensorRT's header file, cannot load // from the dynamic library. So create our own implementation and directly // trigger the method from the dynamic library. -nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) { +nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION)); } -nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) { +nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); } const char* kInputTensor = "input"; @@ -74,7 +74,7 @@ const char* kOutputTensor = "output"; nvinfer1::IHostMemory* CreateNetwork() { Logger logger; // Create the engine. 
- nvinfer1::IBuilder* builder = createInferBuilder(logger); + nvinfer1::IBuilder* builder = createInferBuilder(&logger); ScopedWeights weights(2.); ScopedWeights bias(3.); @@ -103,9 +103,9 @@ nvinfer1::IHostMemory* CreateNetwork() { return model; } -void Execute(nvinfer1::IExecutionContext& context, const float* input, +void Execute(nvinfer1::IExecutionContext* context, const float* input, float* output) { - const nvinfer1::ICudaEngine& engine = context.getEngine(); + const nvinfer1::ICudaEngine& engine = context->getEngine(); // Two binds, input and output ASSERT_EQ(engine.getNbBindings(), 2); const int input_index = engine.getBindingIndex(kInputTensor); @@ -119,7 +119,7 @@ void Execute(nvinfer1::IExecutionContext& context, const float* input, // Copy the input to the GPU, execute the network, and copy the output back. ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float), cudaMemcpyHostToDevice, stream)); - context.enqueue(1, buffers, stream, nullptr); + context->enqueue(1, buffers, stream, nullptr); ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); @@ -136,7 +136,7 @@ TEST(TensorrtTest, BasicFunction) { // Use the model to create an engine and an execution context. Logger logger; - nvinfer1::IRuntime* runtime = createInferRuntime(logger); + nvinfer1::IRuntime* runtime = createInferRuntime(&logger); nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(model->data(), model->size(), nullptr); model->destroy(); @@ -145,7 +145,7 @@ TEST(TensorrtTest, BasicFunction) { // Execute the network. float input = 1234; float output; - Execute(*context, &input, &output); + Execute(context, &input, &output); EXPECT_EQ(output, input * 2 + 3); // Destroy the engine. diff --git a/paddle/fluid/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu index 630a4a2df2ca8f6afe81be3c455d255a0693fcc3..23b48c6fdf427348879de07c671c65327d6436d7 100644 --- a/paddle/fluid/operators/accuracy_op.cu +++ b/paddle/fluid/operators/accuracy_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/accuracy_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu index e798101ca6a3a44de749a2d2219295bd8911dfac..b25268786d622bc7a94117849763833e528bef48 100644 --- a/paddle/fluid/operators/adagrad_op.cu +++ b/paddle/fluid/operators/adagrad_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index c9939e8602ed341d37784ca292a55326899e8e65..f8b2505ccfb143f9f74cf0b16d92e8e1ca059709 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -87,9 +87,13 @@ class BatchNormOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::ToDataType(ctx.Input("X")->type()); - // For float or float16 input tensor, the type of the scale, bias, mean, - // and var tensors should both be float. + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). auto bn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + bn_param_type = framework::proto::VarType::FP64; + } PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(ctx.Input("Scale")->type()), "Scale input should be of float type"); @@ -492,8 +496,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); REGISTER_OP_CPU_KERNEL( - batch_norm, - ops::BatchNormKernel); + batch_norm, ops::BatchNormKernel, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index cb1927bc0f2eb735f0a3184df5f0f8fada2f9dca..550dd32d36767f90e880415bfffaf01aeb623609 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -287,6 +287,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( batch_norm, ops::BatchNormKernel, + ops::BatchNormKernel, ops::BatchNormKernel); REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel); + batch_norm_grad, ops::BatchNormGradKernel, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index fdab4e92f47c7c8f241d93268a73dcb8c2eb2dc6..cff097cca13f3b92c7efe4b69259fdf7c75b3760 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -195,10 +195,9 @@ std::string ItemToString(const BeamSearch::Item &item) { return stream.str(); } -class BeamSearchProtoAndCheckerMaker - : public framework::OpProtoAndCheckerMaker { +class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: - BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) + BeamSearchOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { // inputs and outputs stored in proto AddInput("pre_ids", "ids in previous step"); @@ -222,20 +221,32 @@ class BeamSearchProtoAndCheckerMaker } }; -class BeamSearchInferShape : public framework::InferShapeBase { +class BeamSearchOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *context) const override { + using 
framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { for (const std::string &arg : std::vector({"pre_ids", "ids", "scores"})) { - PADDLE_ENFORCE(context->HasInput(arg), - "BeamSearch need input argument '%s'", arg); + PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'", + arg); } for (const std::string &arg : std::vector({"selected_ids", "selected_scores"})) { - PADDLE_ENFORCE(context->HasOutput(arg), + PADDLE_ENFORCE(ctx->HasOutput(arg), "BeamSearch need output argument '%s'", arg); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::ToDataType( + ctx.Input("pre_ids")->type()), + platform::CPUPlace()); + return kt; + } }; class BeamSearchInferVarType : public framework::VarTypeInference { @@ -254,8 +265,13 @@ class BeamSearchInferVarType : public framework::VarTypeInference { } // namespace operators } // namespace paddle -REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, - paddle::operators::BeamSearchProtoAndCheckerMaker, - paddle::operators::BeamSearchInferShape, - paddle::operators::BeamSearchInferVarType, - paddle::framework::EmptyGradOpMaker); +namespace ops = paddle::operators; + +REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker, + ops::BeamSearchInferVarType); +REGISTER_OP_CPU_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 0a481a85ce6fbb582b8c0e12710455aaaac72aa1..9b51db8a45186c2a90cf8b2eb7966d0aaea04028 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -192,49 +192,29 @@ std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); std::string ItemToString(const BeamSearch::Item& item); -class BeamSearchOp : public framework::OperatorBase { +template +class BeamSearchOpKernel : public framework::OpKernel { public: - BeamSearchOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - BeamSearchOp(const BeamSearchOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not Implemented"); - } - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto ids_var = scope.FindVar(Input("ids")); - auto scores_var = scope.FindVar(Input("scores")); - auto pre_ids_var = scope.FindVar(Input("pre_ids")); + void Compute(const framework::ExecutionContext& context) const override { + auto* ids_var = context.Input("ids"); + auto* scores_var = context.Input("scores"); + auto* pre_ids_var = context.Input("pre_ids"); PADDLE_ENFORCE_NOT_NULL(ids_var); PADDLE_ENFORCE_NOT_NULL(scores_var); PADDLE_ENFORCE_NOT_NULL(pre_ids_var); - auto& ids = ids_var->Get(); - auto& scores = scores_var->Get(); - auto& pre_ids = pre_ids_var->Get(); - size_t level = Attr("level"); - size_t beam_size = Attr("beam_size"); - int end_id = Attr("end_id"); - BeamSearch alg(ids, scores, level, beam_size, end_id); - - auto selected_ids_var = scope.FindVar(Output("selected_ids")); - auto selected_scores_var = scope.FindVar(Output("selected_scores")); + size_t level = context.Attr("level"); + 
size_t beam_size = context.Attr("beam_size"); + int end_id = context.Attr("end_id"); + BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); + auto selected_ids_var = + context.Output("selected_ids"); + auto selected_scores_var = + context.Output("selected_scores"); PADDLE_ENFORCE_NOT_NULL(selected_ids_var); PADDLE_ENFORCE_NOT_NULL(selected_scores_var); - auto& selected_ids_tensor = - *selected_ids_var->GetMutable(); - auto& selected_scores_tensor = - *selected_scores_var->GetMutable(); - alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); + alg(*pre_ids_var, selected_ids_var, selected_scores_var); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu index 82eb9e83bd84e6ec6881facbb2fac0aebce93d55..510190f1aaf02960284216a1bedd409011088499 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cu +++ b/paddle/fluid/operators/bilinear_interp_op.cu @@ -10,7 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/bilinear_interp_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu index 0944e9c95d4a66cc4a51751a8c70cd7a3fefaf1a..708c7a5fa96c2f9ce6a2d913ca5f30126bb6192f 100644 --- a/paddle/fluid/operators/box_coder_op.cu +++ b/paddle/fluid/operators/box_coder_op.cu @@ -10,7 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/box_coder_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 92c8ab6d9ff11ec6acd46a39877eb67d624748a9..1b1b8bf5ed959dd9c2ce8c9f5c905a75b81865fd 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); paddle::operators::math::ConcatGradFunctor concat_grad_functor; - concat_grad_functor(dev_ctx, *in, static_cast(axis), outputs); + concat_grad_functor(dev_ctx, *in, static_cast(axis), &outputs); } } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index c70e3cc3c9198008d9eca5f462000aa67ff7e5ba..cf410c3ca16955620610634b99ee9111106ef99f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,6 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" +DEFINE_bool(cudnn_algo_use_autotune, true, + "Whether allow using an autotuning algorithm for convolution " + "operator. The autotuning algorithm may be non-deterministic. If " + "false, the algorithm is deterministic."); + namespace paddle { namespace operators { @@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - handle, cudnn_filter_desc, - // dyDesc: Handle to the previously initialized input differential - // tensor descriptor. 
- cudnn_output_grad_desc, cudnn_conv_desc, - // dxDesc: Handle to the previously initialized output tensor - // descriptor. - cudnn_input_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &data_algo)); + if (FLAGS_cudnn_algo_use_autotune) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input + // differential + // tensor descriptor. + cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + } else { + data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } + PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, @@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, - cudnn_filter_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &filter_algo)); + if (FLAGS_cudnn_algo_use_autotune) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + } else { + filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + } PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 344bbade7055aa8e0aede61dd31dab246bddd169..314d33310588ed960eecaf1a0319ebf56d925c55 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 95f4738b4ff50852d9591719133ca650533bf848..7ca694886e9209a49e214352f5babc473a1f275a 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -82,7 +82,9 @@ class RequestSend final : public RequestBase { virtual std::string GetReqName() { return request_->Varname(); } virtual void Process() { - queue_->Push(std::make_pair(request_->Varname(), request_)); + std::string var_name = GetReqName(); + VLOG(3) << "RequestSend " << var_name; + queue_->Push(std::make_pair(var_name, request_)); sendrecv::VoidMessage reply; responder_.Finish(reply, ::grpc::Status::OK, this); @@ -106,7 +108,7 @@ class RequestGet final : public RequestBase { responder_(&ctx_), scope_(scope), queue_(queue) { - int method_id = static_cast(detail::GrpcMethod::kGetVariable); + auto method_id = static_cast(detail::GrpcMethod::kGetVariable); service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, cq_, this); } @@ -118,6 +120,7 @@ class RequestGet final : public RequestBase { virtual void Process() { // proc request. 
std::string var_name = request_.varname(); + VLOG(3) << "RequestGet " << var_name; auto* var = scope_->FindVar(var_name); ::grpc::ByteBuffer reply; @@ -176,7 +179,7 @@ class RequestPrefetch final : public RequestBase { ::grpc::ByteBuffer reply; std::string var_name = request_->OutVarname(); - VLOG(3) << "prefetch var " << var_name; + VLOG(3) << "RequestPrefetch " << var_name; auto var_desc = program_->Block(0).FindVar(var_name); framework::Scope* local_scope = &scope_->NewScope(); auto* var = local_scope->FindVar(var_name); @@ -307,18 +310,20 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, bool ok = false; while (true) { - VLOG(3) << "HandleRequest for " << cq_name << " while in"; + VLOG(3) << "HandleRequest for " << cq_name << " wait Next"; if (!cq->Next(&tag, &ok)) { LOG(INFO) << cq_name << " CompletionQueue shutdown!"; break; } - VLOG(3) << "HandleRequest for " << cq_name << " while after Next"; + VLOG(3) << "HandleRequest for " << cq_name << " get Next"; PADDLE_ENFORCE(tag); + if (sync_mode_) { // FIXME(typhoonzero): de-couple the barriers with recv_op if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1); if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0); + VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond"; } RequestBase* base = reinterpret_cast(tag); @@ -336,9 +341,9 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, switch (base->Status()) { case PROCESS: { - VLOG(4) << cq_name << " PROCESS status:" << base->Status(); TryToRegisterNewOne(); base->Process(); + VLOG(4) << cq_name << " PROCESS status:" << base->Status(); break; } case FINISH: { diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc index 221d2f4c5b30aef022a5d6b54cd657d1dec1f5a2..e9eaaf1cbcd07ed1c8d6fb0b025bc1f1500718fd 100644 --- a/paddle/fluid/operators/detail/serde_test.cc +++ b/paddle/fluid/operators/detail/serde_test.cc @@ -108,7 +108,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); } for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], i); + EXPECT_EQ(rows_data2[i], static_cast(i)); } EXPECT_EQ(slr2->height(), 1000); } diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 913a9145420dae7c4f6a4df10c0330636b5796b0..c25b7d2f9ec32bcef44db239de43feefd855bfe5 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index f0362ec606c994d69f31c7a2e1e9ad0d0108b621..953aedc85064ee803ab02afd427a5a6f22096f94 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include #include +#include "paddle/fluid/platform/cuda_primitives.h" constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif @@ -333,24 +334,12 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, } } } -#ifdef __NVCC__ -// __shfl_down has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif +#ifdef __NVCC__ template __device__ T reduceSum(T val, int tid, int len) { - // TODO(zcd): The warp size should be taken from the + // NOTE(zcd): The warp size should be taken from the // parameters of the GPU but not specified as 32 simply. // To make the reduceSum more efficiently, // I use Warp-Level Parallelism and assume the Warp size @@ -362,7 +351,7 @@ __device__ T reduceSum(T val, int tid, int len) { CREATE_SHFL_MASK(mask, tid < len); for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); + val += platform::__shfl_down_sync(mask, val, offset); if (tid < warpSize) shm[tid] = 0; @@ -378,7 +367,7 @@ __device__ T reduceSum(T val, int tid, int len) { if (tid < warpSize) { val = shm[tid]; for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); + val += platform::__shfl_down_sync(mask, val, offset); } return val; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 57cff680ab89f2df7e71af4056ee06cdf330bbab..f22f8b261030c0c536e2118351ec2aa1a9be6cd0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -45,20 +45,6 @@ static void split(const std::string &str, char sep, } } -static void AsyncExecuteBlock(framework::Executor *executor, - framework::ExecutorPrepareContext *prepared, - framework::Scope *scope) { - std::future future = framework::Async([&executor, &prepared, &scope]() { - try { - executor->RunPreparedContext(prepared, scope, false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - }); - // TODO(qiao) maybe we can remove this - future.wait(); -} - static void ParallelExecuteBlocks( const std::vector ¶llel_blkids, framework::Executor *executor, const std::vector> @@ -201,14 +187,40 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, } // while(true) } +static void AsyncUpdateThread( + const std::string &var_name, const bool &exit_flag, + const std::shared_ptr &queue, + framework::Executor *executor, + framework::ExecutorPrepareContext *prepared) { + VLOG(3) << "update thread for " << var_name << " started"; + while (!exit_flag) { + const detail::ReceivedMessage v = queue->Pop(); + auto recv_var_name = v.first; + auto var = v.second->GetVar(); + if (var == nullptr) { + LOG(ERROR) << "Can not find server side var: " << recv_var_name; + PADDLE_THROW("Can not find server side var"); + } + auto fs = framework::Async([var_name, &executor, &v, prepared] { + try { + executor->RunPreparedContext(prepared, v.second->GetMutableLocalScope(), + false, false); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + }); + fs.wait(); + } +} + void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, - framework::ProgramDesc 
*program, - framework::Scope *recv_scope, - framework::BlockDesc *prefetch_block) const { + framework::ProgramDesc *program) const { VLOG(3) << "RunAsyncLoop in"; // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; + std::unordered_map> + grad_to_queue; auto grad_to_block_id_str = Attr>("grad_to_block_id"); @@ -220,6 +232,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0); int block_id = std::stoi(pieces[1]); grad_to_block_id[pieces[0]] = block_id; + grad_to_queue[pieces[0]] = std::make_shared(); id_to_grad[block_id] = pieces[0]; } size_t num_blocks = program->Size(); @@ -238,8 +251,21 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i]; } - VLOG(3) << "RunAsyncLoop into while"; bool exit_flag = false; + + VLOG(3) << "start async optimize threads"; + std::vector> fs; + for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) { + std::string grad_name = iter->first; + VLOG(3) << "create async update thread for " << grad_name; + fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor, + &grad_to_queue, &grad_to_prepared_ctx]() { + AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name], + executor, grad_to_prepared_ctx[grad_name].get()); + })); + } + + VLOG(3) << "RunAsyncLoop into while"; while (!exit_flag) { const detail::ReceivedMessage v = rpc_service_->Get(); auto recv_var_name = v.first; @@ -249,13 +275,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, break; } else { VLOG(3) << "received grad: " << recv_var_name; - auto var = v.second->GetVar(); - if (var == nullptr) { - LOG(ERROR) << "Can not find server side var: " << recv_var_name; - PADDLE_THROW("Can not find server side var"); - } - AsyncExecuteBlock(executor, grad_to_prepared_ctx[recv_var_name].get(), - v.second->GetMutableLocalScope()); + grad_to_queue[recv_var_name]->Push(v); } if (exit_flag) { @@ -304,7 +324,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); } else { - RunAsyncLoop(&executor, program, &recv_scope, prefetch_block); + RunAsyncLoop(&executor, program); } } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 3cc0f3047733bea94daa310cd39cb0a4f44bef85..5c8fc31c9774a0f2e8233824459b29b42469bd1a 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -47,9 +47,7 @@ class ListenAndServOp : public framework::OperatorBase { framework::BlockDesc* prefetch_block) const; void RunAsyncLoop(framework::Executor* executor, - framework::ProgramDesc* program, - framework::Scope* recv_scope, - framework::BlockDesc* prefetch_block) const; + framework::ProgramDesc* program) const; void Stop() override; diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1839e456d66ab95fb4ccac933cf7b635c54f5a0 --- /dev/null +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +constexpr int64_t kNoPadding = -1; + +class LookupSparseTableInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupSparseTableOp should not be null."); + auto shape_w = ctx->GetInputDim("W"); + auto shape_ids = ctx->GetInputDim("Ids"); + shape_w[0] = shape_ids.size(); + ctx->SetOutputDim("Out", shape_w); + } +}; + +class LookupSparseTableOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto out_var = scope.FindVar(Output("Out")); + auto w_var = scope.FindVar(Input("W")); + auto ids_var = scope.FindVar(Input("Ids")); + unsigned int seed = static_cast(Attr("seed")); + float min = Attr("min"); + float max = Attr("max"); + bool auto_grown_table = Attr("auto_grown_table"); + + PADDLE_ENFORCE(out_var->IsType(), + "The type of Out var should be LodTensor."); + PADDLE_ENFORCE(w_var->IsType(), + "The type of W var should be SelectedRows."); + PADDLE_ENFORCE(ids_var->IsType(), + "The type of Ids var should be LoDTensor."); + auto &ids_t = ids_var->Get(); + auto out_t = out_var->GetMutable(); + auto w_t = w_var->GetMutable(); + std::vector keys; + keys.resize(ids_t.numel()); + for (size_t i = 0; i < ids_t.numel(); ++i) { + keys[i] = ids_t.data()[i]; + } + + // TODO(Yancey1989): support CUDA Place for the sparse table + platform::CPUPlace cpu; + auto out_shape = w_t->value().dims(); + out_shape[0] = keys.size(); + out_t->Resize(out_shape); + out_t->mutable_data(cpu, w_t->value().type()); + PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), + framework::proto::VarType::FP32, + "The sparse table only support FP32"); + auto non_keys_pair = w_t->Get(keys, out_t); + if (!auto_grown_table) { + PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast(0), + "there is some keys does exists in the sparse table."); + } + auto value_shape = w_t->value().dims(); + value_shape[0] = 1; + for (const auto &it : non_keys_pair) { + const auto key = it.first; + const auto index = it.second; + framework::Tensor value; + value.Resize(value_shape); + auto data = value.mutable_data(cpu); + + std::minstd_rand engine; + engine.seed(seed); + std::uniform_real_distribution dist(min, max); + int64_t size = value.numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + w_t->Set(key, value); + memory::Copy(cpu, out_t->mutable_data(cpu) + index * value.numel(), + cpu, value.data(), value.numel() * sizeof(float)); + } + } +}; + +class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LookupSparseTableOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("W", + 
"(SelectedRows) The input represents embedding table, " + "which is a learnable parameter."); + AddInput("Ids", + "(LoDTensor) Ids's type should be LoDTensor" + "THe ids to be looked up in W."); + AddOutput("Out", + "(LoDTensor) The lookup results, which have the " + "same type as W."); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + AddAttr("min", + "(float, default -1.0) " + "Minimum value of uniform random") + .SetDefault(-1.0f); + AddAttr("max", + "(float, default 1.0) " + "Maximun value of uniform random") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed used for generating samples. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time.") + .SetDefault(0); + AddAttr("auto_grown_table", + "(bool default false)" + "Whether create new value if for nonexistent key.") + .SetDefault(true); + AddComment(R"DOC( +Lookup Sprase Tablel Operator. + +This operator is used to perform lookup on parameter W, +then concatenated into a sparse tensor. + +The type of Ids(Input) is SelectedRows, the rows of Ids contains +the ids to be looked up in W; +if the Id is not in the sparse table, this operator will return a +random value and set the value into the table for the next looking up. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_sparse_table, ops::LookupSparseTableOp, + ops::LookupSparseTableInferShape, + ops::LookupSparseTableOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 6d81fccd2059c511f71d403229e04587e553e93d..77722c50d39003d9342afb04a61ae3aaf6b21100 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/platform/assert.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index bfce56f9fdcafa0800c9742b9fae41fd6a572b40..cc69212466b72f3fa82e8f5f58b4f3229dab28ec 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/concat.h" +#include namespace paddle { namespace operators { @@ -70,20 +71,20 @@ class ConcatGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const int axis, - std::vector& outputs) { + std::vector* outputs) { // TODO(zcd): Add input data validity checking - int num = outputs.size(); + int num = outputs->size(); int input_rows = 1; - auto dim_0 = outputs[0].dims(); + auto dim_0 = outputs->at(0).dims(); for (int i = 0; i < axis; ++i) { input_rows *= dim_0[i]; } int input_cols = 0; - std::vector output_cols(outputs.size()); + std::vector output_cols(outputs->size()); for (int i = 0; i < num; ++i) { - int t_cols = outputs[i].numel() / input_rows; + int t_cols = outputs->at(i).numel() / input_rows; input_cols += t_cols; output_cols[i] = t_cols; } @@ -95,7 +96,7 @@ class ConcatGradFunctor { int col_idx = 0; for (int j = 0; j < num; ++j) { int col_len = output_cols[j]; - T* dst_ptr = outputs[j].data() + k * col_len; + T* dst_ptr = outputs->at(j).data() + k * col_len; memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, sizeof(T) * col_len); col_idx += col_len; diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index c0786757b34195d47c3b1cadc938f0e9fcfd6038..4285d38dcd6a4124543cdd2246c82a8203f5a281 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/math/concat.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { @@ -202,16 +204,16 @@ class ConcatGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const int axis, - std::vector& outputs) { + std::vector* outputs) { // TODO(zcd): Add input data validity checking - int o_num = outputs.size(); + int o_num = outputs->size(); int out_row = 1; - auto dim_0 = outputs[0].dims(); + auto dim_0 = outputs->at(0).dims(); for (int i = 0; i < axis; ++i) { out_row *= dim_0[i]; } - int out_col = outputs[0].numel() / out_row; + int out_col = outputs->at(0).numel() / out_row; int in_col = 0, in_row = out_row; bool sameShape = true; @@ -221,13 +223,13 @@ class ConcatGradFunctor { outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { - int t_col = outputs[i].numel() / out_row; + int t_col = outputs->at(i).numel() / out_row; if (sameShape) { if (t_col != out_col) sameShape = false; } in_col += t_col; outputs_cols[i + 1] = in_col; - outputs_ptr[i] = outputs[i].data(); + outputs_ptr[i] = outputs->at(i).data(); } T** dev_out_gpu_data = diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index c0e983e4aa7abbdd87649f5a3147d2a464993bce..041ce8bf8a2e9528a004c076ead4471a3837c1a6 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" @@ -56,7 +57,7 @@ template class ConcatGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - const int axis, std::vector& outputs); + const int axis, std::vector* outputs); }; } // namespace math diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 55c1e726335dfe010e39847ac90b84cc49955360..4e6ff5ee0a449b42762748ba1a103876beee01f2 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index da73f575f375d8a792a82bf6cf4226bab673170d..6d2ba2bd0d653ecf83f9e2abc1413ae551dc8bb7 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { @@ -31,11 +32,11 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, template __device__ __forceinline__ T sum_single_warp(T val) { - val += __shfl_down(val, 16); - val += __shfl_down(val, 8); - val += __shfl_down(val, 4); - val += __shfl_down(val, 2); - val += __shfl_down(val, 1); + val += platform::__shfl_down_sync(0, val, 16); + val += platform::__shfl_down_sync(0, val, 8); + val += platform::__shfl_down_sync(0, val, 4); + val += platform::__shfl_down_sync(0, val, 2); + val += platform::__shfl_down_sync(0, val, 1); return val; } diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index d360728484a73ce844b4a36fbffd7dc631f8e786..027e2de48d229761f12f974dc73625c8ea1b3567 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -14,7 +14,7 @@ limitations under the License. 
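Both reduceSum and sum_single_warp now call platform::__shfl_down_sync from cuda_primitives.h instead of defining the shim locally; the raw __shfl_down intrinsic was deprecated in CUDA 9.0. A minimal sketch of the underlying warp-shuffle reduction written directly against the CUDA intrinsics, using __activemask() as a simplification of the CREATE_SHFL_MASK/__ballot_sync mask construction in the real code:

```cpp
#include <cuda.h>

// Warp-level sum: each step adds the value held by the lane `offset` positions
// below, halving the stride until lane 0 holds the total for the warp.
__device__ float WarpReduceSum(float val) {
#if CUDA_VERSION >= 9000
  unsigned mask = __activemask();  // lanes currently participating
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(mask, val, offset);
#else
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down(val, offset);  // deprecated since CUDA 9.0
#endif
  return val;
}
```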
*/ #include #include "paddle/fluid/operators/math/depthwise_conv.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h index 1e5ff8ef46db960ddf88ebf03041893b176c1950..26e6adafdfcd28dcc51109b9dec812f1c82c8a0e 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -89,14 +89,14 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, __m256 r_value_reset_gate; __m256 r_value_reset_output; __m256 r_prev_out = _mm256_set1_ps(0.0f); - __m256 *update_gate = (__m256 *)gate_value; - __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value); + __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size); for (int i = 0; i < frame_size / 8; i++) { r_value_update_gate = update_gate[i]; r_value_reset_gate = reset_gate[i]; if (prev_output_value) { - r_prev_out = ((__m256 *)prev_output_value)[i]; + r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; } op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, @@ -104,7 +104,7 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, update_gate[i] = r_value_update_gate; reset_gate[i] = r_value_reset_gate; - ((__m256 *)reset_output_value)[i] = r_value_reset_output; + (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output; } #endif } @@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, __m256 r_value_frame_state; __m256 r_prev_out = _mm256_set1_ps(0.0f); __m256 r_output; - __m256 *update_gate = (__m256 *)gate_value; - __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value); + __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2); for (int i = 0; i < frame_size / 8; i++) { r_value_update_gate = update_gate[i]; r_value_frame_state = frame_state[i]; if (prev_output_value) { - r_prev_out = ((__m256 *)prev_output_value)[i]; + r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; } op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, r_output, active_node); frame_state[i] = r_value_frame_state; - ((__m256 *)output_value)[i] = r_output; + (reinterpret_cast<__m256 *>(output_value))[i] = r_output; } #endif } @@ -284,20 +284,22 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, __m256 r_out_grad; __m256 r_prev_out_value = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); - __m256 *update_gate_value = (__m256 *)gate_value; - __m256 *update_gate_grad = (__m256 *)gate_grad; - __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); - __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); + __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); + __m256 *frame_state_value = + reinterpret_cast<__m256 *>(gate_value + frame_size * 2); + __m256 *frame_state_grad = + reinterpret_cast<__m256 *>(gate_grad + frame_size * 2); for (int i = 0; i < frame_size / 8; i++) { r_update_gate_value = update_gate_value[i]; r_frame_state_value = frame_state_value[i]; - r_out_grad = ((__m256 *)output_grad)[i]; + r_out_grad = (reinterpret_cast<__m256 
*>(output_grad))[i]; if (prev_out_value) { - r_prev_out_value = ((__m256 *)prev_out_value)[i]; + r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; } if (prev_out_grad) { - r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, @@ -307,7 +309,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; if (prev_out_grad) { - ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; } } #endif @@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); - __m256 *update_gate_value = (__m256 *)gate_value; - __m256 *update_gate_grad = (__m256 *)gate_grad; - __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); - __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); + __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); + __m256 *reset_gate_value = + reinterpret_cast<__m256 *>(gate_value + frame_size); + __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size); for (int i = 0; i < frame_size / 8; i++) { r_update_gate_value = update_gate_value[i]; @@ -338,13 +341,13 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_reset_gate_value = reset_gate_value[i]; if (prev_out_value && prev_out_grad) { - r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; + r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i]; } if (prev_out_value) { - r_prev_out_value = ((__m256 *)prev_out_value)[i]; + r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; } if (prev_out_grad) { - r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, @@ -354,7 +357,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; if (prev_out_grad) { - ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; } } #endif diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h index 657652562780ae9932a4394335bfa3c3b397bb80..da25a7d2137cfe5160e28c4e590dd5c43cd99ccf 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 6ad77830fd7a9809c4922878cc8ccdff1e8e0ef7..19f6b213aa3bc06f7f5750fa42745fd8755c51b9 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, __m256 r_state_atv; __m256 r_out; - __m256 *value_in = (__m256 *)value.gate_value; - __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); - __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); - __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value); + __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size); + __m256 *value_fg = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2); + __m256 *value_og = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3); for (int i = 0; i < frame_size / 8; i++) { r_value_in = value_in[i]; @@ -175,13 +177,13 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, r_value_fg = value_fg[i]; r_value_og = value_og[i]; if (value.check_ig) { - r_checkI = ((__m256 *)value.check_ig)[i]; - r_checkF = ((__m256 *)value.check_fg)[i]; - r_checkO = ((__m256 *)value.check_og)[i]; + r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i]; + r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i]; + r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i]; } if (value.prev_state_value) { - r_prev_state = ((__m256 *)value.prev_state_value)[i]; + r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; } op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, @@ -192,9 +194,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, value_ig[i] = r_value_ig; value_fg[i] = r_value_fg; value_og[i] = r_value_og; - ((__m256 *)value.state_value)[i] = r_state; - ((__m256 *)value.state_active_value)[i] = r_state_atv; - ((__m256 *)value.output_value)[i] = r_out; + (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state; + (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv; + (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out; } #endif } @@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, __m256 r_checkFGrad; __m256 r_checkOGrad; - __m256 *value_in = (__m256 *)value.gate_value; - __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); - __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); - __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); - __m256 *grad_in = (__m256 *)grad.gate_grad; - __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); - __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); - __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); + __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value); + __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size); + __m256 *value_fg = + reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2); + __m256 *value_og = + reinterpret_cast<__m256 *>(value.gate_value + 
frame_size * 3); + __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad); + __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size); + __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2); + __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3); for (int i = 0; i < frame_size / 8; i++) { r_value_in = value_in[i]; @@ -242,16 +246,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, r_value_fg = value_fg[i]; r_value_og = value_og[i]; if (value.check_ig) { - r_checkI = ((__m256 *)value.check_ig)[i]; - r_checkF = ((__m256 *)value.check_fg)[i]; - r_checkO = ((__m256 *)value.check_og)[i]; + r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i]; + r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i]; + r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i]; } - r_state = ((__m256 *)value.state_value)[i]; - r_state_atv = ((__m256 *)value.state_active_value)[i]; - r_output_grad = ((__m256 *)grad.output_grad)[i]; - r_state_grad = ((__m256 *)grad.state_grad)[i]; + r_state = (reinterpret_cast<__m256 *>(value.state_value))[i]; + r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i]; + r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i]; + r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i]; if (value.prev_state_value) { - r_prev_state = ((__m256 *)value.prev_state_value)[i]; + r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; } op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, @@ -264,15 +268,18 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, grad_ig[i] = r_grad_ig; grad_fg[i] = r_grad_fg; grad_og[i] = r_grad_og; - ((__m256 *)grad.state_grad)[i] = r_state_grad; + (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad; if (grad.prev_state_grad) - ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; + (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad; if (value.prev_state_value) { - if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; - if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; + if (grad.check_ig_grad) + (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad; + if (grad.check_fg_grad) + (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad; } - if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; + if (grad.check_og_grad) + (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad; } #endif } diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index 0b1034a080f15270e24622b8aaeda7f546aa90e6..d29c780dcfb1f1a3cbab25256238769d3a5ccd93 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -17,7 +17,7 @@ limitations under the License. 
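The AVX GRU/LSTM kernels swap their C-style casts of float buffers for reinterpret_cast<__m256 *>, which makes the reinterpretation explicit for the style checker without changing behaviour. A small standalone sketch of the same access pattern, assuming the buffer is 32-byte aligned, its length is a multiple of 8, and the file is compiled with AVX enabled:

```cpp
#include <immintrin.h>

// Process a float buffer 8 lanes at a time by viewing it as __m256 values,
// mirroring how the AVX GRU/LSTM kernels walk gate_value.
void AddOneInPlace(float* data, int size) {
  __m256* vec = reinterpret_cast<__m256*>(data);  // was: (__m256 *)data
  const __m256 ones = _mm256_set1_ps(1.0f);
  for (int i = 0; i < size / 8; ++i) {
    vec[i] = _mm256_add_ps(vec[i], ones);
  }
}

int main() {
  alignas(32) float buf[16] = {0};
  AddOneInPlace(buf, 16);
  return buf[0] == 1.0f ? 0 : 1;
}
```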
*/ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 1268e21e0608000c1a8c22104912b32a973a9737..eecb233d22cea06da016b2671fd606b70eddf5a5 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 7986326e96b2bb05c0936d366bda581d49b87032..b84bb9974930a1619edeb85de4281b19da22fbbd 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -23,32 +23,29 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, } TEST(math_function, notrans_mul_trans_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; - Tensor input1; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor out_gpu; - Tensor out; - - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input1, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({2, 2}, gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - TensorCopySync(out_gpu, cpu_place, &out); + paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); float* out_ptr = out.data(); context.Wait(); @@ -59,39 +56,38 @@ TEST(math_function, notrans_mul_trans_fp32) { } TEST(math_function, notrans_mul_trans_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; - - Tensor input1; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor out_gpu; - Tensor out; + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { return; } - float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + paddle::platform::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); 
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input1, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); - out_gpu.mutable_data({2, 2}, gpu_place); + out_gpu.mutable_data({2, 2}, gpu_place); - paddle::operators::math::matmul( - context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, - float16(0)); + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, + paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0)); - TensorCopySync(out_gpu, cpu_place, &out); + paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); - float16* out_ptr = out.data(); + paddle::platform::float16* out_ptr = out.data(); context.Wait(); EXPECT_EQ(static_cast(out_ptr[0]), 5); EXPECT_EQ(static_cast(out_ptr[1]), 14); @@ -100,32 +96,29 @@ TEST(math_function, notrans_mul_trans_fp16) { } TEST(math_function, trans_mul_notrans_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; - Tensor input1; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor out_gpu; - Tensor out; - - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input1, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({3, 3}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - TensorCopySync(out_gpu, cpu_place, &out); + paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); float* out_ptr = out.data(); context.Wait(); @@ -141,39 +134,38 @@ TEST(math_function, trans_mul_notrans_fp32) { } TEST(math_function, trans_mul_notrans_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; - - Tensor input1; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor out_gpu; - Tensor out; + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { return; } - float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + paddle::platform::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input1, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input1, 
gpu_place, &input2_gpu); - out_gpu.mutable_data({3, 3}, gpu_place); + out_gpu.mutable_data({3, 3}, gpu_place); - paddle::operators::math::matmul( - context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, - float16(0)); + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, + paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0)); - TensorCopySync(out_gpu, cpu_place, &out); + paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); - float16* out_ptr = out.data(); + paddle::platform::float16* out_ptr = out.data(); context.Wait(); EXPECT_EQ(static_cast(out_ptr[0]), 9); EXPECT_EQ(static_cast(out_ptr[1]), 12); @@ -187,19 +179,16 @@ TEST(math_function, trans_mul_notrans_fp16) { } TEST(math_function, gemm_notrans_cublas_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; - Tensor input1; - Tensor input2; - Tensor input3; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor input3_gpu; - - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); int m = 2; int n = 3; @@ -214,9 +203,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input2, gpu_place, &input2_gpu); - TensorCopySync(input3, gpu_place, &input3_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); @@ -224,7 +213,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - TensorCopySync(input3_gpu, cpu_place, &input3); + paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -244,19 +233,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) { } TEST(math_function, gemm_notrans_cublas_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; - - Tensor input1; - Tensor input2; - Tensor input3; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor input3_gpu; + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -266,26 +252,31 @@ TEST(math_function, gemm_notrans_cublas_fp16) { int m = 2; int n = 3; int k = 3; - float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + paddle::platform::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); 
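Each fp16 test still returns early on older devices because, as the comment in these tests says, fp16 GEMM in cuBLAS requires GPU compute capability >= 53. The tests use Paddle's CUDADeviceContext::GetComputeCapability(); a rough equivalent with only the CUDA runtime API, for reference (SupportsFp16Gemm is an illustrative helper, not a Paddle function):

```cpp
#include <cuda_runtime.h>

#include <cstdio>

// Skip fp16 GEMM paths on devices below sm_53, mirroring the guard in the
// tests above (GetComputeCapability() < 53).
bool SupportsFp16Gemm(int device_id) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) return false;
  int compute_capability = prop.major * 10 + prop.minor;
  return compute_capability >= 53;
}

int main() {
  if (!SupportsFp16Gemm(0)) {
    std::printf("fp16 GEMM not supported on this device, skipping\n");
    return 0;
  }
  std::printf("fp16 GEMM supported\n");
  return 0;
}
```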
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - float16* input2_ptr = input2.mutable_data({3, 4}, cpu_place); + paddle::platform::float16* input2_ptr = + input2.mutable_data({3, 4}, cpu_place); fill_fp16_data(input2_ptr, input2.numel(), {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); - float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + paddle::platform::float16* input3_ptr = + input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input2, gpu_place, &input2_gpu); - TensorCopySync(input3, gpu_place, &input3_gpu); - float16* a = input1_gpu.data(); - float16* b = input2_gpu.data(); - float16* c = input3_gpu.mutable_data(gpu_place); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); + paddle::platform::float16* a = input1_gpu.data(); + paddle::platform::float16* b = input2_gpu.data(); + paddle::platform::float16* c = + input3_gpu.mutable_data(gpu_place); - paddle::operators::math::gemm( - context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), - c + 1, 4); + paddle::operators::math::gemm( + context, false, false, m, n, k, paddle::platform::float16(1), a, 3, b + 1, + 4, paddle::platform::float16(1), c + 1, 4); - TensorCopySync(input3_gpu, cpu_place, &input3); + paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) { } TEST(math_function, gemm_trans_cublas_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; - - Tensor input1; - Tensor input2; - Tensor input3; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor input3_gpu; + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); int m = 2; int n = 3; @@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input2, gpu_place, &input2_gpu); - TensorCopySync(input3, gpu_place, &input3_gpu); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); @@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - TensorCopySync(input3_gpu, cpu_place, &input3); + paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) { } TEST(math_function, gemm_trans_cublas_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; + paddle::framework::Tensor 
input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; - Tensor input1; - Tensor input2; - Tensor input3; - Tensor input1_gpu; - Tensor input2_gpu; - Tensor input3_gpu; - - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) { int m = 2; int n = 3; int k = 3; - float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + paddle::platform::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - float16* input2_ptr = input2.mutable_data({4, 3}, cpu_place); + paddle::platform::float16* input2_ptr = + input2.mutable_data({4, 3}, cpu_place); fill_fp16_data(input2_ptr, input2.numel(), {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); - float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + paddle::platform::float16* input3_ptr = + input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); - TensorCopySync(input1, gpu_place, &input1_gpu); - TensorCopySync(input2, gpu_place, &input2_gpu); - TensorCopySync(input3, gpu_place, &input3_gpu); - float16* a = input1_gpu.data(); - float16* b = input2_gpu.data(); - float16* c = input3_gpu.mutable_data(gpu_place); + paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); + paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); + paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); + paddle::platform::float16* a = input1_gpu.data(); + paddle::platform::float16* b = input2_gpu.data(); + paddle::platform::float16* c = + input3_gpu.mutable_data(gpu_place); - paddle::operators::math::gemm( - context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), - c + 1, 4); + paddle::operators::math::gemm( + context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3, + 3, paddle::platform::float16(1), c + 1, 4); - TensorCopySync(input3_gpu, cpu_place, &input3); + paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); context.Wait(); EXPECT_EQ(static_cast(input3_ptr[0]), 0); @@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) { template void GemvTest(int m, int n, bool trans) { - using namespace paddle::framework; - using namespace paddle::platform; - - Tensor mat_a; - Tensor vec_b; - Tensor vec_c; + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; - CPUPlace cpu_place; - CUDAPlace gpu_place(0); - CUDADeviceContext context(gpu_place); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CUDADeviceContext context(gpu_place); T* data_a = mat_a.mutable_data({m, n}, cpu_place); T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); T* data_c = vec_c.mutable_data({trans ? 
n : m}, cpu_place); - Tensor g_mat_a; - Tensor g_vec_b; - Tensor g_vec_c; + paddle::framework::Tensor g_mat_a; + paddle::framework::Tensor g_vec_b; + paddle::framework::Tensor g_vec_c; T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), gpu_place); T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), gpu_place); T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), gpu_place); @@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - TensorCopySync(mat_a, gpu_place, &g_mat_a); - TensorCopySync(vec_b, gpu_place, &g_vec_b); + paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a); + paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - TensorCopySync(g_vec_c, cpu_place, &vec_c); + paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1e1a6a221c71c9d9cb9fda468360cb502c5ea52f..d9a23299a4d5750fc8c7fe3e5d1f8cd94bcb9cae 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 97a2e81c84c060a8be57db6274839ee39edf466c..b871851798e48e6b598cb4ab8e2e42db478a3820 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/fluid/operators/math/pooling.h" +#include +#include namespace paddle { namespace operators { @@ -27,9 +28,10 @@ template class Pool2dFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -63,11 +65,11 @@ class Pool2dFunctor { T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - pool_process.compute(ele, input_data[h * input_width + w]); + pool_process.compute(input_data[h * input_width + w], &ele); } } int pool_size = (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, (static_cast(pool_size))); + pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } } @@ -86,13 +88,12 @@ class Pool2dFunctor { template class Pool2dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_grad_process, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, PoolProcess pool_grad_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -131,8 +132,8 @@ class Pool2dGradFunctor { input_data[h * input_width + w], output_data[ph * output_width + pw], output_grad_data[ph * output_width + pw], - input_grad_data[h * input_width + w], - static_cast(scale)); + static_cast(scale), + input_grad_data + h * input_width + w); } } } @@ -154,12 +155,11 @@ class Pool2dGradFunctor { template class MaxPool2dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -246,9 +246,10 @@ template class Pool3dFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int 
batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -293,14 +294,14 @@ class Pool3dFunctor { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { pool_process.compute( - ele, - input_data[(d * input_height + h) * input_width + w]); + input_data[(d * input_height + h) * input_width + w], + &ele); } } } int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, static_cast(pool_size)); + pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } } @@ -320,13 +321,12 @@ class Pool3dFunctor { template class Pool3dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_grad_process, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, PoolProcess pool_grad_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -379,8 +379,8 @@ class Pool3dGradFunctor { (pd * output_height + ph) * output_width + pw; pool_grad_process.compute( input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], - input_grad_data[input_idx], static_cast(scale)); + output_grad_data[output_idx], static_cast(scale), + input_grad_data + input_idx); } } } @@ -404,12 +404,11 @@ class Pool3dGradFunctor { template class MaxPool3dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -510,9 +509,10 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -576,8 +576,9 @@ class MaxPool2dWithIndexGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* 
input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; @@ -628,9 +629,10 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -708,8 +710,9 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 274263c69c535249fceee11075c5948b1fc34358..b1c76350d1724629bae175abf47e6671a1532242 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { @@ -47,11 +49,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - pool_process.compute(ele, input_data[h * input_width + w]); + pool_process.compute(input_data[h * input_width + w], &ele); } } int pool_size = (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, (static_cast(pool_size))); + pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } } @@ -96,8 +98,8 @@ __global__ void KernelPool2DGrad( int pool_size = (hend - hstart) * (wend - wstart); int output_sub_idx = ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], - output_grad[output_sub_idx], gradient, - static_cast(1.0 / pool_size)); + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), &gradient); } } input_grad[index] = gradient; @@ -158,9 +160,10 @@ template class Pool2dFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -201,9 +204,11 @@ class Pool2dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - 
const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* input_grad) { + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -246,8 +251,10 @@ class MaxPool2dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -340,12 +347,12 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { pool_process.compute( - ele, input_data[(d * input_height + h) * input_width + w]); + input_data[(d * input_height + h) * input_width + w], &ele); } } } int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, static_cast(pool_size)); + pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } } @@ -405,8 +412,8 @@ __global__ void KernelPool3DGrad( int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); int output_sub_idx = (pd * output_height + ph) * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], - output_grad[output_sub_idx], gradient, - static_cast(1.0 / pool_size)); + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), &gradient); } } } @@ -474,9 +481,10 @@ template class Pool3dFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -525,9 +533,11 @@ class Pool3dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* input_grad) { + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -578,8 +588,10 @@ class MaxPool3dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& 
strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -736,9 +748,10 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -779,8 +792,9 @@ class MaxPool2dWithIndexGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -937,9 +951,10 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -987,8 +1002,9 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 74cb42f0d02086a6776b22d57832757ae3ffc470..2538d739cce95d1b2fc5b3f905af5e6d94cf7af5 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -23,8 +24,8 @@ namespace operators { namespace math { #define FLT_MAX \ - __FLT_MAX__ // It might need to be placed in another file, but I'm still - // wondering where to put it. + __FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm + // still wondering where to put it. /* * \brief Extracting simple operations from pooling. @@ -40,33 +41,33 @@ template class MaxPool { public: DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(T& y, const T& x) { y = y > x ? 
y : x; } - DEVICE inline void finalize(T& y, const T& pool_field) {} + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} }; template class AvgPool { public: DEVICE inline T initial() { return static_cast(0); } - DEVICE inline void compute(T& y, const T& x) { y += x; } - DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; } + DEVICE inline void compute(const T& x, T* y) { *y += x; } + DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; } }; template class MaxPoolGrad { public: - DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, - T scale) { - dx += dy * (x == y); + DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, + T* dx) { + *dx += dy * (x == y); } }; template class AvgPoolGrad { public: - DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, - T scale) { - dx += (scale * dy); + DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, + T* dx) { + *dx += (scale * dy); } }; @@ -88,8 +89,9 @@ template class Pool2dFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, PoolProcess pool_compute, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, framework::Tensor* output); }; @@ -98,9 +100,11 @@ class Pool2dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* input_grad); + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* input_grad); }; template @@ -108,8 +112,10 @@ class MaxPool2dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; @@ -117,8 +123,9 @@ template class Pool3dFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, PoolProcess pool_compute, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, framework::Tensor* output); }; @@ -127,9 +134,11 @@ class Pool3dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* input_grad); + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* input_grad); }; template @@ -137,8 +146,10 @@ class MaxPool3dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& 
output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; @@ -153,8 +164,9 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, framework::Tensor* output, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, framework::Tensor* mask); }; @@ -163,8 +175,9 @@ class MaxPool2dWithIndexGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; @@ -172,8 +185,9 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, framework::Tensor* output, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, framework::Tensor* mask); }; @@ -182,8 +196,9 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 7b31ee8e389b94eeaa04ace52251a23933230d34..a92762c7fea865fad2c7784736cce93a8af21892 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu index 942d9b13fc1a8f578da779351be9ba9de7fcce33..e89b27855bdeba3a5189feff94eb063ddfb9b9b8 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu @@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
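For reference, the pooling helpers above now take their accumulator through a pointer instead of a mutable reference, so kernels call compute(x, &ele) and finalize(n, &ele). A minimal standalone sketch of that call pattern, using a simplified AvgPoolSketch stand-in rather than the real paddle::operators::math::AvgPool so it compiles on its own:

#include <cstdio>

// Stand-in that mirrors the new pooling-process interface: the running
// result is an explicit output pointer, matching the updated pooling.h.
template <typename T>
struct AvgPoolSketch {
  T initial() const { return static_cast<T>(0); }
  void compute(const T& x, T* y) const { *y += x; }                    // accumulate one input
  void finalize(const T& pool_field, T* y) const { *y /= pool_field; } // divide by window size
};

int main() {
  const float window[4] = {1.f, 2.f, 3.f, 4.f};
  AvgPoolSketch<float> pool;
  float ele = pool.initial();
  for (float v : window) pool.compute(v, &ele);  // same sequence the pooling kernels follow
  pool.finalize(static_cast<float>(4), &ele);
  std::printf("avg = %f\n", ele);                // 2.5
  return 0;
}

The CPU and CUDA kernels in pooling.cc and pooling.cu follow this compute/finalize sequence against the real MaxPool, AvgPool, MaxPoolGrad and AvgPoolGrad classes.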
*/ +#include #include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" TEST(selected_rows_functor, gpu_add) { - using namespace paddle::framework; - using namespace paddle::platform; - using namespace paddle::operators::math; - - CUDAPlace gpu_place(0); - CPUPlace cpu_place; - CUDADeviceContext ctx(gpu_place); - SetConstant functor; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDADeviceContext ctx(gpu_place); + paddle::operators::math::SetConstant + functor; int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + gpu_place); functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{new SelectedRows()}; + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; auto* out_value = output->mutable_value(); - // simplely concat two SelectedRows - out_value->mutable_data(make_ddim({7, 10}), gpu_place); + // simply concat two SelectedRows + out_value->mutable_data(paddle::framework::make_ddim({7, 10}), + gpu_place); - SelectedRowsAdd add_functor; + paddle::operators::math::SelectedRowsAdd + add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[6], 9); - Tensor out_cpu; - TensorCopy(*out_value, cpu_place, ctx, &out_cpu); + paddle::framework::Tensor out_cpu; + paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{new Tensor()}; - tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + std::unique_ptr tensor1{ + new paddle::framework::Tensor()}; + tensor1->mutable_data( + paddle::framework::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - std::unique_ptr tensor2{new Tensor()}; - tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); + std::unique_ptr tensor2{ + new paddle::framework::Tensor()}; + tensor2->mutable_data( + paddle::framework::make_ddim({height, row_numel}), gpu_place); - SelectedRowsAddTensor add_tensor_functor; + paddle::operators::math::SelectedRowsAddTensor< + paddle::platform::CUDADeviceContext, float> + add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); - Tensor tensor2_cpu; - TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu); + paddle::framework::Tensor tensor2_cpu; + paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, 
&tensor2_cpu); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); @@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) { } TEST(selected_rows_functor, gpu_add_to) { - using namespace paddle::framework; - using namespace paddle::platform; - using namespace paddle::operators::math; - - CUDAPlace gpu_place(0); - CPUPlace cpu_place; - CUDADeviceContext ctx(gpu_place); - SetConstant functor; + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDADeviceContext ctx(gpu_place); + paddle::operators::math::SetConstant + functor; int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + gpu_place); functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{new SelectedRows()}; + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); - // simplely concat two SelectedRows - out_value->mutable_data(make_ddim({7, 10}), gpu_place); + // simply concat two SelectedRows + out_value->mutable_data(paddle::framework::make_ddim({7, 10}), + gpu_place); - SelectedRowsAddTo add_to_functor; + paddle::operators::math::SelectedRowsAddTo< + paddle::platform::CUDADeviceContext, float> + add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[6], 9); - Tensor out_cpu; - TensorCopy(*out_value, cpu_place, ctx, &out_cpu); + paddle::framework::Tensor out_cpu; + paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{new Tensor()}; - tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + std::unique_ptr tensor1{ + new paddle::framework::Tensor()}; + tensor1->mutable_data( + paddle::framework::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor< + paddle::platform::CUDADeviceContext, float> + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); - Tensor tensor1_cpu; - TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu); + paddle::framework::Tensor tensor1_cpu; + paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu); ctx.Wait(); auto* tensor1_cpu_data = tensor1_cpu.data(); diff --git a/paddle/fluid/operators/math/sequence_padding.cc 
b/paddle/fluid/operators/math/sequence_padding.cc index 38bd3b99758555a24b3b8eb0de06cca8e424fcb2..d63c6c4ed55331235188c1c750468d4e75b9b7f2 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -22,7 +22,7 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor& padding, + const framework::LoDTensor& seq, framework::Tensor* padding, bool norm_by_times) { auto lod = seq.lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, @@ -37,7 +37,7 @@ class PaddingLoDTensorFunctor { "The first dimension of LoDTensor seq should be " "equal to the sum of all sequences's length."); - auto padding_dims = padding.dims(); + auto padding_dims = padding->dims(); PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, "The input padding should be a 3-D Tensor of shape " "[max_sequence_length, num_sequences, sequence_width]."); @@ -58,7 +58,7 @@ class PaddingLoDTensorFunctor { "width of sequence in LoDTensor seq."); const T* seq_data = seq.data(); - T* padding_data = padding.data(); + T* padding_data = padding->data(); for (int64_t i = 0; i < max_sequence_length; ++i) { for (int64_t j = 0; j < num_sequences; ++j) { int64_t start_pos = abs_offset_lod[level][j]; @@ -84,16 +84,16 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - framework::LoDTensor& seq, const framework::Tensor& padding, + framework::LoDTensor* seq, const framework::Tensor& padding, bool norm_by_times) { - auto lod = seq.lod(); + auto lod = seq->lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, "The LoD of LoDTensor seq should not be null."); const size_t level = 0; framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - auto seq_dims = seq.dims(); + auto seq_dims = seq->dims(); PADDLE_ENFORCE_EQ(seq_dims[0], static_cast(abs_offset_lod[level].back()), "The first dimension of LoDTensor seq should be " @@ -114,13 +114,13 @@ class UnpaddingLoDTensorFunctor { "The second dimension of Tensor padding should be " "the number of sequences in LoDTensor seq."); - const int64_t sequence_width = seq.numel() / seq_dims[0]; + const int64_t sequence_width = seq->numel() / seq_dims[0]; PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, "The third dimension of Tensor padding should be the " "width of sequence in LoDTensor seq."); const T* padding_data = padding.data(); - T* seq_data = seq.data(); + T* seq_data = seq->data(); for (int64_t i = 0; i < num_sequences; ++i) { int64_t start_pos = abs_offset_lod[level][i]; int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index c044e6fc32bab8f72a0dce45b4abdb1174a0d72f..0956a0c17d387f4a174c7ed4e9b1b1f816dcf4ae 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/operators/math/sequence_padding.h" namespace paddle { @@ -61,7 +62,7 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor& padding, + const framework::LoDTensor& seq, framework::Tensor* padding, bool norm_by_times) { auto lod = seq.lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, @@ -76,7 +77,7 @@ class PaddingLoDTensorFunctor { "The first dimension of LoDTensor seq should be " "equal to the sum of all sequences's length."); - auto padding_dims = padding.dims(); + auto padding_dims = padding->dims(); PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, "The input padding should be a 3-D Tensor of shape " "[max_sequence_length, num_sequences, sequence_width]."); @@ -97,8 +98,8 @@ class PaddingLoDTensorFunctor { "width of sequence in LoDTensor seq."); if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(seq, context.GetPlace(), context, &padding); - padding.Resize(padding_dims); + TensorCopy(seq, context.GetPlace(), context, padding); + padding->Resize(padding_dims); return; } @@ -117,7 +118,7 @@ class PaddingLoDTensorFunctor { dim3 grid(grid_dim_x, grid_dim_y); const T* seq_data = seq.data(); - T* padding_data = padding.data(); + T* padding_data = padding->data(); if (norm_by_times) { SequencePaddingKernel<<>>( padding_data, const_cast(seq_data), @@ -136,16 +137,16 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - framework::LoDTensor& seq, const framework::Tensor& padding, + framework::LoDTensor* seq, const framework::Tensor& padding, bool norm_by_times) { - auto lod = seq.lod(); + auto lod = seq->lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, "The lod of LoDTensor seq should not be null."); const size_t level = 0; framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - auto seq_dims = seq.dims(); + auto seq_dims = seq->dims(); PADDLE_ENFORCE_EQ(seq_dims[0], static_cast(abs_offset_lod[level].back()), "The first dimension of LoDTensor seq should be " @@ -166,14 +167,14 @@ class UnpaddingLoDTensorFunctor { "The second dimension of Tensor padding should be " "the number of sequences in LoDTensor seq."); - const int64_t sequence_width = seq.numel() / seq_dims[0]; + const int64_t sequence_width = seq->numel() / seq_dims[0]; PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, "The third dimension of Tensor padding should be the " "width of sequence in LoDTensor seq."); if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(padding, context.GetPlace(), context, &seq); - seq.Resize(seq_dims); + TensorCopy(padding, context.GetPlace(), context, seq); + seq->Resize(seq_dims); return; } @@ -192,7 +193,7 @@ class UnpaddingLoDTensorFunctor { dim3 grid(grid_dim_x, grid_dim_y); const T* padding_data = padding.data(); - T* seq_data = seq.data(); + T* seq_data = seq->data(); if (norm_by_times) { SequencePaddingKernel<<>>( const_cast(padding_data), seq_data, diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 17f044b9d6667ed6a45bf5a0c2362c351d2c2beb..b56e6db1ebdac1a00561c07845c03bb8fbd8d35a 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -64,13 +65,13 @@ template class PaddingLoDTensorFunctor { public: void operator()(const DeviceContext& context, const framework::LoDTensor& seq, - framework::Tensor& padding, bool norm_by_times); + framework::Tensor* padding, bool norm_by_times); }; template class UnpaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, framework::LoDTensor& seq, + void operator()(const DeviceContext& context, framework::LoDTensor* seq, const framework::Tensor& padding, bool norm_by_times); }; diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index e3d62144856eb30a64007b54c16719cc028c70f2..b9a1b9ae4d6e8c82aa782190d9e145e7a2b502be 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -54,12 +54,12 @@ void TestSequencePadding(const paddle::framework::LoD& lod, static_cast(sequence_width)}); padding.mutable_data(padding_dims, *place); paddle::operators::math::PaddingLoDTensorFunctor()( - *context, seq, padding, false); + *context, seq, &padding, false); seq_back.set_lod(lod); seq_back.mutable_data(seq_dims, *place); paddle::operators::math::UnpaddingLoDTensorFunctor()( - *context, seq_back, padding, false); + *context, &seq_back, padding, false); if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 36f6402396379ab79fcbc71fd43d380227adccc4..97c2e69fe5327956fc2828781fe3a37b88cc1b99 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 430bf13c3f8d627f2b4cc24b005f2be5a66cefac..079338c1d3dac6a9403c5871f3face9f1f8e77d2 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 367f343d51712d38edbb7eb50b41433258cf8c9d..c467ae8427d8f461b332eed8075631ed7e47b96e 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index e0f3ef36879327c0592bb955dd800b44b228e721..28e1a752e34cf0171785a0341d8f0d8d3712fc7b 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index c9fabc8d485b3bba2c8ae14b3616d0bdcae058a7..6903cf83b41a54b54382fac2cf58f7bfe192b55f 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -204,6 +204,8 @@ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( - mul, ops::MulKernel); + mul, ops::MulKernel, + ops::MulKernel); REGISTER_OP_CPU_KERNEL( - mul_grad, ops::MulGradKernel); + mul_grad, ops::MulGradKernel, + ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc index 757f9c3ee2665c7ac654659416fe8dd727dca16d..81f3e42bf412fa4d2cb48405f2f8ee49b6aa0b67 100644 --- a/paddle/fluid/operators/mul_op.cu.cc +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel, + ops::MulKernel, ops::MulKernel); REGISTER_OP_CUDA_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel, + ops::MulGradKernel); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 240ac895e2c8391322411d347384f4834995eb7c..625065692c1f32c89d9e566d00051e237ac9a3af 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 0bdfee0434f6934b20083c42dd5da64f4cddf8e2..f905d690f984a20622c5fbcbcc813d888dfb19d9 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 67083455a7579a4bbb6d9598a77b68a8375cf815..dd8e62aca47a3b34a3788a43cc0043a887af818f 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { @@ -220,7 +220,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += __shfl_down(val, offset); + val += platform::__shfl_down_sync(0, val, offset); } __syncthreads(); @@ -276,7 +276,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += __shfl_down(val, offset); + val += platform::__shfl_down_sync(0, val, offset); } __syncthreads(); diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index a7ba1e0ae1d22a22cf2943c9aaf0c394ef4ae326..74385ee47543e3f4887081c2225212996d3df3f1 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" USE_NO_KERNEL_OP(save); USE_NO_KERNEL_OP(load); @@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) { } } } + +TEST(SaveLoadFP16Op, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + + float* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(paddle::platform::float16(i)); + } + + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + attrs.insert({"save_as_fp16", true}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + paddle::platform::float16* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], static_cast(actual[i])); + } +} diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 4a715c4baab2da7b7af86ada22ee88a16b05a814..f45d07ed90d52d204e9a3a5c2efe6df6b27ebfe6 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase { const platform::Place &place) const override { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); + auto save_as_fp16 = Attr("save_as_fp16"); if (FileExists(filename) && !overwrite) { PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", @@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::SerializeToStream(fout, tensor, dev_ctx); + auto in_dtype = framework::ToDataType(tensor.type()); + auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } } }; @@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk. "(boolean, default true)" "Overwrite the output file if exist") .SetDefault(true); + AddAttr("save_as_fp16", + "(boolean, default false)" + "If true, the tensor will be converted to float16 data " + "type and then saved. Otherwise, the tensor will be " + "directly saved without data type conversion.") + .SetDefault(false); AddAttr("file_path", "(string)" "The \"file_path\" where the variable will be saved.") diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index d2e1f3cb2ff9c8254cd4815a0f8750966a6e161c..93e55d410388a672eb749302162ea81de4c6cba1 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -113,7 +113,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, op->SetAttrMap(attrs); } -void StartServerNet(bool is_sparse) { +void StartServerNet(bool is_sparse, std::atomic *initialized) { f::Scope scope; p::CPUPlace place; if (is_sparse) { @@ -121,7 +121,6 @@ void StartServerNet(bool is_sparse) { } else { InitTensorsInScope(place, &scope); } - // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); @@ -129,7 +128,6 @@ void StartServerNet(bool is_sparse) { auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. 
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); - f::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:0")}); attrs.insert({"Fanin", 1}); @@ -141,12 +139,16 @@ void StartServerNet(bool is_sparse) { attrs.insert({"sync_mode", true}); listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); + *initialized = true; listen_and_serv_op->Run(scope, place); LOG(INFO) << "server exit"; } TEST(SendRecvOp, CPUDense) { - std::thread server_thread(StartServerNet, false); + std::atomic initialized{false}; + std::thread server_thread(StartServerNet, false, &initialized); + while (!initialized) { + } sleep(5); // wait server to start // local net f::Scope scope; @@ -156,9 +158,11 @@ TEST(SendRecvOp, CPUDense) { scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; - selected_port = static_cast( - listen_and_serv_op.get()) - ->GetSelectedPort(); + auto *listen_and_serv_op_ptr = + static_cast( + listen_and_serv_op.get()); + ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); + selected_port = listen_and_serv_op_ptr->GetSelectedPort(); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); attrs.insert({"endpoints", std::vector({endpoint})}); attrs.insert({"epmap", std::vector({endpoint})}); @@ -184,8 +188,12 @@ TEST(SendRecvOp, CPUDense) { } TEST(SendRecvOp, CPUSparse) { - std::thread server_thread(StartServerNet, true); - sleep(3); // wait server to start + std::atomic initialized; + initialized = false; + std::thread server_thread(StartServerNet, true, &initialized); + while (!initialized) { + } + sleep(5); // wait server to start // local net f::Scope scope; p::CPUPlace place; @@ -193,9 +201,11 @@ TEST(SendRecvOp, CPUSparse) { InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; - selected_port = static_cast( - listen_and_serv_op.get()) - ->GetSelectedPort(); + auto *listen_and_serv_op_ptr = + static_cast( + listen_and_serv_op.get()); + ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); + selected_port = listen_and_serv_op_ptr->GetSelectedPort(); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); attrs.insert({"endpoints", std::vector({endpoint})}); attrs.insert({"epmap", std::vector({endpoint})}); diff --git a/paddle/fluid/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu index fc9b91c351defb92246e0966b9993fd1e288aaac..3a58e47f1132cd1ac85584b2470e8c6cddcfb28a 100644 --- a/paddle/fluid/operators/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_erase_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/sequence_erase_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index c00765e5d59af068e5682b39ebace5f3d7a62250..550677b22694085059e914678a5361d914b455bc 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
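The send/recv test above replaces a bare sleep before connecting with a handshake on a std::atomic<bool> that the server thread sets once the listen_and_serv op has been created. A self-contained sketch of that handshake, with the server body reduced to a placeholder (StartServer is a hypothetical stand-in for StartServerNet):

#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

void StartServer(std::atomic<bool>* initialized) {
  // ... build the server-side program here (elided) ...
  *initialized = true;  // signal: the server op object now exists
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // placeholder for Run()
}

int main() {
  std::atomic<bool> initialized{false};
  std::thread server(StartServer, &initialized);
  while (!initialized) {
    // spin until the server thread reports it has been set up
  }
  std::printf("server initialized, client may connect\n");
  server.join();
  return 0;
}

Note that the test still sleeps afterwards; the flag only guarantees the op object exists, not that the port is already bound.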
*/ #include #include "paddle/fluid/operators/sequence_expand_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 06cb0550ad7d4ad0241a4f439ea9ac16d9714c38..bd04c60ffa5c1e5eb8d2051ce495ab6c685b14b5 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -48,6 +48,24 @@ class SGDOp : public framework::OperatorWithKernel { } }; +class SGDOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto input_var = op_desc.Input("Param")[0]; + for (auto& out_var : op_desc.Output("ParamOut")) { + if (block->FindRecursiveOrCreateVar(input_var).GetType() == + framework::proto::VarType::SELECTED_ROWS) { + block->FindRecursiveOrCreateVar(out_var).SetType( + framework::proto::VarType::SELECTED_ROWS); + } else { + block->FindRecursiveOrCreateVar(out_var).SetType( + framework::proto::VarType::LOD_TENSOR); + } + } + } +}; + class SGDOpMaker : public framework::OpProtoAndCheckerMaker { public: SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -74,5 +92,6 @@ $$param\_out = param - learning\_rate * grad$$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker, + paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType); REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, ops::SGDOpKernel); diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 9d211541c0bf729393b8190edb18e101d5e07d1a..4722be7a666d3e8f3c25c9499f88ddda835f60e3 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/sgd_op.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index acaefaacdaa593c090d81084fdc1b3665314833f..3b5cf68dd4f28d23e507058337fe55de9b88d3cd 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -116,11 +116,31 @@ uniform distribution. 
.SetDefault(framework::proto::VarType::FP32); } }; + +class UniformRandomOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("Out").front(); + if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == + framework::proto::VarType::SELECTED_ROWS) { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(framework::proto::VarType::LOD_TENSOR); + } + } +}; + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, - paddle::operators::UniformRandomOpMaker); +REGISTER_OPERATOR(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::UniformRandomOpVarTypeInference); + REGISTER_OP_CPU_KERNEL(uniform_random, paddle::operators::CPUUniformRandomKernel, paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 85131d002595f7681e4bec4135e28fe49cf842fb..705cc894c06b207f4e4e45fc771c04fa3cbdf6d5 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -162,7 +162,7 @@ class WarpCTCKernel : public framework::OpKernel { static_cast(sequence_width)}); warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); math::PaddingLoDTensorFunctor()( - ctx.template device_context(), *logits, warpctc_logits, + ctx.template device_context(), *logits, &warpctc_logits, false); const T* warpctc_logits_data = warpctc_logits.data(); @@ -217,7 +217,7 @@ class WarpCTCGradKernel : public framework::OpKernel { logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), *logits_grad, + ctx.template device_context(), logits_grad, *warpctc_grad, norm_by_times); const T* loss_grad_data = loss_grad->data(); diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_primitives.h similarity index 85% rename from paddle/fluid/platform/cuda_helper.h rename to paddle/fluid/platform/cuda_primitives.h index 8758af0804ae08fec6fa64d7387f197f046ce20e..866ff30a8be7be124a72a8dc7e70ef4140ee716a 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -66,5 +66,18 @@ CUDA_ATOMIC_WRAPPER(Add, double) { } #endif +// __shfl_down has been deprecated as of CUDA 9.0. +#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b20b514fcdd0b41fefa0933bc2d22645e7d4b6d6..c925686f8382da1758fb7cdc048253290ef69513 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle. 
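The renamed cuda_primitives.h keeps call sites such as the row_conv kernels compiling on both toolchains: before CUDA 9.0 the wrapper forwards to __shfl_down and CREATE_SHFL_MASK yields 0u, while on CUDA 9.0 and later the native __shfl_down_sync and __ballot_sync intrinsics are used. A standalone CUDA 9+ sketch of the same warp-sum pattern (WarpSum is a hypothetical kernel, not part of this patch):

#include <cstdio>

__global__ void WarpSum(const float* in, float* out) {
  float val = in[threadIdx.x];
  unsigned mask = __ballot_sync(0xFFFFFFFFu, 1);  // what CREATE_SHFL_MASK expands to on CUDA 9+
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);   // lane 0 accumulates the warp sum
  }
  if (threadIdx.x == 0) *out = val;
}

int main() {
  float h_in[32], h_out = 0.f, *d_in = nullptr, *d_out = nullptr;
  for (int i = 0; i < 32; ++i) h_in[i] = 1.f;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  WarpSum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("warp sum = %f\n", h_out);          // 32 ones -> 32.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Inside Paddle the reduction loop is written against platform::__shfl_down_sync instead of the raw intrinsic, so the same source also builds with CUDA 8.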
const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, std::vector &local_scopes, - bool allow_op_delay, bool customize_loss_grad) { - new (&self) ParallelExecutor(num_threads, use_event, places, - params, bcast_vars, main_program, - loss_var_name, scope, local_scopes, - allow_op_delay, customize_loss_grad); + bool allow_op_delay, bool use_default_grad_scale) { + new (&self) ParallelExecutor( + num_threads, use_event, places, params, bcast_vars, + main_program, loss_var_name, scope, local_scopes, + allow_op_delay, use_default_grad_scale); }) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) // NOTE: even we return a vec* to Python use reference policy. diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index dcd711a33ff3a35fdd51d11f54a3343a0bb491c9..93b09ed6922b32a5531224acc470daf0d97f95bd 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -107,7 +107,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { return self.data()[offset]; } else { std::shared_ptr dst(new framework::Tensor); - framework::TensorCopy(self, platform::CPUPlace(), dst.get()); + framework::TensorCopySync(self, platform::CPUPlace(), dst.get()); return dst->data()[offset]; } } @@ -117,9 +117,9 @@ template void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { if (platform::is_gpu_place(self->place())) { std::shared_ptr dst(new framework::Tensor); - framework::TensorCopy(*self, platform::CPUPlace(), dst.get()); + framework::TensorCopySync(*self, platform::CPUPlace(), dst.get()); dst->data()[offset] = elem; - framework::TensorCopy(*dst.get(), self->place(), self); + framework::TensorCopySync(*dst.get(), self->place(), self); } else if (platform::is_cpu_place(self->place())) { self->data()[offset] = elem; diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 94628270228b9e7fd32405bdcb5e11c163ba4791..7e00bd38487902227c3b4521db20cdbe314059be 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -155,7 +155,7 @@ EOF function gen_dockerfile() { # Set BASE_IMAGE according to env variables if [[ ${WITH_GPU} == "ON" ]]; then - BASE_IMAGE="nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04" + BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04" else BASE_IMAGE="ubuntu:16.04" fi diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 654c8272a18e5adb01e75be94985a80502ba2c8d..53455fd86041f87d4050eba89f37c3435fed1ae1 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -208,8 +208,8 @@ EOF --platform=android-$ANDROID_API \ --install-dir=$ANDROID_STANDALONE_TOOLCHAIN - BUILD_ROOT=${PADDLE_ROOT}/build - DEST_ROOT={PADDLE_ROOT}/install + BUILD_ROOT=${PADDLE_ROOT}/build_android + DEST_ROOT=${PADDLE_ROOT}/install_android mkdir -p $BUILD_ROOT cd $BUILD_ROOT @@ -349,13 +349,18 @@ function gen_docs() { ======================================== EOF cmake .. 
\ + -DCMAKE_BUILD_TYPE=Release \ -DWITH_DOC=ON \ -DWITH_GPU=OFF \ - -DWITH_AVX=${WITH_AVX:-ON} \ - -DWITH_SWIG_PY=ON \ + -DWITH_MKL=OFF \ -DWITH_STYLE_CHECK=OFF make -j `nproc` paddle_docs paddle_apis + + # check websites for broken links + linkchecker doc/v2/en/html/index.html + linkchecker doc/v2/cn/html/index.html + linkchecker doc/v2/api/en/html/index.html } function gen_html() { diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index 252227ef88abbe238686dd5d7672e57ad68dab7e..311eb576014a741c67c0c6e62740b954a4a02462 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -28,11 +28,16 @@ function start_build_docker() { docker rm -f "${CONTAINER_ID}" 1>/dev/null fi + apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g' DOCKER_ENV=$(cat < var + self.vars = collections.OrderedDict() # var_name --> var self.ops = list() # operator list self.program = program - self.removed_vars = dict() + self.removed_vars = collections.OrderedDict() def __str__(self): return self.to_string(True) diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..3ea50bf196d00152e6579623c981ecbfb57b8e3b --- /dev/null +++ b/python/paddle/fluid/inferencer.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['Inferencer', ] + + +class Inferencer(object): + def __init__(self, network_func, params, place=None): + # 1. we need to generate a framework.Program by calling + # network_func. Reference: fluid.program_guard in test_word2vec.py + + # 2. move the default_main_program to self.program. + + # 3. run the default_startup program. 
+ self.params = params + self.place = place + + def infer(self, inputs): + # run self.program + pass diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index cc71c2136a6756ff094f6e06b8e200c6a68db06a..acfad45704d4ea9e28711c019db3563489aab3ff 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -168,7 +168,9 @@ class ListenAndServ(object): 'endpoint': self.endpoint, 'Fanin': self.fan_in, 'OptimizeBlock': current_block, - 'PrefetchBlock': empty_block + 'PrefetchBlock': empty_block, + 'sync_mode': True, # did not support async now in layers + 'grad_to_block_id': [""] }) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index da066c34bdeba1f1b76f8d1cafd9244b2f7708fa..4be0dc6a6bfeed3ac254f5c363d3560973c031b4 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -193,10 +193,7 @@ def assign(input, output): helper = LayerHelper('assign', **locals()) if isinstance(input, Variable): helper.append_op( - type='scale', - inputs={'X': [input]}, - outputs={'Out': [output]}, - attrs={'scale': 1.0}) + type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) elif isinstance(input, numpy.ndarray): dtype = convert_np_dtype_to_dtype_(input.dtype) if dtype == VarDesc.VarType.FP32: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 4adbb2ea99b58c78c5c08c7ac8a556ca1de1615e..f4128dcbe935ea5caf63b8c0b377f9bed36d6705 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -30,7 +30,7 @@ class ParallelExecutor(object): num_threads=None, allow_op_delay=False, share_vars_from=None, - customize_loss_grad=False): + use_default_grad_scale=True): """ ParallelExecutor can run program in parallel. @@ -46,6 +46,11 @@ class ParallelExecutor(object): improve performance in some cases, defalut False. share_vars_from(ParallelExecutor, default None): If provied, it will share variables from the specified ParallelExecutor. + use_default_grad_scale(bool, default True): If set True, a default + scale value equal to `1./device_count` would be multiplied to + gradients of each device and scaled gradients would be + aggregated. Otherwise, a customized scale value should be fed + to the network. Returns: A ParallelExecutor object. @@ -124,7 +129,7 @@ class ParallelExecutor(object): scope, local_scopes, allow_op_delay, - customize_loss_grad) + use_default_grad_scale) self.scope = scope def run(self, fetch_list, feed=None, feed_dict=None): diff --git a/python/paddle/fluid/params.py b/python/paddle/fluid/params.py new file mode 100644 index 0000000000000000000000000000000000000000..a5d257e53a2958acd1b8f6ef29d0f9f531b36678 --- /dev/null +++ b/python/paddle/fluid/params.py @@ -0,0 +1,39 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import core + +__all__ = ['Params', ] + + +class Params(object): + def __init__(self, path=None): + self.scope = core.Scope() + + if path: + self._load(path) + + def _load(self, path): + # reference: load_persistables in io.py + pass + + def save(self, path): + # reference: save_persistables in io.py + pass + + def add_params(self, scope): + # take the keys from the scope, + # if not already exists in self.scope, + # add the key and value into self.scope. + pass diff --git a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..5cbfdef91a64ae7c58d060edfb7b9f3bc8160f2b --- /dev/null +++ b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py @@ -0,0 +1,145 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import numpy + + +def resnet_cifar10(input, depth=32): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + def shortcut(input, ch_in, ch_out, stride): + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) + short = shortcut(input, ch_in, ch_out, stride) + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') + + def layer_warp(block_func, input, ch_in, ch_out, count, stride): + tmp = block_func(input, ch_in, ch_out, stride) + for i in range(1, count): + tmp = block_func(tmp, ch_out, ch_out, 1) + return tmp + + assert (depth - 2) % 6 == 0 + n = (depth - 2) / 6 + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + return pool + + +def inference_network(): + classdim = 10 + data_shape = [3, 32, 32] + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + net = resnet_cifar10(images, 32) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + return predict + + +def train_network(): + predict = inference_network() + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = 
+    return avg_cost, accuracy
+
+
+def train(use_cuda, save_path):
+    BATCH_SIZE = 128
+    EPOCH_NUM = 1
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+        batch_size=BATCH_SIZE)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndIteration):
+            if (event.batch_id % 10) == 0:
+                avg_cost, accuracy = trainer.test(reader=test_reader)
+
+                print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
+                    event.batch_id + 1, avg_cost, accuracy))
+
+                if accuracy > 0.01:  # Low threshold for speeding up CI
+                    trainer.params.save(save_path)
+                    return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    trainer = fluid.Trainer(
+        train_network,
+        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
+        place=place,
+        event_handler=event_handler)
+    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
+
+
+def infer(use_cuda, save_path):
+    params = fluid.Params(save_path)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(inference_network, params, place=place)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    # Use normalized image pixels as input data, which should be in the range
+    # [0, 1.0].
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+    results = inferencer.infer({'pixel': tensor_img})
+
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    save_path = "image_classification_resnet.inference.model"
+    train(use_cuda, save_path)
+    infer(use_cuda, save_path)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
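Both CIFAR-10 scripts wrap their data source the same way, and the pattern works for any reader creator. A small sketch with a toy in-memory reader follows; toy_reader is fabricated for illustration, only the paddle.batch / paddle.reader.shuffle wrapping comes from the script above.

    import numpy
    import paddle

    def toy_reader():
        # a reader creator: a function that returns a sample generator
        def reader():
            for i in range(256):
                yield numpy.random.rand(3, 32, 32).astype("float32"), i % 10
        return reader

    batched = paddle.batch(
        paddle.reader.shuffle(toy_reader(), buf_size=128), batch_size=32)
    for mini_batch in batched():
        pass  # each mini_batch is a list of up to 32 (image, label) tuples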
diff --git a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a6a5ff61a913ad6cbc609f8376afcbc621d60e2
--- /dev/null
+++ b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+    return fc2
+
+
+def inference_network():
+    classdim = 10
+    data_shape = [3, 32, 32]
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    return predict
+
+
+def train_network():
+    predict = inference_network()
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=predict, label=label)
+    return avg_cost, accuracy
+
+
+def train(use_cuda, save_path):
+    BATCH_SIZE = 128
+    EPOCH_NUM = 1
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+        batch_size=BATCH_SIZE)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndIteration):
+            if (event.batch_id % 10) == 0:
+                avg_cost, accuracy = trainer.test(reader=test_reader)
+
+                print('BatchID {0:04}, Loss {1:2.2}, Acc {2:2.2}'.format(
+                    event.batch_id + 1, avg_cost, accuracy))
+
+                if accuracy > 0.01:  # Low threshold for speeding up CI
+                    trainer.params.save(save_path)
+                    return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    trainer = fluid.Trainer(
+        train_network,
+        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
+        place=place,
+        event_handler=event_handler)
+    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
+
+
+def infer(use_cuda, save_path):
+    params = fluid.Params(save_path)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(inference_network, params, place=place)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    # Use normalized image pixels as input data, which should be in the range
+    # [0, 1.0].
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+    results = inferencer.infer({'pixel': tensor_img})
+
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    save_path = "image_classification_vgg.inference.model"
+    train(use_cuda, save_path)
+    infer(use_cuda, save_path)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 6afb6fa6e753d3d6478313c840b158c3895b3efb..a0e78a460703778b46191b50c75e92bfbcaec411 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -275,10 +275,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):

 class TestBatchNormOpTraining(unittest.TestCase):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        if not np.allclose(np.array(tensor), np_array, atol=atol):
-            import pdb
-            pdb.set_trace()
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+        np.allclose(np.array(tensor), np_array, atol=atol)

     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa9eae1e882f55ef51f38e158317a1a9aeed641c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-5, 10))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones((10))
+    return hist, prob
+
+
+class TestLookupSparseTable(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Id Variable
+        ids = scope.var("Ids").get_tensor()
+        ids_array = np.array([0, 2, 3, 5, 100]).astype("int64")
+        ids.set(ids_array, place)
+
+        # create and initialize W Variable
+        rows = [0, 1, 2, 3, 4, 5, 6]
+        row_numel = 10000
+
+        w_selected_rows = scope.var('W').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        # create Out Variable
+        out_tensor = scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        lookup_table = Operator(
+            "lookup_sparse_table",
+            W='W',
+            Ids='Ids',
+            Out='Out',
+            min=-5.0,
+            max=10.0,
+            seed=10)
+        lookup_table.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out_tensor)
+        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+        for idx, row in enumerate(ids_array[:-2]):
+            assert (row == result_array[idx]).all()
+
+        # check the random value
+        hist, prob = output_hist(result_array[-1])
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+    def test_w_is_selected_rows(self):
+        places = [core.CPUPlace()]
+        # currently only CPU is supported
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
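For reference, the behavior the test above checks can be sketched in plain NumPy: an id that already exists as a row of the selected-rows parameter W is looked up directly, while an unseen id (100 here) gets a fresh row drawn uniformly from [min, max]. This only mirrors the expected output of the test, not the operator's implementation.

    import numpy as np

    rows = [0, 1, 2, 3, 4, 5, 6]  # row ids stored in W
    row_numel = 10000
    # row i of W is filled with the value i, as in the test's w_array
    w = np.arange(len(rows), dtype="float32").reshape(-1, 1) * np.ones(
        (len(rows), row_numel), dtype="float32")

    ids = [0, 2, 3, 5, 100]
    out = np.empty((len(ids), row_numel), dtype="float32")
    for i, idx in enumerate(ids):
        if idx in rows:
            out[i] = w[rows.index(idx)]  # known id: return the existing row
        else:
            out[i] = np.random.uniform(-5.0, 10.0, row_numel)  # new id: uniform init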
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeda67650205a473486b05fdd5b4364753fb18ca
--- /dev/null
+++ b/python/paddle/fluid/trainer.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'Event',
+    'Trainer',
+]
+
+
+class Event(object):
+    BEGIN_EPOCH = 0
+    END_EPOCH = 1
+    BEGIN_STEP = 2
+    END_STEP = 3
+
+    def __init__(self):
+        self.step = 0
+        self.epoch = 0
+        self.type = Event.BEGIN_EPOCH
+
+
+class Trainer(object):
+    def __init__(self, network_func, optimizer, params=None, place=None):
+        # 1. we need to generate a framework.Program by calling
+        # network_func. Reference: fluid.program_guard in
+        # test_word2vec.py
+
+        # 2. move the default_main_program to self.program and run the
+        # default_startup program on an empty core.Scope()
+
+        # 3. call self.params.add_vars with the initialized scope, it
+        # will add the new vars of the initialized scope into
+        # self.params.
+        self.network_func = network_func
+        self.optimizer = optimizer
+        self.params = params
+        self.place = place
+        # TODO(helin): support distributed training
+
+    def train(self, reader, num_epochs, event_handler):
+        pass
+
+    def test(self, reader):
+        pass
diff --git a/python/requirements.txt b/python/requirements.txt
index daf3f368b92408408897e33223118fe3647aa6de..ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -8,3 +8,4 @@ scipy>=0.19.0
 Pillow
 nltk>=3.2.2
 graphviz
+LinkChecker
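Taken together, the new Params, Trainer, and Inferencer skeletons target the high-level workflow exercised by the notest_* scripts above. A condensed sketch of that intended usage follows; the classes are still stubs in this change, so this only illustrates the planned API surface, not working code.

    import numpy
    import paddle.fluid as fluid

    def train_and_infer(train_reader, save_path):
        place = fluid.CPUPlace()
        trainer = fluid.Trainer(
            train_network,  # network functions as defined in the scripts above
            optimizer=fluid.optimizer.Adam(learning_rate=0.001),
            place=place)
        trainer.train(train_reader, num_epochs=1, event_handler=lambda event: None)
        trainer.params.save(save_path)      # persist trained parameters

        params = fluid.Params(save_path)    # reload them later
        inferencer = fluid.Inferencer(inference_network, params, place=place)
        tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
        return inferencer.infer({'pixel': tensor_img})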