diff --git a/Dockerfile b/Dockerfile
index 634be18a51bf61e96a8bf6f263b6674a7932d6e4..738bba9bc2e1ab19709722fe04f1490b1b13bd4f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y --allow-downgrades patchelf \
+    python3 python3-dev python3-pip \
     git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN easy_install -U pip && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
     pip install -U wheel && \
     pip install -U docopt PyYAML sphinx==1.5.6 && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark

-RUN pip install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install pre-commit 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
+    pip install pre-commit 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip install opencv-python

 #For docstring checker
+RUN pip3 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker

 COPY ./python/requirements.txt /root/
+RUN pip3 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 6ed51c648478efb9784d0c43b169c285e740e0f3..24de8d9d7ced5f8111cc5d65f761b7506bde048e 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
     /usr/local/opt/openblas/lib)
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
 find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
   PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index c3fbe4dbdb28f1008bb274ee18293db348bfc6ed..755dbd610c40c2d9b85d3017b6f000a869b0f39a 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})
     SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
     SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)

-    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory."
FORCE) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" @@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND}) ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas) + INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) # Because libopenblas.a is a symbolic link of another library, thus need to # install the whole directory. IF(ANDROID) @@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND}) ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") -MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}") -INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR}) +MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}") +INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # FIXME(gangliao): generate cblas target to track all high performance # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e0556a0babc74ba6efa0a190d4f7b77416bef3bf..fd1c2c3838f8238f957aed5fc7adeef97aeaa692 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -138,13 +138,17 @@ if (APPLE) # On Mac OS X build fat binaries with x86_64 architectures by default. set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) endif() -else() + # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 + set (COMMON_FLAGS -Wno-deprecated-register) +endif(APPLE) + +if(LINUX) set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) -endif() +endif(LINUX) if(UNIX AND NOT APPLE) # except apple from nix*Os family diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index b1ce551ce73de33bcede187c72feebad6e2fa1a5..2d1f688d64ece3322e253b0c070264b9eb73d678 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -80,15 +80,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( // This is weird but there is really some variables without var_desc // in computation_op if (var_desc == nullptr) { - if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) - continue; - } else { - if (var_desc->Persistable()) continue; - auto var_type = var_desc->Proto()->type().type(); - if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { - continue; - } + var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); + if (var_desc == nullptr) continue; + } + + if (var_desc->Persistable()) continue; + auto var_type = var_desc->Proto()->type().type(); + if (var_type != proto::VarType::LOD_TENSOR && + var_type != proto::VarType::SELECTED_ROWS) { + continue; } // compute op only runs in one device diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f5a54c0f48c98512dcb393d63a61f3c927e7ac1f..274a3e686c6a1c00df0cccfa0f9d47a2f5955af8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -319,6 +319,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { ResetReferenceCount(); + for (auto &pair : cur_ref_cnts_) { + auto &name_map = *(pair.second); + for (auto &fetch_name : fetch_tensors) { + name_map.erase(fetch_name); + } + name_map.erase(fetched_var_name); + } } #endif auto fetch_data = member_->executor_->Run(fetch_tensors); diff --git 
a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b61bca8c3d1e2a63faed43b437459f4e9318e46e..fa41266d62f5bdab2afe07439b604f147ad0fa20 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) - op_library(reduce_mean_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index ef76106f17218a03d24ebc0eca43dbb0ae935093..b3140116dfe6a17a400bb88219ff43b249ecb32a 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel { math::DepthwiseConvFunctor depthwiseConv; auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); } }; @@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad); + paddings, input_grad); } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - dilations, filter_grad); + filter_grad); } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 88c578b1410558b9adcd55f1cd6b53fb9cb124e2..0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, - dilations, output); + output); } }; @@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, input_grad); } @@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, - dilations, filter_grad); + filter_grad); } } }; diff --git a/paddle/fluid/operators/cub_reduce.h b/paddle/fluid/operators/cub_reduce.h deleted file mode 100644 index 16fdad775f7befaac04b1ac59a601f04e0ab2bdc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cub_reduce.h +++ /dev/null @@ -1,322 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include // NOLINT -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -namespace detail { -template -struct Array { - public: - HOSTDEVICE inline Array() {} - - HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; } - - HOSTDEVICE inline const T& operator[](size_t index) const { - return data_[index]; - } - - HOSTDEVICE constexpr inline size_t size() const { return ElementCount; } - - template - static inline Array From(const VectorLikeType& vec) { - PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match"); - size_t n = static_cast(vec.size()); - Array ret; - for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; - return ret; - } - - private: - T data_[ElementCount]; -}; - -// reduce the last axis of 2d array -template -__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, - int reduce_num) { - __shared__ typename cub::BlockReduce::TempStorage temp_storage; - int idx_x = blockIdx.x * reduce_num; - int idx_y = threadIdx.x; - Ty reduce_var = init; - for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) - reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y])); - - reduce_var = - cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); - - if (threadIdx.x == 0) { - y[blockIdx.x] = reduce_var; - } -} - -template -__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, - Array x_strides, - Array reduce_dim, - Array reduce_strides, - Array left_dim, - Array left_strides) { - __shared__ typename cub::BlockReduce::TempStorage temp_storage; - Array sub_index; - int left_idx = blockIdx.x; - for (int i = 0; i < Rank - ReduceRank; ++i) { - sub_index[left_dim[i]] = left_idx / left_strides[i]; - left_idx %= left_strides[i]; - } - - int reduce_idx = threadIdx.x; - for (int j = 0; j < ReduceRank; ++j) { - sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; - reduce_idx %= reduce_strides[j]; - } - - int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); - Ty reduce_var = static_cast(transformer(x[idx_x])); - - for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { - int reduce_idx = i; - for (int j = 0; j < ReduceRank; ++j) { - sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; - reduce_idx %= reduce_strides[j]; - } - - int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); - reduce_var = static_cast(reducer(reduce_var, transformer(x[idx_x]))); - } - - reduce_var = - cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); - - if (threadIdx.x == 0) { - y[blockIdx.x] = reduce_var; - } -} - -static inline std::vector GetStrides(const std::vector& dims) { - int n = static_cast(dims.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dims[i + 1]; - } - return strides; -} - -static inline std::vector GetStrides(const 
std::vector& dims, - const std::vector& idx) { - int n = static_cast(idx.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dims[idx[i + 1]]; - } - return strides; -} - -constexpr int kMaxBlockDim = 512; - -static inline int GetDesiredBlockDim(int block_dim) { - return block_dim >= kMaxBlockDim - ? kMaxBlockDim - : (1 << static_cast(std::log2(block_dim))); -} - -template -static void TensorReduceImpl( - const Tx* x_data, Ty* y_data, const platform::Place& place, - const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, - int left_num, int reduce_num, const std::vector& x_strides, - const std::vector& reduce_dim, const std::vector& reduce_strides, - const std::vector& left_dim, const std::vector& left_strides, - cudaStream_t stream) { -#define CUB_RANK_CASE(i, ...) \ - case i: { \ - constexpr auto kRank = i; \ - switch (reduce_rank) { __VA_ARGS__; } \ - } break - -#define CUB_REDUCE_RANK_CASE(i, ...) \ - case i: { \ - constexpr auto kReduceRank = i; \ - ReduceKernel<<>>( \ - x_data, y_data, reducer, transformer, init, reduce_num, \ - Array::From(x_strides), \ - Array::From(reduce_dim), \ - Array::From(reduce_strides), \ - Array::From(left_dim), \ - Array::From(left_strides)); \ - } break - - int rank = x_strides.size(); - int reduce_rank = reduce_strides.size(); - if (rank == reduce_rank) { - cub::TransformInputIterator trans_x( - x_data, transformer); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, init, stream); - framework::Tensor tmp; - auto* temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), - place); - cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, init, stream); - return; - } - if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { - ReduceKernel2D<<>>( - x_data, y_data, reducer, transformer, init, reduce_num); - return; - } - /* - if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) { - // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced. 
- // Currently, it is handled by code below, but inefficient - return; - } - */ - - switch (rank) { - CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); - - CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); - - CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3);); - - CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); - - CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5);); - - CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); - - CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); - - CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); - CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); - CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6); - CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8);); - } - -#undef CUB_REDUCE_RANK_CASE -#undef CUB_RANK_CASE -} - -} // namespace detail - -template -void TensorReduce(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, const Ty& init, - const ReduceOp& reducer, const TransformOp& transformer, - cudaStream_t stream) { - auto x_dim = framework::vectorize2int(x.dims()); - std::vector new_x_dim, new_reduce_dims; - int is_reduced = 0; - for (auto e : origin_reduce_dims) { - auto pos = e >= 0 ? e : e + x_dim.size(); - is_reduced |= 1 << e; - } - for (int i = 0; i < x_dim.size(); i++) { - if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - new_x_dim.push_back(x_dim[i]); - if ((is_reduced >> i) & 1) - new_reduce_dims.push_back(new_x_dim.size() - 1); - } else { - new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; - } - } - x_dim = new_x_dim; - origin_reduce_dims = new_reduce_dims; - int x_rank = static_cast(x_dim.size()); - std::set left_set, reduce_set; - for (int i = 0; i < x_rank; ++i) left_set.insert(i); - - for (auto e : origin_reduce_dims) { - left_set.erase(e); - reduce_set.insert(e); - } - - std::vector reduce_dim(reduce_set.begin(), reduce_set.end()); - std::vector left_dim(left_set.begin(), left_set.end()); - - std::vector x_strides = detail::GetStrides(x_dim); - std::vector reduce_strides = detail::GetStrides(x_dim, reduce_dim); - std::vector left_strides = detail::GetStrides(x_dim, left_dim); - int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; - int left_num = 1; - if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]]; - - std::vector y_dim(left_dim.size()); - for (int i = 0; i < left_dim.size(); ++i) { - y_dim[i] = x_dim[left_dim[i]]; - } - auto x_data = x.data(); - auto y_data = y->mutable_data(x.place()); - if (reduce_num == 1) return; - -#define CUB_BLOCK_DIM_CASE(block_dim) \ - case block_dim: { \ - constexpr auto kBlockDim = block_dim; \ - detail::TensorReduceImpl( \ - x_data, y_data, x.place(), reducer, transformer, init, left_num, \ - reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \ - left_strides, stream); \ - } break - - switch (detail::GetDesiredBlockDim(reduce_num)) { - CUB_BLOCK_DIM_CASE(512); - CUB_BLOCK_DIM_CASE(256); - CUB_BLOCK_DIM_CASE(128); - CUB_BLOCK_DIM_CASE(64); - CUB_BLOCK_DIM_CASE(32); - CUB_BLOCK_DIM_CASE(16); - CUB_BLOCK_DIM_CASE(8); - CUB_BLOCK_DIM_CASE(4); - 
CUB_BLOCK_DIM_CASE(2); - } -#undef CUB_BLOCK_DIM_CASE -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index a79b900b9801e6b80e4433a9acdd4dab6c34859d..94df11bee70dec44f19ee9ffff04ca92d5990ee8 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") .SetDefault(false); AddComment(string::Sprintf(R"DOC( -Limited Elementwise %s Operator +Elementwise %s Operator The equation is: diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 31e87d9113118ebe7a4b25ffee5ba55e2714fb66..a04c1c1263fba659e2d3f623b607e9f476bb40ed 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = paddle::platform::CPUDeviceContext; auto* x = ctx.Input("X"); + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES if (x->lod()[0].size() == 2) { + xx->Resize({total_T, D3}); SeqCompute(ctx); return; } - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES INIT_VEC_FUNC auto* reordered_h0 = ctx.Output("ReorderedH0"); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 8ca79d20ec4f6412b00dbf3990068f81b65e2efd..a20a5880dc3654e99d33d6f8607bafa90170708c 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -424,11 +424,12 @@ class FuisonLSTMKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = platform::CPUDeviceContext; INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES if (x->lod()[0].size() == 2) { + xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_BASE_SIZES INIT_VEC_FUNC INIT_BASE_INPUT_DATAS diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 3be389912307f7aac6dda6d1018943eb8f08696d..027e2de48d229761f12f974dc73625c8ea1b3567 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -21,268 +20,149 @@ namespace paddle { namespace operators { namespace math { -template -__inline__ __device__ T warpReduceSum(T val) { -#if CUDA_VERSION < 9000 - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down(val, offset); - return val; -#else -#define FULL_MASK 0xffffffff - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(FULL_MASK, val, offset); - return val; -#endif -} -__forceinline__ __device__ unsigned lane_id() { - unsigned ret; - asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); - return ret; -} - -__forceinline__ __device__ unsigned warp_id() { - unsigned ret; - asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); - return ret; -} - // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. 
template -__device__ __inline__ void KernelDepthwiseConv( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { - for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { - const int batch = blockIdx.y; - const int c_out = blockIdx.x; - - const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; - T value = 0; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height * dilate_height; - const int w_in_end = w_in_start + filter_width * dilate_width; - - const int in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - int weight_offset = 0; - - for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { - for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { - if (h_in >= h_start && h_in < h_end && w_in >= w_start && - w_in < w_end) { - const int offset = in_offset + h_in * input_width + w_in; - value += weight[weight_offset] * input_data[offset]; - } - weight_offset++; - } + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; } - int index = - ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + - w_out; - output_data[index] = value; } + output_data[index] = value; } } -template -__global__ void KernelDepthwiseConvSp( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, filter_multiplier, - filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, - dilate_height, dilate_width, output_data); - - else - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, - filter_height, filter_height, c_stride, c_stride, - padding_height, padding_width, dilate_height, - dilate_width, output_data); -} - // CUDA kernel to compute the depthwise convolution backprop w.r.t input. template -__device__ __inline__ void KernelDepthwiseConvInputGrad( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { - for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { - for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - - const int c_out_start = c_in * filter_multiplier; - - int h_out_start = - h_in - (filter_height - 1) * dilate_height + padding_height; - - int h_out_end = h_in + padding_height; - - int w_out_start = - w_in - (filter_width - 1) * dilate_width + padding_width; - - int w_out_end = w_in + padding_width; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; - c_out++) { - int filter_offset = (c_out + 1) * filter_height * filter_width; - for (int h_out = h_out_start; h_out <= h_out_end; - h_out += dilate_height) { - for (int w_out = w_out_start; w_out <= w_out_end; - w_out += dilate_width) { - filter_offset--; - int s_h_out = h_out / stride_height; - int s_w_out = w_out / stride_width; - if (h_out % stride_height == 0 && w_out % stride_width == 0 && - s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && - s_w_out < output_width) { - const int output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - value += 
output_grad_data[output_grad_offset] * - filter_data[filter_offset]; - } - } +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; } } - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; - input_grad_data[index] = value; } + input_grad_data[index] += value; } } -template -__global__ void KernelDepthwiseConvInputGradSp( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, input_grad_data); - else - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, 
filter_width, c_stride, c_stride, - padding_height, padding_width, dilate_height, dilate_width, - input_grad_data); -} - // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. template -__device__ __inline__ void KernelDepthwiseConvFilterGrad( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { - T s = 0; - - int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; - int lid = lane_id(); - - for (int image_w = threadIdx.x; image_w < output_width; - image_w += blockDim.x) { - for (int bid = 0; bid < num; bid++) { - for (int image_h = threadIdx.y; image_h < output_height; - image_h += blockDim.y) { - int kernel_id = blockIdx.z; - int kernel_h = blockIdx.y * dilate_height - padding_height; - int kernel_w = blockIdx.x * dilate_width - padding_width; - - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; - if (image_hk < 0 || image_hk >= input_height) continue; - if (image_wk < 0 || image_wk >= input_width) continue; -#define gaid(N, C, H, W) \ - ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) - - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk]; - -#undef gaid +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); } } } -#if __CUDA_ARCH__ >= 530 - s = warpReduceSum(s); - if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#else - paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#endif -} - -template -__global__ void KernelDepthwiseConvFilterGradSp( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data); - else - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data); } /* @@ -297,9 +177,7 @@ class DepthwiseConvFunctor { const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* output) { + const std::vector& paddings, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -313,37 +191,22 @@ class DepthwiseConvFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - const int dilate_height = dilations[0]; - const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* filter_data = filter.data(); T* output_data = output->mutable_data(context.GetPlace()); - int thread = 512; - int blocks = std::min(std::max(thread / output_width, 1), output_height); - dim3 threads(std::min(output_width, thread), blocks, 1); - dim3 grid(output_channels, batch_size, 1); - int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvSp<<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - return; \ - } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, 
e.g. check_case(2^n,1) - check_case(0, 0); -#undef check_case + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); } }; @@ -356,7 +219,6 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -371,39 +233,22 @@ class DepthwiseConvInputGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - const int dilate_height = dilations[0]; - const int dilate_width = dilations[1]; const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int thread = 512; - int blocks = std::min(std::max(thread / input_width, 1), input_height); - dim3 threads(std::min(input_width, thread), blocks, 1); - dim3 grid(input_channels, batch_size, 1); - int filter_multiplier = output_channels / input_channels; - -#define check_case(c_filter_multiplier, c_stride) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, \ - c_stride><<>>( \ - output_grad_data, filter_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, input_grad_data); \ - return; \ - } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. 
check_case(2^n,1) - check_case(0, 0); -#undef check_case + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); } }; @@ -415,7 +260,6 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* filter_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -430,34 +274,23 @@ class DepthwiseConvFilterGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - const int dilate_height = dilations[0]; - const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* output_grad_data = output_grad.data(); T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); - int block_size = 512; - int crop_output_height = - std::min(std::max(block_size / output_width, 1), output_height); - dim3 grid(ksize_width, ksize_height, output_channels); - dim3 threads(std::min(output_width, block_size), crop_output_height, 1); - int filter_multiplier = output_channels / input_channels; - -#define check_case(c_filter_multiplier) \ - if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ - KernelDepthwiseConvFilterGradSp< \ - T, c_filter_multiplier><<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ - return; \ - } - check_case(1); - check_case(0); -#undef check_case + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); } }; diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 71f6fcb23df1942d6dcf7177165f2ec1022a9b35..97aec401889a56d3fc9ac08e766d931bb3725b01 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -32,8 +32,7 @@ class DepthwiseConvFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output); + const std::vector& paddings, framework::Tensor* output); }; template @@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* input_grad); }; @@ -55,7 +53,6 @@ 
class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* filter_grad); }; diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 59b30244839849d79e3e531953134633503c4090..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -12,64 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_mean_op.h" -namespace paddle { -namespace operators { - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -template -class ReduceMeanKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - DivideFunctor(reduce_num), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel); - +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); REGISTER_OP_CUDA_KERNEL( reduce_mean_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index 53cd9e9419dd9aecee730917ae21d7a4ab332ffc..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -12,59 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_sum_op.h" -namespace paddle { -namespace operators { - -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - HOSTDEVICE inline T operator()(const T& x) const { return x; } -}; - -template -class ReduceSumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel); - +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, ops::ReduceGradKernel, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2d11527040d3bb89f54186b042920758168a80c4..bb288384e8501caf1cb6f9cd186cc09985a5de55 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -395,10 +395,11 @@ EOF ctest --output-on-failure -j $1 # make install should also be test when unittest make install -j 8 - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi + pip uninstall -y paddlepaddle fi } @@ -597,9 +598,9 @@ EOF EOF if [[ ${WITH_GPU} == "ON" ]]; then - NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&" + NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true" else - NCCL_DEPS="" + NCCL_DEPS="true" fi if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then @@ -613,9 +614,8 @@ EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <