Commit e06f2f33 authored by S shippingwang

Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0

@@ -138,13 +138,17 @@ if (APPLE)
     # On Mac OS X build fat binaries with x86_64 architectures by default.
     set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
   endif()
-else()
+  # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
+  set (COMMON_FLAGS -Wno-deprecated-register)
+endif(APPLE)
+
+if(LINUX)
   set(GPU_COMMON_FLAGS
     -Wall
     -Wextra
     -Werror
     ${GPU_COMMON_FLAGS})
-endif()
+endif(LINUX)
 if(UNIX AND NOT APPLE)
   # except apple from nix*Os family
......
@@ -80,16 +80,16 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
       // This is weird but there is really some variables without var_desc
       // in computation_op
       if (var_desc == nullptr) {
-        if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
-          continue;
-      } else {
+        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
+        if (var_desc == nullptr) continue;
+      }
+
       if (var_desc->Persistable()) continue;
       auto var_type = var_desc->Proto()->type().type();
       if (var_type != proto::VarType::LOD_TENSOR &&
           var_type != proto::VarType::SELECTED_ROWS) {
         continue;
       }
-      }
       // compute op only runs in one device
       if (ref_cnts[place.device]->count(var_name))
......
@@ -319,6 +319,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 #ifdef PADDLE_WITH_CUDA
   if (!gcs_.empty()) {
     ResetReferenceCount();
+    for (auto &pair : cur_ref_cnts_) {
+      auto &name_map = *(pair.second);
+      for (auto &fetch_name : fetch_tensors) {
+        name_map.erase(fetch_name);
+      }
+      name_map.erase(fetched_var_name);
+    }
   }
 #endif
   auto fetch_data = member_->executor_->Run(fetch_tensors);
......
@@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
     op_library(layer_norm_op DEPS cub)
-    op_library(reduce_mean_op DEPS cub)
 else()
     op_library(conv_op DEPS vol2col im2col)
 endif()
......
@@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                  output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
   }
 };
@@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, input_grad, static_cast<T>(0));
       depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, dilations, input_grad);
+                             paddings, input_grad);
     }
     if (filter_grad) {
       filter_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
       depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              dilations, filter_grad);
+                              filter_grad);
     }
   }
 };
......
@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
     math::DepthwiseConvInputGradFunctor<DeviceContext, T>
         depthwiseConvInputGrad;
     depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           dilations, output);
+                           output);
   }
 };
@@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
     if (input_grad) {
       math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
                     input_grad);
     }
@@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
         depthwiseConvFilterGrad;
     depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                            dilations, filter_grad);
+                            filter_grad);
   }
 };
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace detail {
template <typename T, size_t ElementCount>
struct Array {
public:
HOSTDEVICE inline Array() {}
HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
HOSTDEVICE inline const T& operator[](size_t index) const {
return data_[index];
}
HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
template <typename VectorLikeType>
static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
size_t n = static_cast<size_t>(vec.size());
Array<T, ElementCount> ret;
for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
return ret;
}
private:
T data_[ElementCount];
};
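// Note on Array (descriptive comment, not in the original file): it is a
// fixed-size, device-copyable substitute for std::vector in kernel arguments.
// For example, Array<int, 3>::From(std::vector<int>{2, 3, 4}) copies the three
// elements into ret[0..2]; From() enforces that vec.size() equals ElementCount.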
// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
int reduce_num) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
int idx_x = blockIdx.x * reduce_num;
int idx_y = threadIdx.x;
Ty reduce_var = init;
for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
Array<int, Rank> x_strides,
Array<int, ReduceRank> reduce_dim,
Array<int, ReduceRank> reduce_strides,
Array<int, Rank - ReduceRank> left_dim,
Array<int, Rank - ReduceRank> left_strides) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
Array<int, Rank> sub_index;
int left_idx = blockIdx.x;
for (int i = 0; i < Rank - ReduceRank; ++i) {
sub_index[left_dim[i]] = left_idx / left_strides[i];
left_idx %= left_strides[i];
}
int reduce_idx = threadIdx.x;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
int reduce_idx = i;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
}
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
int n = static_cast<int>(dims.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims,
const std::vector<int>& idx) {
int n = static_cast<int>(idx.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[idx[i + 1]];
}
return strides;
}
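// Examples (descriptive comment, not in the original file):
//   GetStrides({2, 3, 4})         -> {12, 4, 1}  (row-major strides of the shape)
//   GetStrides({2, 3, 4}, {0, 2}) -> {4, 1}      (row-major strides of the
//                                                 sub-shape {dims[0], dims[2]} = {2, 4})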
constexpr int kMaxBlockDim = 512;
static inline int GetDesiredBlockDim(int block_dim) {
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(block_dim)));
}
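// Examples (descriptive comment, not in the original file): the block size is
// clamped to kMaxBlockDim and otherwise rounded down to a power of two, e.g.
//   GetDesiredBlockDim(600) -> 512,  GetDesiredBlockDim(100) -> 64.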
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp>
static void TensorReduceImpl(
const Tx* x_data, Ty* y_data, const platform::Place& place,
const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
int left_num, int reduce_num, const std::vector<int>& x_strides,
const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
const std::vector<int>& left_dim, const std::vector<int>& left_strides,
cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...) \
case i: { \
constexpr auto kRank = i; \
switch (reduce_rank) { __VA_ARGS__; } \
} break
#define CUB_REDUCE_RANK_CASE(i, ...) \
case i: { \
constexpr auto kReduceRank = i; \
ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, \
kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
x_data, y_data, reducer, transformer, init, reduce_num, \
Array<int, kRank>::From(x_strides), \
Array<int, kReduceRank>::From(reduce_dim), \
Array<int, kReduceRank>::From(reduce_strides), \
Array<int, kRank - kReduceRank>::From(left_dim), \
Array<int, kRank - kReduceRank>::From(left_strides)); \
} break
int rank = x_strides.size();
int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) {
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
return;
}
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
BlockDim><<<left_num, BlockDim, 0, stream>>>(
x_data, y_data, reducer, transformer, init, reduce_num);
return;
}
/*
if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
// TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
// Currently, it is handled by code below, but inefficient
return;
}
*/
switch (rank) {
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3););
CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5););
CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
}
#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}
} // namespace detail
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream) {
auto x_dim = framework::vectorize2int(x.dims());
std::vector<int> new_x_dim, new_reduce_dims;
int is_reduced = 0;
for (auto e : origin_reduce_dims) {
auto pos = e >= 0 ? e : e + x_dim.size();
is_reduced |= 1 << e;
}
for (int i = 0; i < x_dim.size(); i++) {
if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
new_x_dim.push_back(x_dim[i]);
if ((is_reduced >> i) & 1)
new_reduce_dims.push_back(new_x_dim.size() - 1);
} else {
new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
}
}
x_dim = new_x_dim;
origin_reduce_dims = new_reduce_dims;
int x_rank = static_cast<int>(x_dim.size());
std::set<int> left_set, reduce_set;
for (int i = 0; i < x_rank; ++i) left_set.insert(i);
for (auto e : origin_reduce_dims) {
left_set.erase(e);
reduce_set.insert(e);
}
std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
std::vector<int> left_dim(left_set.begin(), left_set.end());
std::vector<int> x_strides = detail::GetStrides(x_dim);
std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
int left_num = 1;
if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
std::vector<int> y_dim(left_dim.size());
for (int i = 0; i < left_dim.size(); ++i) {
y_dim[i] = x_dim[left_dim[i]];
}
auto x_data = x.data<Tx>();
auto y_data = y->mutable_data<Ty>(x.place());
if (reduce_num == 1) return;
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, x.place(), reducer, transformer, init, left_num, \
reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \
left_strides, stream); \
} break
switch (detail::GetDesiredBlockDim(reduce_num)) {
CUB_BLOCK_DIM_CASE(512);
CUB_BLOCK_DIM_CASE(256);
CUB_BLOCK_DIM_CASE(128);
CUB_BLOCK_DIM_CASE(64);
CUB_BLOCK_DIM_CASE(32);
CUB_BLOCK_DIM_CASE(16);
CUB_BLOCK_DIM_CASE(8);
CUB_BLOCK_DIM_CASE(4);
CUB_BLOCK_DIM_CASE(2);
}
#undef CUB_BLOCK_DIM_CASE
}
} // namespace operators
} // namespace paddle
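For reference, the least obvious step in TensorReduce above is the preprocessing loop that folds runs of adjacent dimensions with the same reduced/kept status into a single dimension before strides are computed. The following standalone CPU sketch reproduces only that bookkeeping so it can be compiled and inspected in isolation; the helper name CoalesceDims is invented for this illustration and is not part of the commit:

#include <cstdio>
#include <vector>

// Mirrors the is_reduced bitmask logic at the top of TensorReduce: adjacent
// dims with the same reduced/kept status are folded into a single dim, and
// reduce_dims is rewritten in terms of the folded shape.
static void CoalesceDims(std::vector<int>* dims, std::vector<int>* reduce_dims) {
  std::vector<int> new_dims, new_reduce_dims;
  int is_reduced = 0;
  for (int e : *reduce_dims) is_reduced |= 1 << e;  // assumes e is non-negative
  for (size_t i = 0; i < dims->size(); ++i) {
    if (i == 0 || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
      new_dims.push_back((*dims)[i]);
      if ((is_reduced >> i) & 1) new_reduce_dims.push_back(new_dims.size() - 1);
    } else {
      new_dims.back() *= (*dims)[i];  // merge into the previous run
    }
  }
  *dims = new_dims;
  *reduce_dims = new_reduce_dims;
}

int main() {
  std::vector<int> dims = {4, 3, 5, 6};
  std::vector<int> reduce_dims = {1, 2};  // reduce the two middle axes
  CoalesceDims(&dims, &reduce_dims);
  // Prints: dims = 4 15 6, reduce_dims = 1
  printf("dims =");
  for (int d : dims) printf(" %d", d);
  printf(", reduce_dims =");
  for (int d : reduce_dims) printf(" %d", d);
  printf("\n");
  return 0;
}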
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -21,54 +20,31 @@ namespace paddle { ...@@ -21,54 +20,31 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
#if CUDA_VERSION < 9000
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
#else
#define FULL_MASK 0xffffffff
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
#endif
}
__forceinline__ __device__ unsigned lane_id() {
unsigned ret;
asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
__forceinline__ __device__ unsigned warp_id() {
unsigned ret;
asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
// A Cuda kernel to compute the depthwise convolution forward pass // A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format. // in NCHW format.
template <typename T> template <typename T>
__device__ __inline__ void KernelDepthwiseConv( __global__ void KernelDepthwiseConv(
const T* const input_data, const T* const filter_data, const int batch_size, const int nthreads, const T* const input_data, const T* const filter_data,
const int output_channels, const int output_height, const int output_width, const int batch_size, const int output_channels, const int output_height,
const int input_channels, const int input_height, const int input_width, const int output_width, const int input_channels, const int input_height,
const int filter_multiplier, const int filter_height, const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width, const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height, const int padding_height, const int padding_width, T* const output_data) {
const int dilate_width, T* const output_data) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { if (index < nthreads) {
const int batch = blockIdx.y; const int batch = index / output_channels / output_height / output_width;
const int c_out = blockIdx.x; const int c_out = (index / output_height / output_width) % output_channels;
const int h_out = (index / output_width) % output_height;
const int w_out = index % output_width;
const int c_in = c_out / filter_multiplier; const int c_in = c_out / filter_multiplier;
const T* weight = filter_data + c_out * filter_height * filter_width; const T* weight = filter_data + c_out * filter_height * filter_width;
T value = 0; T value = 0;
const int h_in_start = -padding_height + h_out * stride_height; const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width; const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = h_in_start + filter_height * dilate_height; const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width * dilate_width; const int w_in_end = w_in_start + filter_width;
const int in_offset = const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width; ((batch * input_channels + c_in) * input_height) * input_width;
...@@ -77,212 +53,116 @@ __device__ __inline__ void KernelDepthwiseConv( ...@@ -77,212 +53,116 @@ __device__ __inline__ void KernelDepthwiseConv(
const int w_end = w_in_end < input_width ? w_in_end : input_width; const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0; const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0; const int w_start = w_in_start > 0 ? w_in_start : 0;
int weight_offset = 0;
for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { for (int w_in = w_start; w_in < w_end; w_in++) {
if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
w_in < w_end) {
const int offset = in_offset + h_in * input_width + w_in; const int offset = in_offset + h_in * input_width + w_in;
value += weight[weight_offset] * input_data[offset]; value +=
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
input_data[offset];
} }
weight_offset++;
} }
}
int index =
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
w_out;
output_data[index] = value; output_data[index] = value;
} }
}
}
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvSp(
const T* const input_data, const T* const filter_data, const int batch_size,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const output_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, filter_multiplier,
filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width,
dilate_height, dilate_width, output_data);
else
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, c_filter_multiplier,
filter_height, filter_height, c_stride, c_stride,
padding_height, padding_width, dilate_height,
dilate_width, output_data);
} }
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T> template <typename T>
__device__ __inline__ void KernelDepthwiseConvInputGrad( __global__ void KernelDepthwiseConvInputGrad(
const T* const output_grad_data, const T* const filter_data, const int nthreads, const T* const output_grad_data,
const int batch_size, const int output_channels, const int output_height, const T* const filter_data, const int batch_size, const int output_channels,
const int output_width, const int input_channels, const int input_height, const int output_height, const int output_width, const int input_channels,
const int input_width, const int filter_multiplier, const int filter_height, const int input_height, const int input_width, const int filter_multiplier,
const int filter_width, const int stride_height, const int stride_width, const int filter_height, const int filter_width, const int stride_height,
const int padding_height, const int padding_width, const int dilate_height, const int stride_width, const int padding_height, const int padding_width,
const int dilate_width, T* const input_grad_data) { T* const input_grad_data) {
for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { if (index < nthreads) {
const int batch = blockIdx.y; const int batch = index / input_channels / input_height / input_width;
const int c_in = blockIdx.x; const int c_in = (index / input_height / input_width) % input_channels;
const int h_in = (index / input_width) % input_height;
const int w_in = index % input_width;
const int c_out_start = c_in * filter_multiplier; const int c_out_start = c_in * filter_multiplier;
int h_out_start = int h_out_start =
h_in - (filter_height - 1) * dilate_height + padding_height; (h_in - filter_height + padding_height + stride_height) / stride_height;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
int h_out_end = h_in + padding_height; int h_out_end = (h_in + padding_height) / stride_height;
h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
int w_out_start = int w_out_start =
w_in - (filter_width - 1) * dilate_width + padding_width; (w_in - filter_width + padding_width + stride_width) / stride_width;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
int w_out_end = w_in + padding_width; int w_out_end = (w_in + padding_width) / stride_width;
w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
T value = 0; T value = 0;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
c_out++) { c_out++) {
int filter_offset = (c_out + 1) * filter_height * filter_width; for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
for (int h_out = h_out_start; h_out <= h_out_end; const int filter_h = h_in + padding_height - h_out * stride_height;
h_out += dilate_height) { for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
for (int w_out = w_out_start; w_out <= w_out_end; const int filter_w = w_in + padding_width - w_out * stride_width;
w_out += dilate_width) { const int filter_offset = c_out * filter_height * filter_width +
filter_offset--; filter_h * filter_width + filter_w;
int s_h_out = h_out / stride_height;
int s_w_out = w_out / stride_width;
if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
s_w_out < output_width) {
const int output_grad_offset = const int output_grad_offset =
((batch * output_channels + c_out) * output_height + ((batch * output_channels + c_out) * output_height + h_out) *
s_h_out) *
output_width + output_width +
s_w_out; w_out;
value += output_grad_data[output_grad_offset] * value +=
filter_data[filter_offset]; output_grad_data[output_grad_offset] * filter_data[filter_offset];
}
}
} }
} }
int index =
((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
w_in;
input_grad_data[index] = value;
} }
input_grad_data[index] += value;
} }
} }
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvInputGradSp(
const T* const output_grad_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, input_grad_data);
else
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height, dilate_width,
input_grad_data);
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T> template <typename T>
__device__ __inline__ void KernelDepthwiseConvFilterGrad( __global__ void KernelDepthwiseConvFilterGrad(
const T* output_grad_data, const T* input_data, const int num, const int nthreads, const T* const output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* const input_data, const int num, const int output_channels,
const int input_channels, const int input_height, const int input_width, const int output_height, const int output_width, const int input_channels,
const int filter_multiplier, const int filter_height, const int input_height, const int input_width, const int filter_multiplier,
const int filter_width, const int stride_height, const int stride_width, const int filter_height, const int filter_width, const int stride_height,
const int padding_height, const int padding_width, const int dilate_height, const int stride_width, const int padding_height, const int padding_width,
const int dilate_width, T* filter_grad_data) { T* const filter_grad_data) {
T s = 0; int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; const int w_out = index % output_width;
int lid = lane_id(); const int h_out = (index / output_width) % output_height;
const int c_out = (index / output_width / output_height) % output_channels;
for (int image_w = threadIdx.x; image_w < output_width; const int batch = (index / output_width / output_height / output_channels);
image_w += blockDim.x) { const int c_in = c_out / filter_multiplier;
for (int bid = 0; bid < num; bid++) { const int h_in_start = -padding_height + h_out * stride_height;
for (int image_h = threadIdx.y; image_h < output_height; const int w_in_start = -padding_width + w_out * stride_width;
image_h += blockDim.y) { const int h_in_end =
int kernel_id = blockIdx.z; -padding_height + h_out * stride_height + filter_height;
int kernel_h = blockIdx.y * dilate_height - padding_height; const int w_in_end = -padding_width + w_out * stride_width + filter_width;
int kernel_w = blockIdx.x * dilate_width - padding_width; const int in_offset =
(batch * input_channels + c_in) * input_height * input_width;
int image_hk = image_h * stride_height + kernel_h;
int image_wk = image_w * stride_width + kernel_w; T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
if (image_hk < 0 || image_hk >= input_height) continue; const int h_end = h_in_end < input_height ? h_in_end : input_height;
if (image_wk < 0 || image_wk >= input_width) continue; const int w_end = w_in_end < input_width ? w_in_end : input_width;
#define gaid(N, C, H, W) \ const int h_start = h_in_start > 0 ? h_in_start : 0;
((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) const int w_start = w_in_start > 0 ? w_in_start : 0;
s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * for (int h_in = h_start; h_in < h_end; h_in++) {
input_data[((bid * (gridDim.z / filter_multiplier) + for (int w_in = w_start; w_in < w_end; w_in++) {
kernel_id / filter_multiplier) * const int offset = in_offset + h_in * input_width + w_in;
input_height + const T diff_temp = output_grad_data[index] * input_data[offset];
image_hk) * T* addr = addr_offset + (h_in - h_in_start) * filter_width +
input_width + (w_in - w_in_start);
image_wk]; paddle::platform::CudaAtomicAdd(addr, diff_temp);
#undef gaid
} }
} }
} }
#if __CUDA_ARCH__ >= 530
s = warpReduceSum<T>(s);
if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#else
paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#endif
}
template <typename T, int c_filter_multiplier>
__global__ void KernelDepthwiseConvFilterGradSp(
const T* output_grad_data, const T* input_data, const int num,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* filter_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
else
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
} }
/* /*
...@@ -297,9 +177,7 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -297,9 +177,7 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& filter, const framework::Tensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings, framework::Tensor* output) {
const std::vector<int>& dilations,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -313,37 +191,22 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -313,37 +191,22 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>(); const T* filter_data = filter.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
int thread = 512; int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = std::min(std::max(thread / output_width, 1), output_height); int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(std::min(output_width, thread), blocks, 1); dim3 threads(1024, 1);
dim3 grid(output_channels, batch_size, 1); dim3 grid(blocks, 1);
int filter_multiplier = output_channels / input_channels;
#define check_case(c_filter_multiplier, c_stride) \ KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
if (c_filter_multiplier == 0 || \ nthreads, input_data, filter_data, batch_size, output_channels,
filter_multiplier == c_filter_multiplier && \ output_height, output_width, input_channels, input_height, input_width,
stride_height == stride_width && stride_height == c_stride) { \ output_channels / input_channels, ksize_height, ksize_width,
KernelDepthwiseConvSp<T, c_filter_multiplier, \ stride_height, stride_width, padding_height, padding_width,
c_stride><<<grid, threads, 0, context.stream()>>>( \ output_data);
input_data, filter_data, batch_size, output_channels, output_height, \
output_width, input_channels, input_height, input_width, \
filter_multiplier, ksize_height, ksize_width, stride_height, \
stride_width, padding_height, padding_width, dilate_height, \
dilate_width, output_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case(0, 0);
#undef check_case
} }
}; };
...@@ -356,7 +219,6 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -356,7 +219,6 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -371,39 +233,22 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -371,39 +233,22 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* filter_data = filter.data<T>(); const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int thread = 512; int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = std::min(std::max(thread / input_width, 1), input_height); int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(std::min(input_width, thread), blocks, 1); dim3 threads(1024, 1);
dim3 grid(input_channels, batch_size, 1); dim3 grid(blocks, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
#define check_case(c_filter_multiplier, c_stride) \ nthreads, output_grad_data, filter_data, batch_size, output_channels,
if (c_filter_multiplier == 0 || \ output_height, output_width, input_channels, input_height, input_width,
filter_multiplier == c_filter_multiplier && \ output_channels / input_channels, ksize_height, ksize_width,
stride_height == stride_width && stride_height == c_stride) { \ stride_height, stride_width, padding_height, padding_width,
KernelDepthwiseConvInputGradSp< \ input_grad_data);
T, c_filter_multiplier, \
c_stride><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, filter_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, input_grad_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case(0, 0);
#undef check_case
} }
}; };
...@@ -415,7 +260,6 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> { ...@@ -415,7 +260,6 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad) { framework::Tensor* filter_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -430,34 +274,23 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> { ...@@ -430,34 +274,23 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
int block_size = 512; int nthreads = batch_size * output_channels * output_height * output_width;
int crop_output_height =
std::min(std::max(block_size / output_width, 1), output_height); int blocks = (nthreads + 1024 - 1) / 1024;
dim3 grid(ksize_width, ksize_height, output_channels); dim3 threads(1024, 1);
dim3 threads(std::min(output_width, block_size), crop_output_height, 1); dim3 grid(blocks, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
#define check_case(c_filter_multiplier) \ nthreads, output_grad_data, input_data, batch_size, output_channels,
if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ output_height, output_width, input_channels, input_height, input_width,
KernelDepthwiseConvFilterGradSp< \ output_channels / input_channels, ksize_height, ksize_width,
T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>( \ stride_height, stride_width, padding_height, padding_width,
output_grad_data, input_data, batch_size, output_channels, \ filter_grad_data);
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, filter_grad_data); \
return; \
}
check_case(1);
check_case(0);
#undef check_case
} }
}; };
......
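For reference, the index-based kernels in the right-hand column above map one thread to one NCHW element through a single flat index. A minimal standalone sketch of that bookkeeping follows; the struct and function names are invented for this illustration, and it simply checks that the decomposition used in those kernels round-trips the usual ((n * C + c) * H + h) * W + w layout:

#include <cassert>

struct NCHW {
  int n, c, h, w;
};

// Same arithmetic as the index-based KernelDepthwiseConv variant:
// sequential integer division by C, H, W recovers the batch index,
// and the remainders recover channel, row and column.
static NCHW Decompose(int index, int C, int H, int W) {
  NCHW r;
  r.n = index / C / H / W;
  r.c = (index / H / W) % C;
  r.h = (index / W) % H;
  r.w = index % W;
  return r;
}

int main() {
  const int N = 2, C = 3, H = 5, W = 5;
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w) {
          int index = ((n * C + c) * H + h) * W + w;
          NCHW r = Decompose(index, C, H, W);
          assert(r.n == n && r.c == c && r.h == h && r.w == w);
        }
  return 0;  // every (n, c, h, w) tuple round-trips through the flat index
}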
@@ -32,8 +32,7 @@ class DepthwiseConvFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output);
+                  const std::vector<int>& paddings, framework::Tensor* output);
 };
 template <typename DeviceContext, typename T>
@@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                   framework::Tensor* input_grad);
 };
@@ -55,7 +53,6 @@ class DepthwiseConvFilterGradFunctor {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                   framework::Tensor* filter_grad);
 };
......
@@ -12,64 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <vector>
-#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"

-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-template <typename T>
-class ReduceMeanKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        DivideFunctor<T>(reduce_num), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
-                        ops::ReduceMeanKernel<double>,
-                        ops::ReduceMeanKernel<int>,
-                        ops::ReduceMeanKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
 REGISTER_OP_CUDA_KERNEL(
     reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                             float, ops::MeanGradFunctor>,
......
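For reference, the cub path shown on the left of the hunk above computes a mean by reusing a plain sum reduction: each element is pre-scaled by 1/n through DivideFunctor and then accumulated with cub::Sum. A minimal CPU sketch of that identity, with values chosen to be exactly representable in float:

#include <cassert>
#include <vector>

// mean(x) == sum over x of (x_i * (1.0 / n)), which is what pairing
// DivideFunctor with a plain sum reduction relies on.
int main() {
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 6.0f};
  const float n_inv = 1.0f / static_cast<float>(x.size());
  float transformed_sum = 0.0f;
  for (float v : x) transformed_sum += v * n_inv;  // transform, then reduce
  const float mean = (1.0f + 2.0f + 3.0f + 6.0f) / 4.0f;  // 3.0f
  assert(transformed_sum == mean);  // exact for these values
  return 0;
}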
@@ -12,59 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"

-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor() {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x; }
-};
-
-template <typename T>
-class ReduceSumKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        IdentityFunctor<T>(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
-                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
-                        ops::ReduceSumKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
 REGISTER_OP_CUDA_KERNEL(
     reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::SumGradFunctor>,
......
@@ -842,6 +842,13 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 def shuffle(reader, buffer_size):
     """
     Shuffle the reader.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'shuffling'.
+        buffer_size(int): The pre-read number of data in :code:`reader`.
+
+    Returns:
+        Variable: The reader which has been decorated with 'shuffling'.
     """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
......
@@ -6367,7 +6367,7 @@ def stack(x, axis=0):
     if not isinstance(x, list) and not isinstance(x, tuple):
         x = [x]
-    out = helper.create_tmp_variable(x[0].dtype)
+    out = helper.create_tmp_variable(dtype=x[0].dtype)
     helper.append_op(
         type='stack', inputs={'X': x}, outputs={'Y': out},
         attrs={'axis': axis})
@@ -6404,8 +6404,8 @@ def unstack(x, axis=0, num=None):
         num = x.shape[axis]
     outs = []
-    for _ in num:
-        outs.append(helper.create_tmp_variable(x.dtype))
+    for _ in xrange(num):
+        outs.append(helper.create_tmp_variable(dtype=x.dtype))
     helper.append_op(
         type='unstack',
......
@@ -67,7 +67,6 @@ class TestConv2dOp(OpTest):
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
-        self.use_cuda = False
         self.use_mkldnn = False
         self.data_format = "AnyLayout"
         self.dtype = np.float32
@@ -102,25 +101,24 @@ class TestConv2dOp(OpTest):
         }
         self.outputs = {'Output': output}

-    def testcuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn

     def test_check_output(self):
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_output_with_place(place, atol=1e-5)

     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_grad_with_place(
             place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)

     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_grad_with_place(
             place, ['Input'],
             'Output',
@@ -130,7 +128,7 @@ class TestConv2dOp(OpTest):
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_grad_with_place(
             place, ['Filter'],
             'Output',
@@ -327,65 +325,22 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
 class TestDepthwiseConv(TestConv2dOp):
     def init_test_case(self):
-        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv2(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv3(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"


-class TestDepthwiseConvWithDilation(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation2(TestConv2dOp):
+class TestDepthwiseConv2(TestConv2dOp):
     def init_test_case(self):
-        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
-        self.dilations = [2, 2]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
......