提交 e06f2f33 编写于 作者: S shippingwang

Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0

...@@ -138,13 +138,17 @@ if (APPLE) ...@@ -138,13 +138,17 @@ if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default. # On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif() endif()
else() # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
set (COMMON_FLAGS -Wno-deprecated-register)
endif(APPLE)
if(LINUX)
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
-Wall -Wall
-Wextra -Wextra
-Werror -Werror
${GPU_COMMON_FLAGS}) ${GPU_COMMON_FLAGS})
endif() endif(LINUX)
if(UNIX AND NOT APPLE) if(UNIX AND NOT APPLE)
# except apple from nix*Os family # except apple from nix*Os family
......
...@@ -80,15 +80,15 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -80,15 +80,15 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
// This is weird but there is really some variables without var_desc // This is weird but there is really some variables without var_desc
// in computation_op // in computation_op
if (var_desc == nullptr) { if (var_desc == nullptr) {
if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
continue; if (var_desc == nullptr) continue;
} else { }
if (var_desc->Persistable()) continue;
auto var_type = var_desc->Proto()->type().type(); if (var_desc->Persistable()) continue;
if (var_type != proto::VarType::LOD_TENSOR && auto var_type = var_desc->Proto()->type().type();
var_type != proto::VarType::SELECTED_ROWS) { if (var_type != proto::VarType::LOD_TENSOR &&
continue; var_type != proto::VarType::SELECTED_ROWS) {
} continue;
} }
// compute op only runs in one device // compute op only runs in one device
......
...@@ -319,6 +319,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors, ...@@ -319,6 +319,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (!gcs_.empty()) { if (!gcs_.empty()) {
ResetReferenceCount(); ResetReferenceCount();
for (auto &pair : cur_ref_cnts_) {
auto &name_map = *(pair.second);
for (auto &fetch_name : fetch_tensors) {
name_map.erase(fetch_name);
}
name_map.erase(fetched_var_name);
}
} }
#endif #endif
auto fetch_data = member_->executor_->Run(fetch_tensors); auto fetch_data = member_->executor_->Run(fetch_tensors);
......
...@@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute) ...@@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
op_library(reduce_mean_op DEPS cub)
else() else()
op_library(conv_op DEPS vol2col im2col) op_library(conv_op DEPS vol2col im2col)
endif() endif()
......
...@@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel<T> { ...@@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv; math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
output);
} }
}; };
...@@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> { ...@@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0)); set_zero(dev_ctx, input_grad, static_cast<T>(0));
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, dilations, input_grad); paddings, input_grad);
} }
if (filter_grad) { if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace()); filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0)); set_zero(dev_ctx, filter_grad, static_cast<T>(0));
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
dilations, filter_grad); filter_grad);
} }
} }
}; };
......
...@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> { ...@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
math::DepthwiseConvInputGradFunctor<DeviceContext, T> math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad; depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
dilations, output); output);
} }
}; };
...@@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
if (input_grad) { if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv; math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
input_grad); input_grad);
} }
...@@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T> math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad; depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
dilations, filter_grad); filter_grad);
} }
} }
}; };
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace detail {
template <typename T, size_t ElementCount>
struct Array {
public:
HOSTDEVICE inline Array() {}
HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
HOSTDEVICE inline const T& operator[](size_t index) const {
return data_[index];
}
HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
template <typename VectorLikeType>
static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
size_t n = static_cast<size_t>(vec.size());
Array<T, ElementCount> ret;
for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
return ret;
}
private:
T data_[ElementCount];
};
// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
int reduce_num) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
int idx_x = blockIdx.x * reduce_num;
int idx_y = threadIdx.x;
Ty reduce_var = init;
for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
Array<int, Rank> x_strides,
Array<int, ReduceRank> reduce_dim,
Array<int, ReduceRank> reduce_strides,
Array<int, Rank - ReduceRank> left_dim,
Array<int, Rank - ReduceRank> left_strides) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
Array<int, Rank> sub_index;
int left_idx = blockIdx.x;
for (int i = 0; i < Rank - ReduceRank; ++i) {
sub_index[left_dim[i]] = left_idx / left_strides[i];
left_idx %= left_strides[i];
}
int reduce_idx = threadIdx.x;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
int reduce_idx = i;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
}
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
int n = static_cast<int>(dims.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims,
const std::vector<int>& idx) {
int n = static_cast<int>(idx.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[idx[i + 1]];
}
return strides;
}
constexpr int kMaxBlockDim = 512;
static inline int GetDesiredBlockDim(int block_dim) {
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(block_dim)));
}
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp>
static void TensorReduceImpl(
const Tx* x_data, Ty* y_data, const platform::Place& place,
const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
int left_num, int reduce_num, const std::vector<int>& x_strides,
const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
const std::vector<int>& left_dim, const std::vector<int>& left_strides,
cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...) \
case i: { \
constexpr auto kRank = i; \
switch (reduce_rank) { __VA_ARGS__; } \
} break
#define CUB_REDUCE_RANK_CASE(i, ...) \
case i: { \
constexpr auto kReduceRank = i; \
ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, \
kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
x_data, y_data, reducer, transformer, init, reduce_num, \
Array<int, kRank>::From(x_strides), \
Array<int, kReduceRank>::From(reduce_dim), \
Array<int, kReduceRank>::From(reduce_strides), \
Array<int, kRank - kReduceRank>::From(left_dim), \
Array<int, kRank - kReduceRank>::From(left_strides)); \
} break
int rank = x_strides.size();
int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) {
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
return;
}
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
BlockDim><<<left_num, BlockDim, 0, stream>>>(
x_data, y_data, reducer, transformer, init, reduce_num);
return;
}
/*
if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
// TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
// Currently, it is handled by code below, but inefficient
return;
}
*/
switch (rank) {
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3););
CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5););
CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
}
#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}
} // namespace detail
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream) {
auto x_dim = framework::vectorize2int(x.dims());
std::vector<int> new_x_dim, new_reduce_dims;
int is_reduced = 0;
for (auto e : origin_reduce_dims) {
auto pos = e >= 0 ? e : e + x_dim.size();
is_reduced |= 1 << e;
}
for (int i = 0; i < x_dim.size(); i++) {
if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
new_x_dim.push_back(x_dim[i]);
if ((is_reduced >> i) & 1)
new_reduce_dims.push_back(new_x_dim.size() - 1);
} else {
new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
}
}
x_dim = new_x_dim;
origin_reduce_dims = new_reduce_dims;
int x_rank = static_cast<int>(x_dim.size());
std::set<int> left_set, reduce_set;
for (int i = 0; i < x_rank; ++i) left_set.insert(i);
for (auto e : origin_reduce_dims) {
left_set.erase(e);
reduce_set.insert(e);
}
std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
std::vector<int> left_dim(left_set.begin(), left_set.end());
std::vector<int> x_strides = detail::GetStrides(x_dim);
std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
int left_num = 1;
if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
std::vector<int> y_dim(left_dim.size());
for (int i = 0; i < left_dim.size(); ++i) {
y_dim[i] = x_dim[left_dim[i]];
}
auto x_data = x.data<Tx>();
auto y_data = y->mutable_data<Ty>(x.place());
if (reduce_num == 1) return;
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, x.place(), reducer, transformer, init, left_num, \
reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \
left_strides, stream); \
} break
switch (detail::GetDesiredBlockDim(reduce_num)) {
CUB_BLOCK_DIM_CASE(512);
CUB_BLOCK_DIM_CASE(256);
CUB_BLOCK_DIM_CASE(128);
CUB_BLOCK_DIM_CASE(64);
CUB_BLOCK_DIM_CASE(32);
CUB_BLOCK_DIM_CASE(16);
CUB_BLOCK_DIM_CASE(8);
CUB_BLOCK_DIM_CASE(4);
CUB_BLOCK_DIM_CASE(2);
}
#undef CUB_BLOCK_DIM_CASE
}
} // namespace operators
} // namespace paddle
...@@ -32,8 +32,7 @@ class DepthwiseConvFunctor { ...@@ -32,8 +32,7 @@ class DepthwiseConvFunctor {
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter, const framework::Tensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings, framework::Tensor* output);
const std::vector<int>& dilations, framework::Tensor* output);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor { ...@@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
...@@ -55,7 +53,6 @@ class DepthwiseConvFilterGradFunctor { ...@@ -55,7 +53,6 @@ class DepthwiseConvFilterGradFunctor {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad); framework::Tensor* filter_grad);
}; };
......
...@@ -12,64 +12,17 @@ ...@@ -12,64 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <vector>
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_mean_op.h" #include "paddle/fluid/operators/reduce_mean_op.h"
namespace paddle { REGISTER_OP_CUDA_KERNEL(reduce_mean,
namespace operators { ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanFunctor>,
template <typename T> ops::ReduceKernel<paddle::platform::CUDADeviceContext,
struct DivideFunctor { double, ops::MeanFunctor>,
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MeanFunctor>,
HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MeanFunctor>);
private:
T n_inv;
};
template <typename T>
class ReduceMeanKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
DivideFunctor<T>(reduce_num), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
ops::ReduceMeanKernel<double>,
ops::ReduceMeanKernel<int>,
ops::ReduceMeanKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>, float, ops::MeanGradFunctor>,
......
...@@ -12,59 +12,17 @@ ...@@ -12,59 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_sum_op.h"
namespace paddle { REGISTER_OP_CUDA_KERNEL(reduce_sum,
namespace operators { ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::SumFunctor>,
template <typename T> ops::ReduceKernel<paddle::platform::CUDADeviceContext,
struct IdentityFunctor { double, ops::SumFunctor>,
HOSTDEVICE explicit inline IdentityFunctor() {} ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::SumFunctor>,
HOSTDEVICE inline T operator()(const T& x) const { return x; } ops::ReduceKernel<paddle::platform::CUDADeviceContext,
}; int64_t, ops::SumFunctor>);
template <typename T>
class ReduceSumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
ops::ReduceSumKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>, float, ops::SumGradFunctor>,
......
...@@ -842,6 +842,13 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): ...@@ -842,6 +842,13 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
def shuffle(reader, buffer_size): def shuffle(reader, buffer_size):
""" """
Shuffle the reader. Shuffle the reader.
Args:
reader(Variable): The reader to be decorated with 'shuffling'.
buffer_size(int): The pre-read number of data in :code:`reader`.
Returns:
Variable: The reader which has been decorated with 'shuffling'.
""" """
return __create_unshared_decorated_reader__( return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
......
...@@ -6367,7 +6367,7 @@ def stack(x, axis=0): ...@@ -6367,7 +6367,7 @@ def stack(x, axis=0):
if not isinstance(x, list) and not isinstance(x, tuple): if not isinstance(x, list) and not isinstance(x, tuple):
x = [x] x = [x]
out = helper.create_tmp_variable(x[0].dtype) out = helper.create_tmp_variable(dtype=x[0].dtype)
helper.append_op( helper.append_op(
type='stack', inputs={'X': x}, outputs={'Y': out}, type='stack', inputs={'X': x}, outputs={'Y': out},
attrs={'axis': axis}) attrs={'axis': axis})
...@@ -6404,8 +6404,8 @@ def unstack(x, axis=0, num=None): ...@@ -6404,8 +6404,8 @@ def unstack(x, axis=0, num=None):
num = x.shape[axis] num = x.shape[axis]
outs = [] outs = []
for _ in num: for _ in xrange(num):
outs.append(helper.create_tmp_variable(x.dtype)) outs.append(helper.create_tmp_variable(dtype=x.dtype))
helper.append_op( helper.append_op(
type='unstack', type='unstack',
......
...@@ -67,7 +67,6 @@ class TestConv2dOp(OpTest): ...@@ -67,7 +67,6 @@ class TestConv2dOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "conv2d" self.op_type = "conv2d"
self.use_cudnn = False self.use_cudnn = False
self.use_cuda = False
self.use_mkldnn = False self.use_mkldnn = False
self.data_format = "AnyLayout" self.data_format = "AnyLayout"
self.dtype = np.float32 self.dtype = np.float32
...@@ -102,25 +101,24 @@ class TestConv2dOp(OpTest): ...@@ -102,25 +101,24 @@ class TestConv2dOp(OpTest):
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
def testcuda(self): def testcudnn(self):
return core.is_compiled_with_cuda() and (self.use_cudnn or return core.is_compiled_with_cuda() and self.use_cudnn
self.use_cuda)
def test_check_output(self): def test_check_output(self):
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_output_with_place(place, atol=1e-5) self.check_output_with_place(place, atol=1e-5)
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, ['Input'], place, ['Input'],
'Output', 'Output',
...@@ -130,7 +128,7 @@ class TestConv2dOp(OpTest): ...@@ -130,7 +128,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, ['Filter'], place, ['Filter'],
'Output', 'Output',
...@@ -327,65 +325,22 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): ...@@ -327,65 +325,22 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
class TestDepthwiseConv(TestConv2dOp): class TestDepthwiseConv(TestConv2dOp):
def init_test_case(self): def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1] self.pad = [1, 1]
self.stride = [2, 2] self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3 self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups f_c = self.input_size[1] // self.groups
self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv3(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d" self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation(TestConv2dOp): class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation2(TestConv2dOp):
def init_test_case(self): def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1] self.pad = [1, 1]
self.stride = [1, 1] self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3 self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [6, f_c, 3, 3]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册