Commit 5802880b authored by: W wanghaox

update maxoutop for code review 3

Parent 3ef776ef
......@@ -22,23 +22,20 @@ namespace math {
* All tensors are in NCHW format.
* groups must be > 1
*/
template <typename MaxOutProcess, typename T>
class MaxOutFunctor<platform::CPUPlace, MaxOutProcess, T> {
template <typename T>
class MaxOutFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
framework::Tensor * output,
int groups,
MaxOutProcess maxout_process) {
int groups) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output->dims()[1];
int fea_size = input_height * input_width;
// c_size mean output one batch size
// c_size means the output size of each sample
int c_size = fea_size * output_channels;
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
......@@ -47,10 +44,11 @@ class MaxOutFunctor<platform::CPUPlace, MaxOutProcess, T> {
for (int c = 0; c < output_channels; ++c) {
int new_cindex = fea_size * c;
for (int f = 0; f < fea_size; ++f) {
T ele = maxout_process.initial();
// T ele = maxout_process.initial();
T ele = static_cast<T>(-FLT_MAX);
for (int ph = 0; ph < groups; ++ph) {
maxout_process.compute(ele,
input_data[(new_bindex+new_cindex) * groups+ph*fea_size+f]);
T x = input_data[(new_bindex + new_cindex) * groups + ph * fea_size + f];
ele = ele > x ? ele : x;
}
output_data[(new_bindex+new_cindex+f)] = ele;
}
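For reference, a minimal standalone sketch of the forward reduction this hunk now performs inline (take the max over `groups` candidate input channels for every output position). It is illustrative plain C++ with assumed raw pointers and a hypothetical function name, not Paddle code:

// Maxout forward over NCHW data: out[n][c][h][w] = max over ph of
// in[n][c * groups + ph][h][w]. Assumes channels divisible by groups.
#include <cfloat>

void MaxOutForwardCPU(const float* input, float* output, int batch_size,
                      int output_channels, int input_height, int input_width,
                      int groups) {
  const int fea_size = input_height * input_width;
  const int c_size = fea_size * output_channels;  // output size per sample
  for (int n = 0; n < batch_size; ++n) {
    const int new_bindex = c_size * n;
    for (int c = 0; c < output_channels; ++c) {
      const int new_cindex = fea_size * c;
      for (int f = 0; f < fea_size; ++f) {
        float ele = -FLT_MAX;
        for (int ph = 0; ph < groups; ++ph) {
          float x = input[(new_bindex + new_cindex) * groups + ph * fea_size + f];
          ele = ele > x ? ele : x;
        }
        output[new_bindex + new_cindex + f] = ele;
      }
    }
  }
}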
......@@ -74,9 +72,7 @@ public:
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
int fea_size = input_height * input_width;
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
......@@ -87,15 +83,15 @@ public:
for (int c = 0; c < output_channels; ++c) {
int clen = fea_size * c;
for (int f = 0; f < fea_size; ++f) {
int input_idx = 0;
bool stop = false;
int input_idx0 = (blen + clen) * groups + f;
bool continue_match = true;
int output_idx = blen + clen + f;
for (int g = 0; g < groups && !stop; ++g) {
input_idx = (blen + clen) * groups + fea_size * g + f;
for (int g = 0; g < groups && continue_match; ++g) {
int input_idx = input_idx0 + fea_size * g;
input_grad_data[input_idx] = 0;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] += output_grad_data[output_idx];
stop = true;
continue_match = false;
}
}
}
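A similarly hedged standalone sketch of the gradient routing shown above: the output gradient flows to the first group member whose input equals the forward maximum. It assumes input_grad was zeroed beforehand (as the op kernel does with SetConstant); names mirror the diff but the function itself is illustrative, not Paddle code:

// Maxout backward: route each output gradient to the first matching input
// position within its group. Assumes input_grad was pre-zeroed.
void MaxOutBackwardCPU(const float* input, const float* output,
                       const float* output_grad, float* input_grad,
                       int batch_size, int output_channels, int input_height,
                       int input_width, int groups) {
  const int fea_size = input_height * input_width;
  for (int n = 0; n < batch_size; ++n) {
    const int blen = fea_size * output_channels * n;
    for (int c = 0; c < output_channels; ++c) {
      const int clen = fea_size * c;
      for (int f = 0; f < fea_size; ++f) {
        const int input_idx0 = (blen + clen) * groups + f;
        const int output_idx = blen + clen + f;
        bool continue_match = true;
        for (int g = 0; g < groups && continue_match; ++g) {
          const int input_idx = input_idx0 + fea_size * g;
          if (input[input_idx] == output[output_idx]) {
            input_grad[input_idx] += output_grad[output_idx];
            continue_match = false;  // only the first match receives gradient
          }
        }
      }
    }
  }
}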
......@@ -106,10 +102,8 @@ public:
template class MaxOutGradFunctor<platform::CPUPlace, float>;
template class MaxOutGradFunctor<platform::CPUPlace, double>;
template class MaxOutFunctor<platform::CPUPlace,
math::MaxOut<float>, float>;
template class MaxOutFunctor<platform::CPUPlace,
math::MaxOut<double>, double>;
template class MaxOutFunctor<platform::CPUPlace, float>;
template class MaxOutFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
......
......@@ -19,27 +19,28 @@ namespace paddle {
namespace operators {
namespace math {
template <typename MaxOutProcess, typename T>
template <typename T>
__global__ void KernelMaxOut(const int nthreads, const T* input_data,
const int channels,
const int input_height, const int input_width,
int groups, T* output_data,
MaxOutProcess maxout_process) {
int groups, T* output_data) {
const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width;
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int batch_idx = index / size;
int batch_offset = index % size;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int batch_idx = i / size;
int batch_offset = i % size;
int channel_idx = batch_offset / feat_len;
int feat_idx = batch_offset % feat_len;
int data_idx =
(batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
T ele = maxout_process.initial();
T ele = static_cast<T>(-FLT_MAX);
for (int g = 0; g < groups; ++g) {
maxout_process.compute(ele, input_data[data_idx + g * feat_len]);
T x = input_data[data_idx + g * feat_len];
ele = ele > x ? ele : x;
}
output_data[index] = ele;
output_data[i] = ele;
}
}
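The rewritten kernel walks the output with a grid-stride loop; the index arithmetic it uses to map a flat output index back to the first input element of its group can be checked on the host with a small sketch like the one below (the helper name is illustrative, the arithmetic mirrors the kernel):

// Decompose a flat output index i (ranging over N * C/groups * H * W) into
// the input offset of the first candidate in its group; the kernel then scans
// `groups` candidates spaced feat_len apart.
int MaxOutInputBaseIndex(int i, int channels, int input_height,
                         int input_width, int groups) {
  const int size = input_height * input_width * channels / groups;  // output per sample
  const int feat_len = input_height * input_width;
  const int batch_idx = i / size;
  const int batch_offset = i % size;
  const int channel_idx = batch_offset / feat_len;
  const int feat_idx = batch_offset % feat_len;
  return (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
}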
template <typename T>
......@@ -49,38 +50,38 @@ __global__ void KernelMaxoutGrad(
const int input_height, const int input_width, int groups) {
const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width;
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int batch_idx = index / size;
int batch_offset = index % size;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int batch_idx = i / size;
int batch_offset = i % size;
int channel_idx = batch_offset / feat_len;
int feat_idx = batch_offset % feat_len;
int data_idx =
(batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
int maxIndex = -1;
bool stop = false;
for (int g = 0; g < groups && !stop; ++g) {
if (input_data[data_idx + g * feat_len] == output_data[index]) {
maxIndex = data_idx + g * feat_len;
stop = true;
int max_index = -1;
bool continue_match = true;
for (int g = 0; g < groups && continue_match; ++g) {
if (input_data[data_idx + g * feat_len] == output_data[i]) {
max_index = data_idx + g * feat_len;
continue_match = false;
}
}
if (maxIndex != -1) {
if (max_index != -1) {
// atomic add
platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
platform::CudaAtomicAdd(input_grad + max_index, output_grad[i]);
}
}
}
/*
* All tensors are in NCHW format.
*/
template <typename MaxOutProcess, typename T>
class MaxOutFunctor<platform::GPUPlace, MaxOutProcess, T> {
template <typename T>
class MaxOutFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor * output,
int groups,
MaxOutProcess maxout_process) {
int groups) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
......@@ -97,12 +98,11 @@ class MaxOutFunctor<platform::GPUPlace, MaxOutProcess, T> {
dim3 grid(blocks, 1);
KernelMaxOut<
MaxOutProcess,
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(nthreads, input_data, input_channels,
input_height, input_width, groups,
output_data, maxout_process);
output_data);
}
};
/*
......@@ -145,10 +145,8 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
template class MaxOutGradFunctor<platform::GPUPlace, float>;
template class MaxOutGradFunctor<platform::GPUPlace, double>;
template class MaxOutFunctor<platform::GPUPlace,
math::MaxOut<float>, float>;
template class MaxOutFunctor<platform::GPUPlace,
math::MaxOut<double>, double>;
template class MaxOutFunctor<platform::GPUPlace, float>;
template class MaxOutFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/hostdevice.h"
......@@ -22,42 +21,18 @@ namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX \
__FLT_MAX__
/*
* \brief Extracting simple operations from maxout.
* need "initial", "compute"
* operation.
*/
template <class T>
class MaxOut {
public:
DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
};
template <class T>
class MaxOutGrad {
public:
DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
T scale) {
dx += dy * (x == y);
}
};
template <typename Place, typename MaxOutProcess, typename T>
template <typename Place, typename T>
class MaxOutFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor * output,
int groups, MaxOutProcess maxout_compute);
int groups);
};
template <typename Place, class T>
class MaxOutGradFunctor {
public:
......@@ -67,13 +42,6 @@ class MaxOutGradFunctor {
const framework::Tensor& output,
const framework::Tensor& output_grad, int groups);
};
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -12,7 +12,6 @@
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/operators/maxout_op.h"
namespace paddle {
namespace operators {
......@@ -33,18 +32,18 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
"Where N is batch size, C is "
"the number of channels, H and W is the height and "
"width of feature.");
AddAttr<int>(
"groups",
R"DOC(The group number of input layer.
)DOC");
AddComment(R"DOC(
- Input: NCHW.
- Output: feature map size same as input. Channel is (input channel) / groups.
- Output: The feature map size of the output is the same as that of the input.
The number of output channels is (input channels) / groups.
So groups should be larger than 1, and the number of channels should be able
to devided by groups.
to be divided by groups.
.. math::
math:
y_{si+j} = \max_k x_{gsi + sk + j}
g = groups
s = input.size / num_channels
......@@ -57,29 +56,6 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
- Multi-digit Number Recognition from Street View \
Imagery using Deep Convolutional Neural Networks: \
https://arxiv.org/pdf/1312.6082v4.pdf
The simple usage is:
.. code-block:: python
maxout = maxout_layer(input,
num_channels=128,
groups=4)
:param input: The input of this layer.
:type input: LayerOutput
:param num_channels: The channel number of input layer. If None will be set
automatically from previous output.
:type num_channels: int | None
:param groups: The group number of input layer.
:type groups: int
:param name: The name of this layer. It is optional.
:type name: None | basestring.
:param layer_attr: Extra Layer attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
:rtype: LayerOutput
)DOC");
}
};
......@@ -88,7 +64,6 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
class MaxOutOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of maxoutOp"
"should not be null.");
......@@ -96,26 +71,20 @@ class MaxOutOp : public framework::OperatorWithKernel {
"Output(Out) of maxoutOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
int groups = ctx->Attrs().Get<int>("groups");
// check groups > 1
PADDLE_ENFORCE_GT(
groups, 1,
"in maxoutop groups should be larger than 1");
"groups should be larger than 1 in maxoutop");
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
output_shape.push_back(in_x_dims[2]);
output_shape.push_back(in_x_dims[3]);
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
};
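A hedged sketch of the shape rule InferShape establishes above, Out = {N, C / groups, H, W}; the helper name and the divisibility assert are illustrative only, drawn from the DOC comment rather than the op itself:

// Compute the maxout output shape from an NCHW input shape.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> MaxOutOutputShape(const std::vector<int64_t>& in_x_dims,
                                       int groups) {
  // in_x_dims is NCHW; groups must be > 1 and divide the channel count.
  assert(groups > 1 && in_x_dims[1] % groups == 0);
  return {in_x_dims[0], in_x_dims[1] / groups, in_x_dims[2], in_x_dims[3]};
}

// e.g. MaxOutOutputShape({2, 8, 6, 6}, 4) yields {2, 2, 6, 6}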
class MaxOutOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
......@@ -129,8 +98,6 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
ops::MaxOutOpGrad);
REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace,
float>);
REGISTER_OP_CPU_KERNEL(maxout_grad,
......
......@@ -29,16 +29,12 @@ class MaxOutKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* in_x = context.Input<Tensor>("X");
Tensor* out = context.Output<Tensor>("Out");
int groups = context.template Attr<int>("groups");
paddle::operators::math::MaxOutFunctor<
Place, paddle::operators::math::MaxOut<T>, T>
Place, T>
maxout_forward;
paddle::operators::math::MaxOut<T> maxout_process;
maxout_forward(context.device_context(), *in_x, out, groups,
maxout_process);
maxout_forward(context.device_context(), *in_x, out, groups);
}
};
......@@ -51,15 +47,12 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
const Tensor* out_grad =
context.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
int groups = context.template Attr<int>("groups");
auto& device_ctx = context.device_context();
math::SetConstant<Place, T> zero;
if (in_x_grad) {
in_x_grad->mutable_data<T>(context.GetPlace());
zero(device_ctx, in_x_grad, static_cast<T>(0.0));
paddle::operators::math::MaxOutGradFunctor<Place, T>
maxout_backward;
maxout_backward(context.device_context(), *in_x, *in_x_grad, *out,
......