From 5ede6fd434119d22cf0f257858dd5dedb1c1c091 Mon Sep 17 00:00:00 2001
From: xzl
Date: Mon, 18 Sep 2017 21:05:31 +0800
Subject: [PATCH] delete cuda impl, complete comments, modify variable naming

---
 paddle/operators/transpose_op.cc                   |  77 +++++++-----
 paddle/operators/transpose_op.cu                   | 117 +-----------------
 paddle/operators/transpose_op.h                    |  83 +++++--------
 .../v2/framework/tests/test_transpose_op.py        |  53 +++++---
 4 files changed, 121 insertions(+), 209 deletions(-)

diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index ea6b2a9ec5..2fd86d900a 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -13,8 +13,6 @@
 limitations under the License. */
 
 #include "paddle/operators/transpose_op.h"
-#include <vector>
-#include "paddle/framework/ddim.h"
 
 namespace paddle {
 namespace operators {
@@ -27,28 +25,31 @@ class TransposeOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto in_dim = ctx.Input<framework::Tensor>("X")->dims();
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Input"),
+                            "Input(Input) should not be null");
+    auto input_dim = ctx.Input<framework::Tensor>("Input")->dims();
     auto axis = ctx.Attr<std::vector<int>>("axis");
-    size_t in_dim_size = in_dim.size();
+    size_t input_dim_size = input_dim.size();
     size_t axis_size = axis.size();
 
-    PADDLE_ENFORCE_EQ(
-        in_dim_size, axis_size,
-        "the input tensor dimensions should be equal to the axis size");
+    PADDLE_ENFORCE_EQ(input_dim_size, axis_size,
+                      "the input tensor's dimension(%d) "
+                      "should be equal to the axis's size(%d)",
+                      input_dim_size, axis_size);
 
     std::vector<int> axis_sorted(axis);
     std::sort(axis_sorted.begin(), axis_sorted.end());
     for (size_t i = 0; i < axis_sorted.size(); i++) {
-      PADDLE_ENFORCE_EQ(axis_sorted[i], (int)i,
+      PADDLE_ENFORCE_EQ(axis_sorted[i], static_cast<int>(i),
                         "the sorted axis should be [0, 1, ... dims - 1], "
-                        "the dims equals to the input tensor dimensions");
+                        "where the dims is the axis's size");
     }
 
-    framework::DDim out_dim(in_dim);
+    framework::DDim output_dim(input_dim);
     for (size_t i = 0; i < axis.size(); i++) {
-      out_dim[i] = in_dim[axis[i]];
+      output_dim[i] = input_dim[axis[i]];
     }
-    ctx.Output<framework::Tensor>("Out")->Resize(out_dim);
+    ctx.Output<framework::Tensor>("Output")->Resize(output_dim);
   }
 };
 
@@ -57,16 +58,30 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
   TransposeOpMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of transpose op");
-    AddOutput("Out", "The output of transpose op");
+    AddInput(
+        "Input",
+        "(Tensor) The input tensor; tensors with rank at most 6 are supported");
+    AddOutput("Output", "(Tensor) The output tensor");
     AddAttr<std::vector<int>>(
         "axis",
-        "a list of values, and the size of the list should be "
+        "(vector<int>) a list of values, and the size of the list should be "
         "the same with the input tensor dimensions, the tensor will "
-        "permute the axes according the the values given");
+        "permute the axes according to the values given");
     AddComment(R"DOC(
 The Tensor will be permuted according to the axis values given.
-For example, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1},
+This op is very much like the numpy.transpose function in Python.
+For example:
+ >> input = numpy.arange(6).reshape((2, 3))
+ >> input
+ array([[0, 1, 2],
+        [3, 4, 5]])
+ >> axis = [1, 0]
+ >> output = input.transpose(axis)
+ >> output
+ array([[0, 3],
+        [1, 4],
+        [2, 5]])
+So, given an input tensor of shape (N, C, H, W) and axis {0, 2, 3, 1},
 the output tensor shape will be (N, H, W, C)
 )DOC");
   }
 };
@@ -78,20 +93,22 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<framework::Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto out_grad_dims =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->dims();
-    auto out_dims = ctx.Input<framework::Tensor>("Out")->dims();
-
-    PADDLE_ENFORCE(out_grad_dims == out_dims,
-                   "Out@GRAD dims must equal to Input(X) dims");
-
-    x_grad->Resize(x_dims);
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Input"),
+                            "Input(Input) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Output")),
+                            "Input(Output@GRAD) should not be null");
+    auto input_dims = ctx.Input<framework::Tensor>("Input")->dims();
+    auto *input_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
+
+    auto output_grad_dims =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Output"))->dims();
+    auto output_dims = ctx.Input<framework::Tensor>("Output")->dims();
+
+    PADDLE_ENFORCE(output_grad_dims == output_dims,
+                   "Output@GRAD dims must be equal to Input(Input) dims");
+
+    input_grad->Resize(input_dims);
   }
 };
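Note on the InferShape hunk above: the op verifies that `axis` is a permutation of [0, ..., rank - 1] and then permutes the dims entry-wise via output_dim[i] = input_dim[axis[i]]. A minimal Python sketch of that shape rule, for illustration only (the helper name is invented, not part of the patch):

    def infer_transpose_shape(input_shape, axis):
        # axis must be a permutation of [0, 1, ..., rank - 1],
        # mirroring the PADDLE_ENFORCE_EQ checks in InferShape
        assert len(input_shape) == len(axis)
        assert sorted(axis) == list(range(len(axis)))
        # output_dim[i] = input_dim[axis[i]]
        return tuple(input_shape[a] for a in axis)

    # (N, C, H, W) with axis {0, 2, 3, 1} -> (N, H, W, C), as in the doc comment
    assert infer_transpose_shape((8, 3, 32, 64), [0, 2, 3, 1]) == (8, 32, 64, 3)
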
diff --git a/paddle/operators/transpose_op.cu b/paddle/operators/transpose_op.cu
index a3c4d2bf63..af3f581462 100644
--- a/paddle/operators/transpose_op.cu
+++ b/paddle/operators/transpose_op.cu
@@ -12,118 +12,11 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/memcpy.h"
-#include "paddle/memory/memory.h"
 #include "paddle/operators/transpose_op.h"
 
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void transpose_kernel(int nthreads, const T* in_data, T* out_data,
-                                 int* offset_buffer, int ndims) {
-  int* in_offset = offset_buffer;
-  int* out_offset = offset_buffer + ndims;
-  int* axis = offset_buffer + ndims * 2;
-
-  int to_index = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (to_index < nthreads) {
-    int from_index = 0;
-    int temp = to_index;
-    for (size_t i = 0; i < ndims; i++) {
-      from_index += (temp / out_offset[i]) * in_offset[axis[i]];
-      temp = temp % out_offset[i];
-    }
-    out_data[to_index] = in_data[from_index];
-  }
-}
-
-template <typename T>
-void TransposeCUDA(const framework::ExecutionContext& context,
-                   const framework::Tensor& in, framework::Tensor& out,
-                   std::vector<int> axis) {
-  auto* in_data = in.template data<T>();
-  auto* out_data = out.template mutable_data<T>(context.GetPlace());
-  auto in_dim = in.dims();
-  auto out_dim = out.dims();
-  auto data_size = product(in_dim);
-  size_t ndims = in_dim.size();
-  std::vector<int> in_offset(ndims, 1);
-  std::vector<int> out_offset(ndims, 1);
-
-  auto cpu_place = platform::CPUPlace();
-  auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
-
-  // Get a host_buffer to cache the input offset, output offset and the axis.
-  std::vector<int64_t> buffer_dim_shape(1, ndims * 3);
-  auto buffer_dims = framework::make_ddim(buffer_dim_shape);
-  framework::Tensor host_buffer;
-  int* host_buffer_data = host_buffer.mutable_data<int>(buffer_dims, cpu_place);
-
-  for (int i = ndims - 2; i >= 0; i--) {
-    in_offset[i] = in_offset[i + 1] * in_dim[i + 1];
-    out_offset[i] = out_offset[i + 1] * out_dim[i + 1];
-  }
-
-  // copy the data to the host_buffer
-  for (int i = 0; i < ndims; i++) {
-    host_buffer_data[i] = in_offset[i];
-    host_buffer_data[i + ndims] = out_offset[i];
-    host_buffer_data[i + ndims * 2] = axis[i];
-  }
-
-  // Get a device_buffer to cache the input offset, output offset and the axis.
-  auto offset_buffer = memory::Alloc(gpu_place, ndims * 3 * sizeof(int));
-
-  auto* cuda_device_context = reinterpret_cast<platform::CUDADeviceContext*>(
-      const_cast<platform::DeviceContext*>(context.device_context_));
-
-  // copy the host_buffer data to the device_buffer
-  memory::Copy(gpu_place, offset_buffer, cpu_place, host_buffer_data,
-               ndims * 3 * sizeof(int), cuda_device_context->stream());
-
-  int block = 512;
-  int grid = (data_size + block - 1) / block;
-  transpose_kernel<T><<<grid, block>>>(data_size, in_data, out_data,
-                                       static_cast<int*>(offset_buffer), ndims);
-  memory::Free(gpu_place, offset_buffer);
-}
-
-template <typename T>
-class TransposeCUDAKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "It must use GPUPlace.");
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto axis = context.Attr<std::vector<int>>("axis");
-    TransposeCUDA<T>(context, *in, *out, axis);
-  }
-};
-
-template <typename T>
-class TransposeGradCUDAKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "It must use GPUPlace.");
-    auto* in = context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto axis_temp = context.Attr<std::vector<int>>("axis");
-
-    std::vector<int> axis(axis_temp);
-
-    for (size_t i = 0; i < axis.size(); i++) {
-      axis[axis_temp[i]] = i;
-    }
-    TransposeCUDA<T>(context, *in, *out, axis);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(transpose, ops::TransposeCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(transpose_grad, ops::TransposeGradCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(transpose,
+                       ops::TransposeKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    transpose_grad,
+    ops::TransposeGradKernel<paddle::platform::GPUPlace, float>);
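Note on the deleted CUDA implementation above: transpose_kernel mapped every flat output index back to a flat input index through row-major strides (the in_offset/out_offset arrays), one thread per output element. A Python sketch of the same index arithmetic, illustrative only (helper names are invented):

    import numpy as np

    def row_major_strides(shape):
        # strides[i] = product of shape[i+1:], matching the deleted loops:
        # in_offset[i] = in_offset[i + 1] * in_dim[i + 1] (same for out_offset)
        strides = [1] * len(shape)
        for i in range(len(shape) - 2, -1, -1):
            strides[i] = strides[i + 1] * shape[i + 1]
        return strides

    def transpose_flat(in_data, in_shape, axis):
        out_shape = [in_shape[a] for a in axis]
        in_off = row_major_strides(in_shape)
        out_off = row_major_strides(out_shape)
        out = [0] * len(in_data)
        for to_index in range(len(in_data)):  # one CUDA thread per output index
            from_index, temp = 0, to_index
            for i in range(len(axis)):
                # decode the i-th output coordinate, re-encode it through
                # the permuted input stride, as the kernel's loop body did
                from_index += (temp // out_off[i]) * in_off[axis[i]]
                temp %= out_off[i]
            out[to_index] = in_data[from_index]
        return out

    x = np.arange(6)
    assert transpose_flat(list(x), (2, 3), [1, 0]) == list(x.reshape(2, 3).T.flatten())
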
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
index 19916cc224..48d8c250a8 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -20,41 +20,10 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
-void NaiveCpuTranspose(const framework::ExecutionContext& context,
-                       const framework::Tensor& in, framework::Tensor& out,
-                       std::vector<int> axis) {
-  auto in_data = in.data<T>();
-  auto out_data = out.mutable_data<T>(context.GetPlace());
-  auto in_dim = in.dims();
-  auto out_dim = out.dims();
-  size_t ndims = in_dim.size();
-
-  std::vector<int> in_offset(ndims, 1);
-  std::vector<int> out_offset(ndims, 1);
-
-  for (int i = ndims - 2; i >= 0; i--) {
-    in_offset[i] = in_offset[i + 1] * in_dim[i + 1];
-    out_offset[i] = out_offset[i + 1] * out_dim[i + 1];
-  }
-
-  size_t data_size = product(in_dim);
-
-  for (size_t to_index = 0; to_index < data_size; to_index++) {
-    int from_index = 0;
-    int temp = to_index;
-    for (size_t i = 0; i < ndims; i++) {
-      from_index += (temp / out_offset[i]) * in_offset[axis[i]];
-      temp = temp % out_offset[i];
-    }
-    out_data[to_index] = in_data[from_index];
-  }
-}
-
 template <typename Place, typename T, int Dims>
-void DoTranspose(const framework::ExecutionContext& context,
-                 const framework::Tensor& in, framework::Tensor& out,
-                 std::vector<int> axis) {
+void EigenTranspose(const framework::ExecutionContext& context,
+                    const framework::Tensor& in, framework::Tensor& out,
+                    std::vector<int> axis) {
   Eigen::array<int, Dims> permute;
   for (int i = 0; i < Dims; i++) {
     permute[i] = axis[i];
@@ -72,28 +41,32 @@ template <typename Place, typename T>
 class TransposeKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+    auto* input = context.Input<framework::Tensor>("Input");
+    auto* output = context.Output<framework::Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
 
     auto axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
     switch (ndims) {
+      case 1:
+        break;
       case 2:
-        DoTranspose<Place, T, 2>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 2>(context, *input, *output, axis);
         break;
       case 3:
-        DoTranspose<Place, T, 3>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 3>(context, *input, *output, axis);
         break;
       case 4:
-        DoTranspose<Place, T, 4>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 4>(context, *input, *output, axis);
         break;
       case 5:
-        DoTranspose<Place, T, 5>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 5>(context, *input, *output, axis);
         break;
-      default:
-        NaiveCpuTranspose<Place, T>(context, *in, *out, axis);
+      case 6:
+        EigenTranspose<Place, T, 6>(context, *input, *output, axis);
         break;
+      default:
+        PADDLE_THROW("Tensors with rank at most 6 are supported");
     }
   }
 };
@@ -102,9 +75,11 @@ template <typename Place, typename T>
 class TransposeGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    out->mutable_data<T>(context.GetPlace());
+    auto* output_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Output"));
+    auto* input_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("Input"));
+    input_grad->mutable_data<T>(context.GetPlace());
 
     auto axis_temp = context.Attr<std::vector<int>>("axis");
     std::vector<int> axis(axis_temp);
@@ -116,21 +91,25 @@ class TransposeGradKernel : public framework::OpKernel {
     int ndims = axis.size();
 
     switch (ndims) {
+      case 1:
+        break;
       case 2:
-        DoTranspose<Place, T, 2>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 2>(context, *output_grad, *input_grad, axis);
         break;
       case 3:
-        DoTranspose<Place, T, 3>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 3>(context, *output_grad, *input_grad, axis);
         break;
       case 4:
-        DoTranspose<Place, T, 4>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 4>(context, *output_grad, *input_grad, axis);
         break;
       case 5:
-        DoTranspose<Place, T, 5>(context, *in, *out, axis);
+        EigenTranspose<Place, T, 5>(context, *output_grad, *input_grad, axis);
         break;
-      default:
-        NaiveCpuTranspose<Place, T>(context, *in, *out, axis);
+      case 6:
+        EigenTranspose<Place, T, 6>(context, *output_grad, *input_grad, axis);
         break;
+      default:
+        PADDLE_THROW("Tensors with rank at most 6 are supported");
     }
   }
 };
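Note on TransposeGradKernel above: both grad kernels reuse the forward transpose with the inverse of `axis`, built by the loop `axis[axis_temp[i]] = i` (visible in the deleted CUDA kernel and unchanged in this header), so the gradient is shuffled back into the input layout. A small Python sketch of why that loop inverts a permutation, illustrative only:

    def invert_axis(axis):
        # inverse[axis[i]] = i, exactly the assignment the grad kernels use
        inverse = [0] * len(axis)
        for i, a in enumerate(axis):
            inverse[a] = i
        return inverse

    axis = [0, 2, 3, 1]
    inv = invert_axis(axis)  # -> [0, 3, 1, 2]
    # composing the permutation with its inverse yields the identity
    assert [axis[i] for i in inv] == list(range(len(axis)))
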
diff --git a/python/paddle/v2/framework/tests/test_transpose_op.py b/python/paddle/v2/framework/tests/test_transpose_op.py
index 63021da6aa..8e7e12910d 100644
--- a/python/paddle/v2/framework/tests/test_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_transpose_op.py
@@ -1,26 +1,49 @@
 import unittest
 import numpy as np
-from gradient_checker import GradientChecker
-from op_test_util import OpTestMeta
-from paddle.v2.framework.op import Operator
+from op_test import OpTest
 
 
-class TestTransposeOp(unittest.TestCase):
-    __metaclass__ = OpTestMeta
-
+class TestTransposeOp(OpTest):
     def setUp(self):
-        self.type = "transpose"
-        self.inputs = {'X': np.random.random((3, 4)).astype("float32"), }
-        self.attrs = {'axis': [1, 0]}
-        self.outputs = {'Out': self.inputs['X'].transpose((1, 0))}
+        self.initTestCase()
+        self.op_type = "transpose"
+        self.inputs = {'Input': np.random.random(self.shape).astype("float32")}
+        self.attrs = {'axis': list(self.axis)}
+        self.outputs = {'Output': self.inputs['Input'].transpose(self.axis)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input'], 'Output')
+
+    def initTestCase(self):
+        self.shape = (3, 4)
+        self.axis = (1, 0)
+
+
+class TestCase1(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (0, 2, 1)
+
+
+class TestCase2(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+        self.axis = (0, 2, 3, 1)
+
+
+class TestCase3(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.axis = (4, 2, 3, 1, 0)
 
 
-class TransposeGradOpTest(GradientChecker):
-    def test_transpose(self):
-        op = Operator("transpose", X="X", Out="Out", axis=[1, 0])
-        inputs = {'X': np.random.random((32, 84)).astype("float32"), }
-        self.check_grad(op, inputs, set(["X"]), "Out", max_relative_error=0.5)
+class TestCase4(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6, 1)
+        self.axis = (4, 2, 3, 1, 0, 5)
 
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab
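A quick sanity check of the convention the new tests rely on: the outputs are compared against numpy.transpose, and transposing by `axis` and then by its inverse permutation recovers the original array, which is what makes the gradient kernels' inverse-axis trick valid (numpy sketch, illustrative only):

    import numpy as np

    x = np.random.random((2, 3, 4, 5)).astype("float32")
    axis = (0, 2, 3, 1)
    inv = np.argsort(axis)  # inverse permutation, same as the grad kernels build
    assert np.array_equal(x.transpose(axis).transpose(inv), x)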