Commit d6651b9b authored by xzl

Fixed a bug in the GPU implementation

Parent 17b4b980
paddle/operators/CMakeLists.txt

@@ -51,8 +51,7 @@ list(REMOVE_ITEM GENERAL_OPS
     minus_op
     mul_op
     recurrent_op
-    scale_op
-    transpose_op)
+    scale_op)

 op_library(net_op SRCS net_op.cc)
 op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
@@ -60,7 +59,6 @@ op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
            DEPS framework_proto tensor operator net_op)
 op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
-op_library(transpose_op SRCS transpose_op.cc transpose_op.cu DEPS paddle_memory device_context)

 foreach(src ${GENERAL_OPS})
     op_library(${src} SRCS ${src}.cc ${src}.cu)
...
paddle/operators/transpose_op.cc

@@ -31,6 +31,7 @@ class TransposeOp : public framework::OperatorWithKernel {
     auto axis = ctx.GetAttr<std::vector<int>>("axis");
     size_t in_dim_size = in_dim.size();
     size_t axis_size = axis.size();
+
     PADDLE_ENFORCE_EQ(
         in_dim_size, axis_size,
         "the input tensor dimensions should be equal to the axis size");
@@ -42,7 +43,7 @@ class TransposeOp : public framework::OperatorWithKernel {
           "the sorted axis should be [0, 1, ... dims - 1], "
           "the dims equals to the input tensor dimensions");
     }
-
+    //
     framework::DDim out_dim(in_dim);
     for (size_t i = 0; i < axis.size(); i++) {
       out_dim[i] = in_dim[axis[i]];
@@ -60,11 +61,12 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The output of transpose op");
     AddAttr<std::vector<int>>(
         "axis",
-        "a list of integers, and the num of integers should be "
-        "the same with the input tensor dimensions");
+        "a list of values, and the size of the list should be "
+        "the same with the input tensor dimensions, the tensor will "
+        "permute the axes according the the values given");
     AddComment(R"DOC(
-Transpose the input tensor.
-For example, input tensor shape(N, C, H, W) and axis {0, 2, 3, 1},
+The Tensor will be permuted according to the axis values given.
+For example, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1},
 the output tensor shape will be (N, H, W, C)
 )DOC");
   }
...
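Note: the shape-inference loop in the hunk above (out_dim[i] = in_dim[axis[i]]) is the whole transpose contract in miniature. A minimal standalone sketch of the rule, for reference only (PermuteShape and the concrete shape are illustrative, not part of the patch):

#include <cassert>
#include <cstdio>
#include <vector>

// The i-th output dimension is the axis[i]-th input dimension.
std::vector<int> PermuteShape(const std::vector<int>& in_dim,
                              const std::vector<int>& axis) {
  assert(in_dim.size() == axis.size());
  std::vector<int> out_dim(in_dim.size());
  for (size_t i = 0; i < axis.size(); i++) {
    out_dim[i] = in_dim[axis[i]];
  }
  return out_dim;
}

int main() {
  // The DOC example: shape (N, C, H, W) with axis {0, 2, 3, 1} -> (N, H, W, C).
  std::vector<int> out = PermuteShape({2, 3, 4, 5}, {0, 2, 3, 1});
  for (int d : out) std::printf("%d ", d);  // prints: 2 4 5 3
  std::printf("\n");
  return 0;
}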
paddle/operators/transpose_op.cu

@@ -12,6 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */

+#include <iostream>
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
 #include "paddle/operators/transpose_op.h"
@@ -24,7 +25,7 @@ __global__ void transpose_kernel(int nthreads, const T* in_data, T* out_data,
                                  int* offset_buffer, int ndims) {
   int* in_offset = offset_buffer;
   int* out_offset = offset_buffer + ndims;
-  int* axis = offset_buffer + ndims;
+  int* axis = offset_buffer + ndims * 2;

   int to_index = blockIdx.x * blockDim.x + threadIdx.x;

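Note: this one-line kernel change is the bug the commit message refers to. offset_buffer packs three consecutive ndims-sized int arrays, and the old code pointed axis at offset_buffer + ndims, aliasing out_offset, so the kernel permuted with the output strides instead of the axis values. A sketch of the intended layout, mirroring the host-side fill loop further down:

// offset_buffer holds 3 * ndims ints, written by the host in this order:
//   [0,          ndims)      in_offset   -- input strides
//   [ndims,      2 * ndims)  out_offset  -- output strides
//   [2 * ndims,  3 * ndims)  axis        -- the permutation
int* in_offset = offset_buffer;
int* out_offset = offset_buffer + ndims;
int* axis = offset_buffer + ndims * 2;  // old code used `+ ndims`, aliasing out_offset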
@@ -51,31 +52,37 @@ void TransposeCUDA(const framework::ExecutionContext& context,
   size_t ndims = in_dim.size();
   std::vector<int> in_offset(ndims, 1);
   std::vector<int> out_offset(ndims, 1);
-  std::vector<int64_t> buffer_dim_shape(1, ndims * 3);

+  auto cpu_place = platform::CPUPlace();
+  auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
+
+  // Get a host_buffer to cache the input offset, output offset and the axis.
+  std::vector<int64_t> buffer_dim_shape(1, ndims * 3);
   auto buffer_dims = framework::make_ddim(buffer_dim_shape);
   framework::Tensor host_buffer;
-  platform::CPUPlace cpu_place;
-  platform::GPUPlace gpu_place;
   int* host_buffer_data = host_buffer.mutable_data<int>(buffer_dims, cpu_place);
-  auto offset_buffer =
-      memory::Alloc(context.GetPlace(), ndims * 3 * sizeof(int));

   for (int i = ndims - 2; i >= 0; i--) {
     in_offset[i] = in_offset[i + 1] * in_dim[i + 1];
     out_offset[i] = out_offset[i + 1] * out_dim[i + 1];
   }

+  // copy the data to the host_buffer
   for (int i = 0; i < ndims; i++) {
     host_buffer_data[i] = in_offset[i];
     host_buffer_data[i + ndims] = out_offset[i];
     host_buffer_data[i + ndims * 2] = axis[i];
   }

+  // Get a device_buffer to cache the input offset, output offset and the axis.
+  auto offset_buffer = memory::Alloc(gpu_place, ndims * 3 * sizeof(int));
+
+  auto* cuda_device_context = reinterpret_cast<platform::CUDADeviceContext*>(
+      const_cast<platform::DeviceContext*>(context.device_context_));
+
+  // copy the host_buffer data to the device_buffer
   memory::Copy(gpu_place, offset_buffer, cpu_place, host_buffer_data,
-               ndims * 3 * sizeof(int));
+               ndims * 3 * sizeof(int), cuda_device_context->stream());

   int block = 512;
   int grid = (data_size + block - 1) / block;
   transpose_kernel<T><<<grid, block>>>(data_size, in_data, out_data,
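Note: the other half of the fix threads the device context's CUDA stream through memory::Copy, so the host-to-device copy of the offsets is issued on that stream rather than as a blocking copy. A rough sketch of the underlying CUDA pattern, with placeholder names (stream, device_buf, host_buf, num_bytes are illustrative, not identifiers from the patch):

// Copy and kernel launch enqueued on the same stream execute in order,
// so the kernel is guaranteed to see the copied offsets.
cudaMemcpyAsync(device_buf, host_buf, num_bytes, cudaMemcpyHostToDevice, stream);
transpose_kernel<T><<<grid, block, 0, stream>>>(/* ... */);

The patch itself still launches transpose_kernel without an explicit stream; the sketch shows the fully stream-ordered variant.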
...
paddle/operators/transpose_op.h

@@ -17,7 +17,6 @@
 #include <iostream>
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"

 namespace paddle {
 namespace operators {
...
test_transpose_op.py (new file)

import unittest
import numpy as np
from gradient_checker import GradientChecker
from op_test_util import OpTestMeta
from paddle.v2.framework.op import Operator


class TestTransposeOp(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
        self.type = "transpose"
        self.inputs = {'X': np.random.random((3, 4)).astype("float32"), }
        self.attrs = {'axis': [1, 0]}
        self.outputs = {'Out': self.inputs['X'].transpose((1, 0))}


class TransposeGradOpTest(GradientChecker):
    def test_transpose(self):
        op = Operator("transpose", X="X", Out="Out", axis=[1, 0])
        inputs = {'X': np.random.random((32, 84)).astype("float32"), }
        self.check_grad(op, inputs, set(["X"]), "Out", max_relative_error=0.5)


if __name__ == '__main__':
    unittest.main()