Commit 78808b20 authored by zchen0211

1 api

Parent b851515b
......@@ -169,8 +169,8 @@ void CondOp::Run(const Scope& scope,
tensor_child->Resize(dim);
tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-CPUTGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                  tensor_child);
+CPUGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+                 tensor_child);
}
}
......
......@@ -38,19 +38,6 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
}
}
-// Implementation of GPU copy:
-template <typename T>
-struct GPUGather {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    GatherCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                         slice_size);
-  }
-};
/**
* A thin wrapper on gpu tensor
* Return a new tensor from source tensor, gathered according to index
......@@ -59,8 +46,8 @@ struct GPUGather {
* return: output tensor
*/
template <typename T>
-void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
-                Tensor* output) {
+void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
+               Tensor* output) {
PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
......@@ -74,10 +61,15 @@ void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-// Gathering
-GPUGather<T> gather_functor;
-gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-               output->data<T>());
+const T* p_src = src->data<T>();
+const int* p_index = index->data<int>();
+T* p_output = output->data<T>();
+int block = 512;
+int n = slice_size * index_size;
+int grid = (n + block - 1) / block;
+GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                     slice_size);
}
} // namespace operators
......
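The two GPU hunks above collapse the old two-step API (a GPUGather functor plus a GPUTGather wrapper) into the single free function GPUGather, inlining the kernel launch. The launch math is standard ceiling division: with n = slice_size * index_size elements and block = 512 threads, grid = (n + block - 1) / block rounds up so every element is covered. The body of GatherCUDAKernel is elided in this diff, so the following is only a sketch of the per-thread mapping such a kernel plausibly uses (hypothetical names GatherSketch and gather_sketch_launch, not Paddle API):

// Hedged sketch: one thread per element of the flattened
// [index_size, slice_size] result; row r of the output is row index[r] of src.
template <typename T>
__global__ void GatherSketch(const T* src, const int* index, T* output,
                             int index_size, int slice_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < index_size * slice_size) {
    int row = i / slice_size;  // which index entry this element belongs to
    int col = i % slice_size;  // offset inside that slice
    output[i] = src[index[row] * slice_size + col];
  }
}

template <typename T>
void gather_sketch_launch(const T* src, const int* index, T* output,
                          int index_size, int slice_size) {
  int block = 512;
  int n = slice_size * index_size;
  int grid = (n + block - 1) / block;  // ceiling division, as in the diff
  GatherSketch<T><<<grid, block>>>(src, index, output, index_size, slice_size);
}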
......@@ -24,32 +24,18 @@ limitations under the License. */
namespace paddle {
namespace operators {
-// Implementation of CPU copy
-template <typename T>
-struct CPUGather {
-  void operator()(const T* src, const int* indices, const int slice_size,
-                  const int index_size, T* output) {
-    const size_t slice_bytes = slice_size * sizeof(T);
-    for (int i = 0; i < index_size; ++i) {
-      int index_ = indices[i];
-      memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
-    }
-  }
-};
/**
-* A thin wrapper on cpu tensor
+* A thin wrapper for gathering on cpu tensor
* Return a new tensor from source tensor, gathered according to index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
-void CPUTGather(const platform::Place& place,
-                const paddle::framework::Tensor* src,
-                const paddle::framework::Tensor* index,
-                paddle::framework::Tensor* output) {
+void CPUGather(const platform::Place& place,
+               const paddle::framework::Tensor* src,
+               const paddle::framework::Tensor* index,
+               paddle::framework::Tensor* output) {
PADDLE_ENFORCE(platform::is_cpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
......@@ -59,14 +45,20 @@ void CPUTGather(const platform::Place& place,
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
+const T* p_src = src->data<T>();
+const int* p_index = index->data<int>();
+T* p_output = output->data<T>();
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-// Gathering
-CPUGather<T> gather_functor;
-gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-               output->data<T>());
+const size_t slice_bytes = slice_size * sizeof(T);
+for (int i = 0; i < index_size; ++i) {
+  int index_ = p_index[i];
+  memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+}
}
} // namespace operators
......
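The CPU file gets the same treatment: the CPUGather struct is deleted and its slice-copy loop moves into the renamed free function. Semantically, output row i becomes src row index[i], copied as one memcpy of slice_bytes = slice_size * sizeof(T) contiguous bytes. A standalone sketch of that loop with plain pointers instead of framework::Tensor (hypothetical helper name cpu_gather_sketch):

#include <cstring>

// Row-wise gather over contiguous storage: output[i] = src[index[i]],
// where a "row" is slice_size consecutive elements of type T.
template <typename T>
void cpu_gather_sketch(const T* src, const int* index, int index_size,
                       int slice_size, T* output) {
  const size_t slice_bytes = slice_size * sizeof(T);
  for (int i = 0; i < index_size; ++i) {
    std::memcpy(output + i * slice_size, src + index[i] * slice_size,
                slice_bytes);
  }
}

The gather_test.cc hunk below checks exactly this behavior: its expectations (p_output[i] == i + 4 for the first row, i - 4 for the second) are consistent with a 2 x 4 source holding 0..7 gathered with index = {1, 0}.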
......@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
output->mutable_data<T>(ctx.GetPlace());
-GPUTGather<T>(ctx.GetPlace(), x, index, output);
+GPUGather<T>(ctx.GetPlace(), x, index, output);
}
};
......@@ -53,7 +53,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
auto place = ctx.GetEigenDevice<platform::GPUPlace>();
dxt.device(place) = dxt.constant(static_cast<T>(0));
-GPUTScatter<T>(ctx.GetPlace(), dO, Index, dX);
+GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
}
};
......
......@@ -36,7 +36,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
output->mutable_data<T>(ctx.GetPlace());
-CPUTGather<T>(ctx.GetPlace(), x, index, output);
+CPUGather<T>(ctx.GetPlace(), x, index, output);
}
};
......
......@@ -41,7 +41,7 @@ TEST(Gather, GatherData) {
int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
-CPUTGather<int>(CPUPlace(), src, index, output);
+CPUGather<int>(CPUPlace(), src, index, output);
for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
......
......@@ -36,20 +36,6 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
}
}
-// Implementation of GPU copy:
-template <typename T>
-struct GPUScatterAssign {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    // printf("grid, block: %d %d\n", grid, block);
-    ScatterCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                          slice_size);
-  }
-};
/**
* A thin wrapper on gpu tensor
* Return a new updated tensor from source tensor, scatter-assigned according to
......@@ -59,10 +45,10 @@ struct GPUScatterAssign {
* return: output tensor
*/
template <typename T>
-void GPUTScatter(const platform::Place& place,
-                 const paddle::framework::Tensor* src,
-                 const paddle::framework::Tensor* index,
-                 paddle::framework::Tensor* output) {
+void GPUScatterAssign(const platform::Place& place,
+                      const paddle::framework::Tensor* src,
+                      const paddle::framework::Tensor* index,
+                      paddle::framework::Tensor* output) {
PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
......@@ -76,10 +62,16 @@ void GPUTScatter(const platform::Place& place,
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-// Scatter Assign
-GPUScatterAssign<T> scatter_functor;
-scatter_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                output->data<T>());
+const T* p_src = src->data<T>();
+const int* p_index = index->data<int>();
+T* p_output = output->data<T>();
+int block = 512;
+int n = slice_size * index_size;
+int grid = (n + block - 1) / block;
+ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                      slice_size);
}
} // namespace operators
......
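ScatterCUDAKernel's body is elided as well; by symmetry with gather it plausibly copies src row r (src has shape [index_size, slice_size]) into output row index[r]. A hedged sketch of that mapping (hypothetical name ScatterSketch); the launch configuration is the same ceiling-division grid as in the gather file:

// Hedged sketch: thread i reads one element of src and writes it to
// row index[row] of output. If index contains duplicates, multiple threads
// target the same destination and the surviving value is unspecified,
// which is the usual caveat for scatter-assign.
template <typename T>
__global__ void ScatterSketch(const T* src, const int* index, T* output,
                              int index_size, int slice_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < index_size * slice_size) {
    int row = i / slice_size;
    int col = i % slice_size;
    output[index[row] * slice_size + col] = src[i];
  }
}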
......@@ -25,19 +25,6 @@ namespace operators {
using Tensor = framework::Tensor;
-// Implementation of CPU copy
-template <typename T>
-void CPUScatterAssign(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output) {
-  // paddle::framework::DDim output_dims = output->dims();
-  const size_t slice_bytes = slice_size * sizeof(T);
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = index[i];
-    memcpy(output + index_ * slice_size, src + i * slice_size, slice_bytes);
-  }
-}
/**
* Return a updated tensor from source tensor, scattered according to index:
* dst[i] = src[index[i]]
......@@ -70,7 +57,12 @@ void ScatterAssign(const platform::Place& place,
size_t slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-CPUScatterAssign<T>(p_src, p_index, slice_size, index_size, p_output);
+const size_t slice_bytes = slice_size * sizeof(T);
+for (int i = 0; i < index_size; ++i) {
+  int index_ = p_index[i];
+  memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
+}
}
} // namespace operators
......
......@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
Out->ShareDataWith<T>(*Ref);
-GPUTScatter<T>(ctx.GetPlace(), Updates, Index, Out);
+GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
}
};
......@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
-GPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
}
};
......
......@@ -56,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates += dO[Index]
-CPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+CPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
}
};
......
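Across the op kernels the forward/backward duality is explicit: scatter's gradient with respect to Updates is a gather through the same index, as the "// Gradient by Gather" comments in the hunks above state. A compact CPU sketch of the pair with plain pointers (hypothetical names, assuming rows of slice_size contiguous elements), showing that the backward pass is the forward memcpy with its source and destination offsets swapped:

#include <cstring>

// Forward scatter-assign: output[index[i]] = updates[i].
template <typename T>
void scatter_assign_sketch(const T* updates, const int* index, int index_size,
                           int slice_size, T* output) {
  const size_t bytes = slice_size * sizeof(T);
  for (int i = 0; i < index_size; ++i)
    std::memcpy(output + index[i] * slice_size, updates + i * slice_size,
                bytes);
}

// Backward w.r.t. Updates: d_updates[i] = d_output[index[i]], a gather
// through the same index, i.e. the copy above with offsets exchanged.
template <typename T>
void scatter_grad_sketch(const T* d_output, const int* index, int index_size,
                         int slice_size, T* d_updates) {
  const size_t bytes = slice_size * sizeof(T);
  for (int i = 0; i < index_size; ++i)
    std::memcpy(d_updates + i * slice_size, d_output + index[i] * slice_size,
                bytes);
}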