Commit 78808b20 authored by zchen0211

1 api

Parent b851515b
@@ -169,7 +169,7 @@ void CondOp::Run(const Scope& scope,
         tensor_child->Resize(dim);
         tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-        CPUTGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                          tensor_child);
+        CPUGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+                         tensor_child);
       }
     }
...
@@ -38,19 +38,6 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
   }
 }
 
-// Implementation of GPU copy:
-template <typename T>
-struct GPUGather {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    GatherCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                         slice_size);
-  }
-};
-
 /**
  * A thin wrapper on gpu tensor
  * Return a new tensor from source tensor, gathered according to index
@@ -59,7 +46,7 @@ struct GPUGather {
  * return: output tensor
  */
 template <typename T>
-void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
-                Tensor* output) {
+void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
+               Tensor* output) {
   PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
@@ -74,10 +61,15 @@ void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Gathering
-  GPUGather<T> gather_functor;
-  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                 output->data<T>());
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+  GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                       slice_size);
 }
 
 }  // namespace operators
...
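The kernel body is outside this hunk. A minimal sketch of what GatherCUDAKernel plausibly computes, given the launch above: one thread per output element, and grid = (n + block - 1) / block rounds up so all n = slice_size * index_size elements are covered (e.g. n = 1000 with block = 512 gives grid = 2). The "Sketch" name marks this as an illustration, not the kernel from the codebase.

// Sketch only: each thread i copies one element of one gathered slice.
template <typename T>
__global__ void GatherCUDAKernelSketch(const T* params, const int* indices,
                                       T* output, int index_size,
                                       int slice_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < index_size * slice_size) {
    int indices_i = i / slice_size;            // which index entry
    int slice_i = i - indices_i * slice_size;  // offset inside the slice
    output[i] = params[indices[indices_i] * slice_size + slice_i];
  }
}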
@@ -24,29 +24,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// Implementation of CPU copy
-template <typename T>
-struct CPUGather {
-  void operator()(const T* src, const int* indices, const int slice_size,
-                  const int index_size, T* output) {
-    const size_t slice_bytes = slice_size * sizeof(T);
-
-    for (int i = 0; i < index_size; ++i) {
-      int index_ = indices[i];
-      memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
-    }
-  }
-};
-
 /**
- * A thin wrapper on cpu tensor
+ * A thin wrapper for gathering on cpu tensor
  * Return a new tensor from source tensor, gathered according to index
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void CPUTGather(const platform::Place& place,
-                const paddle::framework::Tensor* src,
-                const paddle::framework::Tensor* index,
-                paddle::framework::Tensor* output) {
+void CPUGather(const platform::Place& place,
+               const paddle::framework::Tensor* src,
+               const paddle::framework::Tensor* index,
+               paddle::framework::Tensor* output) {
@@ -59,14 +45,20 @@ void CPUTGather(const platform::Place& place,
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Gathering
-  CPUGather<T> gather_functor;
-  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                 output->data<T>());
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+  }
 }
 
 }  // namespace operators
...
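The inlined loop is easiest to read on plain arrays. A self-contained sketch of the same row-wise copy (names here are illustrative, not from the Paddle codebase): output row i receives src row index[i], each row being slice_size elements wide.

#include <cstdio>
#include <cstring>

// output row i <- src row index[i]
void cpu_gather_sketch(const float* src, const int* index, int index_size,
                       int slice_size, float* output) {
  const size_t slice_bytes = slice_size * sizeof(float);
  for (int i = 0; i < index_size; ++i) {
    std::memcpy(output + i * slice_size, src + index[i] * slice_size,
                slice_bytes);
  }
}

int main() {
  float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // a 2x4 tensor, flattened
  int index[2] = {1, 0};                    // swap the two rows
  float out[8];
  cpu_gather_sketch(src, index, 2, 4, out);
  for (int i = 0; i < 8; ++i) printf("%g ", out[i]);  // 4 5 6 7 0 1 2 3
  return 0;
}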
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(ctx.GetPlace());
 
-    GPUTGather<T>(ctx.GetPlace(), x, index, output);
+    GPUGather<T>(ctx.GetPlace(), x, index, output);
   }
 };
@@ -53,7 +53,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<platform::GPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    GPUTScatter<T>(ctx.GetPlace(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
   }
 };
...
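The gradient kernel above zeroes dX and then scatter-assigns rows of dO into it: gather's backward routes each output row back to the input row it came from. A minimal sketch of the relationship for slice_size == 1 (illustrative; note that plain assignment matches accumulation only when Index has no repeated entries):

// Forward: out[i] = x[index[i]].  Backward: dx[index[i]] receives dout[i],
// with dx zeroed first (mirroring dxt.constant(0) above). Using += also
// covers repeated indices, which scatter-assign alone does not.
void gather_grad_sketch(const float* dout, const int* index, int n,
                        float* dx /* pre-zeroed */) {
  for (int i = 0; i < n; ++i) dx[index[i]] += dout[i];
}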
@@ -36,7 +36,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(ctx.GetPlace());
 
-    CPUTGather<T>(ctx.GetPlace(), x, index, output);
+    CPUGather<T>(ctx.GetPlace(), x, index, output);
   }
 };
...
@@ -41,7 +41,7 @@ TEST(Gather, GatherData) {
   int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
 
-  CPUTGather<int>(CPUPlace(), src, index, output);
+  CPUGather<int>(CPUPlace(), src, index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
...
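The expected values follow from the setup elided above this hunk: with src filled row-wise as 0, 1, 2, ... and index = {1, 0}, output row 0 is src row 1 (values 4..7) and output row 1 is src row 0 (values 0..3). A hypothetical reconstruction of that setup, consistent with the assertions (the actual lines are outside the hunk):

// Hypothetical setup for TEST(Gather, GatherData); illustration only.
int* p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
int* p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
for (int i = 0; i < 12; ++i) p_src[i] = i;  // rows {0..3}, {4..7}, {8..11}
p_index[0] = 1;  // output row 0 <- src row 1 -> 4 5 6 7
p_index[1] = 0;  // output row 1 <- src row 0 -> 0 1 2 3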
@@ -36,20 +36,6 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
   }
 }
 
-// Implementation of GPU copy:
-template <typename T>
-struct GPUScatterAssign {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    // printf("grid, block: %d %d\n", grid, block);
-    ScatterCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                          slice_size);
-  }
-};
-
 /**
  * A thin wrapper on gpu tensor
  * Return a new updated tensor from source tensor, scatter-assigned according to
@@ -59,7 +45,7 @@ struct GPUScatterAssign {
  * return: output tensor
  */
 template <typename T>
-void GPUTScatter(const platform::Place& place,
-                 const paddle::framework::Tensor* src,
-                 const paddle::framework::Tensor* index,
-                 paddle::framework::Tensor* output) {
+void GPUScatterAssign(const platform::Place& place,
+                      const paddle::framework::Tensor* src,
+                      const paddle::framework::Tensor* index,
+                      paddle::framework::Tensor* output) {
@@ -76,10 +62,16 @@ void GPUTScatter(const platform::Place& place,
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Scatter Assign
-  GPUScatterAssign<T> scatter_functor;
-  scatter_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                  output->data<T>());
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+
+  ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                        slice_size);
 }
 
 }  // namespace operators
...
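As with gather, the kernel body sits outside this hunk. A minimal sketch of what ScatterCUDAKernel plausibly does under the same launch configuration: it is gather's mirror image, writing at the indexed slice instead of reading from it. Again, the "Sketch" name marks this as an illustration.

// Sketch only. Repeated indices make two threads write the same destination,
// so which source slice "wins" is not deterministic.
template <typename T>
__global__ void ScatterCUDAKernelSketch(const T* params, const int* indices,
                                        T* output, int index_size,
                                        int slice_size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < index_size * slice_size) {
    int indices_i = i / slice_size;
    int slice_i = i - indices_i * slice_size;
    output[indices[indices_i] * slice_size + slice_i] = params[i];
  }
}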
@@ -25,19 +25,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-// Implementation of CPU copy
-template <typename T>
-void CPUScatterAssign(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output) {
-  // paddle::framework::DDim output_dims = output->dims();
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = index[i];
-    memcpy(output + index_ * slice_size, src + i * slice_size, slice_bytes);
-  }
-}
-
 /**
  * Return a updated tensor from source tensor, scattered according to index:
  * dst[i] = src[index[i]]
@@ -70,7 +57,12 @@ void ScatterAssign(const platform::Place& place,
   size_t slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  CPUScatterAssign<T>(p_src, p_index, slice_size, index_size, p_output);
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
+  }
 }
 
 }  // namespace operators
...
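On plain arrays the inlined loop amounts to the following sketch (illustrative names, not from the codebase): src row i overwrites output row index[i]. Because it assigns rather than accumulates, a repeated index means the last occurrence wins.

#include <cstring>

// output row index[i] <- src row i
void cpu_scatter_assign_sketch(const float* src, const int* index,
                               int index_size, int slice_size, float* output) {
  const size_t slice_bytes = slice_size * sizeof(float);
  for (int i = 0; i < index_size; ++i) {
    std::memcpy(output + index[i] * slice_size, src + i * slice_size,
                slice_bytes);
  }
}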
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
     Out->ShareDataWith<T>(*Ref);
 
-    GPUTScatter<T>(ctx.GetPlace(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
   }
 };
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates = dO[Index]
-    GPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
  }
 };
...
@@ -56,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    CPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
   }
 };
...
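The "Gradient by Gather" comment is why both scatter gradient kernels reuse gather: if the forward pass does Out[Index[i]] = Updates[i], then dUpdates[i] = dOut[Index[i]], which is exactly a gather of dOut at Index. An illustrative sketch for slice_size == 1:

// dUpdates[i] = dOut[Index[i]] -- a plain gather (illustration only).
void scatter_grad_updates_sketch(const float* dout, const int* index, int n,
                                 float* dupdates) {
  for (int i = 0; i < n; ++i) dupdates[i] = dout[index[i]];
}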