未验证 提交 a270fdf2 编写于 作者: C chengduo 提交者: GitHub

Fix SelectedRowsAdd bug (#14309)

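* fix selected_rows bug: launch the SelectedRowsAddTensor / SelectedRowsAddToTensor / UpdateToTensor kernels with one block per selected row along grid.x (read via blockIdx.x) instead of grid.y (CUDA caps grid.y at 65535 blocks, while grid.x allows up to 2^31-1)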
test=develop

* refine cos_sim: lay out the CosSimDyKernel launch blocks along grid.x instead of grid.y
test=develop
上级 1001f8e1
...@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> { ...@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
T* dy) const { T* dy) const {
const int block_size = 512; const int block_size = 512;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, (rows + block_size - 1) / block_size); dim3 grid((rows + block_size - 1) / block_size, 1);
CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>( CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
x_norm, y_norm, x, y, z, dz, rows, cols, dy); x_norm, y_norm, x, y, z, dz, rows, cols, dy);
} }
......
...@@ -81,7 +81,7 @@ template <typename T, int block_size> ...@@ -81,7 +81,7 @@ template <typename T, int block_size>
__global__ void SelectedRowsAddTensorKernel(const T* selected_rows, __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
const int64_t* rows, T* tensor_out, const int64_t* rows, T* tensor_out,
int64_t row_numel) { int64_t row_numel) {
const int ty = blockIdx.y; const int ty = blockIdx.x;
int tid = threadIdx.x; int tid = threadIdx.x;
selected_rows += ty * row_numel; selected_rows += ty * row_numel;
...@@ -123,7 +123,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -123,7 +123,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
const int block_size = 256; const int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, in1_rows.size()); dim3 grid(in1_rows.size(), 1);
SelectedRowsAddTensorKernel< SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
...@@ -188,7 +188,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, ...@@ -188,7 +188,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
const int64_t* rows, const int64_t* rows,
T* tensor_out, T* tensor_out,
int64_t row_numel) { int64_t row_numel) {
const int ty = blockIdx.y; const int ty = blockIdx.x;
int tid = threadIdx.x; int tid = threadIdx.x;
selected_rows += ty * row_numel; selected_rows += ty * row_numel;
...@@ -221,7 +221,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -221,7 +221,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
auto* in2_data = input2->data<T>(); auto* in2_data = input2->data<T>();
const int block_size = 256; const int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, in1_rows.size()); dim3 grid(in1_rows.size(), 1);
SelectedRowsAddToTensorKernel< SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
...@@ -388,7 +388,7 @@ template <typename T, int block_size> ...@@ -388,7 +388,7 @@ template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows, __global__ void UpdateToTensorKernel(const T* selected_rows,
const int64_t* rows, const ScatterOps& op, const int64_t* rows, const ScatterOps& op,
T* tensor_out, int64_t row_numel) { T* tensor_out, int64_t row_numel) {
const int ty = blockIdx.y; const int ty = blockIdx.x;
int tid = threadIdx.x; int tid = threadIdx.x;
selected_rows += ty * row_numel; selected_rows += ty * row_numel;
...@@ -457,7 +457,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> { ...@@ -457,7 +457,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto* in2_data = input2->data<T>(); auto* in2_data = input2->data<T>();
dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(1, in1_rows.size()); dim3 grid(in1_rows.size(), 1);
UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<< UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
op, in2_data, in1_row_numel); op, in2_data, in1_row_numel);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册