提交 7ef568e8 编写于 作者: Q qijun

fix gpu unittest error

上级 4130e5fa
if(WITH_GPU) if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor selected_rows)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(softmax SRCS softmax.cc DEPS operator)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context)
endif() endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor selected_rows)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -191,7 +192,7 @@ struct SelectedRowsAdd<platform::GPUPlace, T> { ...@@ -191,7 +192,7 @@ struct SelectedRowsAdd<platform::GPUPlace, T> {
auto in2_place = input2.place(); auto in2_place = input2.place();
PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)) PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy( memory::Copy(
boost::get<platform::GPUPlace>(out_place), out_data, boost::get<platform::GPUPlace>(out_place), out_data,
...@@ -211,22 +212,26 @@ struct SelectedRowsAdd<platform::GPUPlace, T> { ...@@ -211,22 +212,26 @@ struct SelectedRowsAdd<platform::GPUPlace, T> {
template struct SelectedRowsAdd<platform::GPUPlace, float>; template struct SelectedRowsAdd<platform::GPUPlace, float>;
namespace { namespace {
template <int block_size, typename T> template <typename T>
__global__ void SelectedRowsAddTensorKernel(T* selected_rows, int64_t* rows, __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
T* tensor_in, T* tensor_out, const int64_t* rows,
const int64_t row_numel) { T* tensor_out,
const ty = blockIdx.y; int64_t row_numel,
int block_size) {
const int ty = blockIdx.y;
int tid = threadIdx.x; int tid = threadIdx.x;
selected_rows += ty * row_numel; selected_rows += ty * row_numel;
tensor_in += rows[ty] * row_numel;
tensor_out += rows[ty] * row_numel; tensor_out += rows[ty] * row_numel;
for (int index = tid; index < row_numel; index += block_size) { for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] = tensor_in[index] + selected_rows[index]; // Since index in rows of SelectedRows can be duplicate, we can not use
// tensor_out[index] += selected_rows[index]; Instead, we have to use
// AtomicAdd to avoid concurrent write error.
paddle::platform::CudaAtomicAdd(&tensor_out[index], selected_rows[index]);
} }
} }
} } // namespace
template <typename T> template <typename T>
struct SelectedRowsAddTensor<platform::GPUPlace, T> { struct SelectedRowsAddTensor<platform::GPUPlace, T> {
...@@ -250,13 +255,22 @@ struct SelectedRowsAddTensor<platform::GPUPlace, T> { ...@@ -250,13 +255,22 @@ struct SelectedRowsAddTensor<platform::GPUPlace, T> {
auto* in2_data = input2.data<T>(); auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>(); auto* out_data = output->data<T>();
const int block_size = 256; SetConstant<platform::GPUPlace, T> functor;
functor(context, output, 0.0);
int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, in1_height); dim3 grid(1, in1_height);
SelectedRowsAddTensorKernel<block_size, T><<< SelectedRowsAddTensorKernel<T><<<
grid, threads, 0, grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>( reinterpret_cast<const platform::CUDADeviceContext&>(context).stream()
in1_data, in1_rows.data(), in2_data, out_data, in1_row_numel); >>>(in1_data, in1_rows.data(),
out_data, in1_row_numel, block_size);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.GetEigenDevice<platform::GPUPlace>()) =
out_eigen + in2_eigen;
} }
}; };
......
...@@ -183,20 +183,21 @@ TEST(math_function, selected_rows_add) { ...@@ -183,20 +183,21 @@ TEST(math_function, selected_rows_add) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::operators::math; using namespace paddle::operators::math;
CPUPlace gpu_place(0); GPUPlace gpu_place(0);
CPUPlace cpu_place;
CUDADeviceContext ctx(gpu_place); CUDADeviceContext ctx(gpu_place);
SetConstant<GPUPlace, float> functor; SetConstant<GPUPlace, float> functor;
int64_t height = 10; int64_t height = 10;
int64_t row_numel = 10; int64_t row_numel = 10;
Vector<int64_t> rows1{0, 4, 7}; std::vector<int64_t> rows1{0, 4, 7};
std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)}; std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value(); auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>( in1_value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place); make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
functor(ctx, in1_value, 1.0); functor(ctx, in1_value, 1.0);
Vector<int64_t> rows2{0, 5, 7, 9}; std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)}; std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value(); auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>( in2_value->mutable_data<float>(
...@@ -228,7 +229,7 @@ TEST(math_function, selected_rows_add) { ...@@ -228,7 +229,7 @@ TEST(math_function, selected_rows_add) {
EXPECT_EQ(out_rows[6], 9); EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu; Tensor out_cpu;
out_cpu.CopyFrom<float>(*out_value, platform::CPUPlace(), ctx); out_cpu.CopyFrom<float>(*out_value, cpu_place, ctx);
ctx.Wait(); ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>(); auto* out_cpu_data = out_cpu.data<float>();
...@@ -256,10 +257,10 @@ TEST(math_function, selected_rows_add) { ...@@ -256,10 +257,10 @@ TEST(math_function, selected_rows_add) {
add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
Tensor tensor2_cpu; Tensor tensor2_cpu;
tensor2_cpu.CopyFrom<float>(*tensor2, platform::CPUPlace(), ctx); tensor2_cpu.CopyFrom<float>(*tensor2, cpu_place, ctx);
ctx.Wait(); ctx.Wait();
auto* tensor2_cpu_data = tensor2_cpu->data<float>(); auto* tensor2_cpu_data = tensor2_cpu.data<float>();
// row0: 1.0 + 2.0 + 3.0 // row0: 1.0 + 2.0 + 3.0
EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0); EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0);
// row1: 3.0 // row1: 3.0
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册