未验证 提交 44b45b9f 编写于 作者: Y Yiqun Liu 提交者: GitHub

Correct the use of DeviceContext in unittest sequence_pooling_test and...

Correct the use of DeviceContext in unittest sequence_pooling_test and sequence_padding_test (#22456)

* Add log in memory::Copy for debug purpose.

* Change to use context in DeviceContextPool directly in sequence_pooling_test, instead to new one.

* Change to use context in DeviceContextPool directly in sequence_padding_test, instead to new one.
test=develop

* Change the type of second_dim from size_t to int64_t.
test=develop
上级 b80eef79
......@@ -43,8 +43,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
......@@ -65,6 +67,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
......
......@@ -16,8 +16,9 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <vector>
template <typename DeviceContext, typename Place, typename T>
void TestSequencePadding(const paddle::framework::LoD& lod,
template <typename DeviceContext, typename T>
void TestSequencePadding(const DeviceContext &context,
const paddle::framework::LoD &lod,
const size_t sequence_width) {
paddle::framework::LoDTensor cpu_seq;
paddle::framework::LoDTensor cpu_seq_back;
......@@ -38,12 +39,11 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
cpu_seq.data<T>()[i] = static_cast<T>(i);
}
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
auto place = context.GetPlace();
if (paddle::platform::is_cpu_place(place)) {
seq = cpu_seq;
} else {
TensorCopySync(cpu_seq, *place, &seq);
TensorCopySync(cpu_seq, place, &seq);
seq.set_lod(lod);
}
......@@ -55,28 +55,28 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
padding.mutable_data<T>(padding_dims, *place);
padding.mutable_data<T>(padding_dims, place);
T* pad_value_data =
T *pad_value_data =
cpu_pad_value.mutable_data<T>({1}, paddle::platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
if (paddle::platform::is_cpu_place(*place)) {
if (paddle::platform::is_cpu_place(place)) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, *place, &pad_value);
TensorCopySync(cpu_pad_value, place, &pad_value);
}
paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq, &padding, pad_value, -1, 0, false,
context, seq, &padding, pad_value, -1, 0, false,
paddle::operators::math::kLengthBatchWidth);
seq_back.set_lod(lod);
seq_back.mutable_data<T>(seq_dims, *place);
seq_back.mutable_data<T>(seq_dims, place);
paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
*context, padding, &seq_back, -1, 0, false,
context, padding, &seq_back, -1, 0, false,
paddle::operators::math::kLengthBatchWidth);
if (paddle::platform::is_cpu_place(*place)) {
if (paddle::platform::is_cpu_place(place)) {
cpu_seq_back = seq_back;
} else {
TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back);
......@@ -88,33 +88,38 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
EXPECT_EQ(cpu_seq.data<T>()[i], cpu_seq_back.data<T>()[i]);
}
delete place;
delete context;
}
// Exercises PaddingLoDTensorFunctor / UnpaddingLoDTensorFunctor on CPU for a
// single-sequence LoD and a multi-sequence LoD, using the pooled device
// context instead of constructing a new one.
TEST(Seq2BatchPadding, CPU) {
  auto place = paddle::platform::CPUPlace();
  // Reuse the process-wide context owned by DeviceContextPool; it must not be
  // deleted by the test.
  auto *context = static_cast<paddle::platform::CPUDeviceContext *>(
      paddle::platform::DeviceContextPool::Instance().Get(place));

  // One sequence of length 10, width 16.
  paddle::framework::LoD lod1;
  lod1.push_back(std::vector<size_t>{0, 10});
  TestSequencePadding<paddle::platform::CPUDeviceContext, float>(*context, lod1,
                                                                 16);

  // Three sequences of lengths 2, 5, 3, width 128.
  paddle::framework::LoD lod2;
  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
  TestSequencePadding<paddle::platform::CPUDeviceContext, float>(*context, lod2,
                                                                 128);
}
#ifdef PADDLE_WITH_CUDA
// Same padding/unpadding round-trip as the CPU test, run on CUDA device 0 via
// the pooled CUDADeviceContext.
TEST(SequencePadding, CUDA) {
  auto place = paddle::platform::CUDAPlace(0);
  // Reuse the process-wide context owned by DeviceContextPool; it must not be
  // deleted by the test.
  auto *context = static_cast<paddle::platform::CUDADeviceContext *>(
      paddle::platform::DeviceContextPool::Instance().Get(place));

  // One sequence of length 10, width 16.
  paddle::framework::LoD lod1;
  lod1.push_back(std::vector<size_t>{0, 10});
  TestSequencePadding<paddle::platform::CUDADeviceContext, float>(*context,
                                                                  lod1, 16);

  // Three sequences of lengths 2, 5, 3, width 128.
  paddle::framework::LoD lod2;
  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
  TestSequencePadding<paddle::platform::CUDADeviceContext, float>(*context,
                                                                  lod2, 128);
}
#endif
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <string>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
......@@ -166,7 +167,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
auto& lod = input.lod()[lod_level - 1];
const size_t item_dim = output->numel() / output->dims()[0];
dim3 threads(1024, 1);
dim3 grid(lod.size(), 1);
dim3 grid(std::max(lod.size() - 1, 1UL), 1);
if (pooltype == "MAX") {
sequence_pool_kernel<
T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
......@@ -330,7 +331,7 @@ class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
auto& lod = in_grad->lod()[lod_level - 1];
const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
dim3 threads(1024, 1);
dim3 grid(lod.size(), 1);
dim3 grid(std::max(lod.size() - 1, 1UL), 1);
if (pooltype == "MAX") {
sequence_pool_grad_kernel<
T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
......
......@@ -16,18 +16,19 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <vector>
template <typename DeviceContext, typename Place, typename T>
void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
template <typename DeviceContext, typename T>
void TestSequencePoolingSum(const DeviceContext &context,
const paddle::framework::LoD &lod,
const int64_t second_dim) {
paddle::framework::LoDTensor cpu_out_grad;
paddle::framework::LoDTensor cpu_in_grad;
paddle::framework::LoDTensor out_grad;
paddle::framework::LoDTensor in_grad;
const size_t second_dim = 128u;
// construct out_grad's tensor in cpu
const size_t out_first_dim = lod[0].size() - 1;
auto out_dims = paddle::framework::make_ddim(
{static_cast<int64_t>(out_first_dim), static_cast<int64_t>(second_dim)});
{static_cast<int64_t>(out_first_dim), second_dim});
cpu_out_grad.mutable_data<T>(out_dims, paddle::platform::CPUPlace());
for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) {
......@@ -35,19 +36,18 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
}
// copy to dst out_grad
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
auto place = context.GetPlace();
if (paddle::platform::is_cpu_place(place)) {
out_grad = cpu_out_grad;
} else {
TensorCopySync(cpu_out_grad, *place, &out_grad);
TensorCopySync(cpu_out_grad, place, &out_grad);
}
// construct in_grad
in_grad.set_lod(lod);
auto in_dims = paddle::framework::make_ddim(
{static_cast<int64_t>(lod[0].back()), static_cast<int64_t>(second_dim)});
in_grad.mutable_data<T>(in_dims, context->GetPlace());
{static_cast<int64_t>(lod[0].back()), second_dim});
in_grad.mutable_data<T>(in_dims, place);
// check tensor contruction result
PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
......@@ -57,9 +57,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
// call functor
paddle::operators::math::SequencePoolGradFunctor<DeviceContext, T>()(
*context, "SUM", out_grad, &in_grad);
context, "SUM", out_grad, &in_grad);
if (paddle::platform::is_cpu_place(*place)) {
if (paddle::platform::is_cpu_place(place)) {
cpu_in_grad = in_grad;
} else {
TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad);
......@@ -69,12 +69,12 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
EXPECT_EQ(in_grad.numel(), static_cast<int64_t>(lod[0].back() * second_dim));
EXPECT_EQ(in_grad.lod(), lod);
if (paddle::platform::is_cpu_place(*place)) {
if (paddle::platform::is_cpu_place(place)) {
for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
int64_t begin = in_grad.lod()[0][i];
int64_t end = in_grad.lod()[0][i + 1];
paddle::framework::Tensor tmp = in_grad.Slice(begin, end);
for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
for (int64_t m = 0; m != second_dim; ++m) {
EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
out_grad.data<T>()[m + i * second_dim]);
......@@ -86,7 +86,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
int64_t begin = cpu_in_grad.lod()[0][i];
int64_t end = cpu_in_grad.lod()[0][i + 1];
paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end);
for (size_t j = 0; j != tmp.numel() / second_dim; ++j) {
for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
for (int64_t m = 0; m != second_dim; ++m) {
EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
cpu_out_grad.data<T>()[m + i * second_dim]);
......@@ -94,33 +94,38 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
}
}
}
delete place;
delete context;
}
// Exercises SequencePoolGradFunctor ("SUM") on CPU for a single-sequence LoD
// and a multi-sequence LoD with second_dim = 128, using the pooled device
// context instead of constructing a new one.
TEST(SequencePoolingGrad, CPU_SUM) {
  auto place = paddle::platform::CPUPlace();
  // Reuse the process-wide context owned by DeviceContextPool; it must not be
  // deleted by the test.
  auto *context = static_cast<paddle::platform::CPUDeviceContext *>(
      paddle::platform::DeviceContextPool::Instance().Get(place));

  // One sequence of length 10.
  paddle::framework::LoD lod1;
  lod1.push_back(std::vector<size_t>{0, 10});
  TestSequencePoolingSum<paddle::platform::CPUDeviceContext, float>(*context,
                                                                    lod1, 128);

  // Three sequences of lengths 2, 5, 3.
  paddle::framework::LoD lod2;
  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
  TestSequencePoolingSum<paddle::platform::CPUDeviceContext, float>(*context,
                                                                    lod2, 128);
}
#ifdef PADDLE_WITH_CUDA
// Same "SUM" pooling-gradient check as the CPU test, run on CUDA device 0 via
// the pooled CUDADeviceContext.
TEST(SequencePoolingGrad, CUDA_SUM) {
  auto place = paddle::platform::CUDAPlace(0);
  // Reuse the process-wide context owned by DeviceContextPool; it must not be
  // deleted by the test.
  auto *context = static_cast<paddle::platform::CUDADeviceContext *>(
      paddle::platform::DeviceContextPool::Instance().Get(place));

  // One sequence of length 10.
  paddle::framework::LoD lod1;
  lod1.push_back(std::vector<size_t>{0, 10});
  TestSequencePoolingSum<paddle::platform::CUDADeviceContext, float>(*context,
                                                                     lod1, 128);

  // Three sequences of lengths 2, 5, 3.
  paddle::framework::LoD lod2;
  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
  TestSequencePoolingSum<paddle::platform::CUDADeviceContext, float>(*context,
                                                                     lod2, 128);
}
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册