diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index c08d86eb213310b4e8dbac541c254867bb44b903..4871de3682437faa937670b666b2e3757d074dac 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -43,8 +43,10 @@ void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { if (UNLIKELY(num == 0)) return; - platform::SetDeviceId(src_place.device); + platform::SetDeviceId(src_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); @@ -65,6 +67,8 @@ void Copy( if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index 4f61b1029c65aedaf4fce771866964fe1d0d6112..eab4553ae8b9745b71a21198d0e5ccf1b8a62a35 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -16,8 +16,9 @@ limitations under the License.
*/ #include #include -template -void TestSequencePadding(const paddle::framework::LoD& lod, +template +void TestSequencePadding(const DeviceContext &context, + const paddle::framework::LoD &lod, const size_t sequence_width) { paddle::framework::LoDTensor cpu_seq; paddle::framework::LoDTensor cpu_seq_back; @@ -38,12 +39,11 @@ void TestSequencePadding(const paddle::framework::LoD& lod, cpu_seq.data()[i] = static_cast(i); } - auto* place = new Place(); - DeviceContext* context = new DeviceContext(*place); - if (paddle::platform::is_cpu_place(*place)) { + auto place = context.GetPlace(); + if (paddle::platform::is_cpu_place(place)) { seq = cpu_seq; } else { - TensorCopySync(cpu_seq, *place, &seq); + TensorCopySync(cpu_seq, place, &seq); seq.set_lod(lod); } @@ -55,28 +55,28 @@ void TestSequencePadding(const paddle::framework::LoD& lod, static_cast(num_sequences), static_cast(sequence_width)}); - padding.mutable_data(padding_dims, *place); + padding.mutable_data(padding_dims, place); - T* pad_value_data = + T *pad_value_data = cpu_pad_value.mutable_data({1}, paddle::platform::CPUPlace()); *pad_value_data = static_cast(0); - if (paddle::platform::is_cpu_place(*place)) { + if (paddle::platform::is_cpu_place(place)) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, *place, &pad_value); + TensorCopySync(cpu_pad_value, place, &pad_value); } paddle::operators::math::PaddingLoDTensorFunctor()( - *context, seq, &padding, pad_value, -1, 0, false, + context, seq, &padding, pad_value, -1, 0, false, paddle::operators::math::kLengthBatchWidth); seq_back.set_lod(lod); - seq_back.mutable_data(seq_dims, *place); + seq_back.mutable_data(seq_dims, place); paddle::operators::math::UnpaddingLoDTensorFunctor()( - *context, padding, &seq_back, -1, 0, false, + context, padding, &seq_back, -1, 0, false, paddle::operators::math::kLengthBatchWidth); - if (paddle::platform::is_cpu_place(*place)) { + if (paddle::platform::is_cpu_place(place)) { cpu_seq_back = seq_back; } else { 
TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back); @@ -88,33 +88,38 @@ void TestSequencePadding(const paddle::framework::LoD& lod, for (int64_t i = 0; i < cpu_seq.numel(); ++i) { EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); } - - delete place; - delete context; } TEST(Seq2BatchPadding, CPU) { + auto place = paddle::platform::CPUPlace(); + auto *context = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)); + paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding(lod1, 16); + TestSequencePadding(*context, lod1, + 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding(lod2, 128); + TestSequencePadding(*context, lod2, + 128); } #ifdef PADDLE_WITH_CUDA TEST(SequencePadding, CUDA) { + auto place = paddle::platform::CUDAPlace(0); + auto *context = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)); + paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding(lod1, 16); + TestSequencePadding(*context, + lod1, 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding(lod2, 128); + TestSequencePadding(*context, + lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 91545131e4cbb5d6dcae9c111e97598ee54cc898..124028ee1bcf2001091a7e3ff13dbbfa0850513e 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" @@ -166,7 +167,7 @@ class SequencePoolFunctor { auto& lod = input.lod()[lod_level - 1]; const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); - dim3 grid(lod.size(), 1); + dim3 grid(std::max(lod.size() - 1, 1UL), 1); if (pooltype == "MAX") { sequence_pool_kernel< T, MaxPoolFunctor><<>>( @@ -330,7 +331,7 @@ class SequencePoolGradFunctor { auto& lod = in_grad->lod()[lod_level - 1]; const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; dim3 threads(1024, 1); - dim3 grid(lod.size(), 1); + dim3 grid(std::max(lod.size() - 1, 1UL), 1); if (pooltype == "MAX") { sequence_pool_grad_kernel< T, MaxPoolGradFunctor><<>>( diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index cf6e89b3d9f11f2b68322ef15ddf026625f6a5a5..efab1a375b56bea3caec2c8169dc390298a37cbe 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -16,18 +16,19 @@ limitations under the License. 
*/ #include #include -template -void TestSequencePoolingSum(const paddle::framework::LoD& lod) { +template +void TestSequencePoolingSum(const DeviceContext &context, + const paddle::framework::LoD &lod, + const int64_t second_dim) { paddle::framework::LoDTensor cpu_out_grad; paddle::framework::LoDTensor cpu_in_grad; paddle::framework::LoDTensor out_grad; paddle::framework::LoDTensor in_grad; - const size_t second_dim = 128u; // construct out_grad's tensor in cpu const size_t out_first_dim = lod[0].size() - 1; auto out_dims = paddle::framework::make_ddim( - {static_cast(out_first_dim), static_cast(second_dim)}); + {static_cast(out_first_dim), second_dim}); cpu_out_grad.mutable_data(out_dims, paddle::platform::CPUPlace()); for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) { @@ -35,19 +36,18 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { } // copy to dst out_grad - auto* place = new Place(); - DeviceContext* context = new DeviceContext(*place); - if (paddle::platform::is_cpu_place(*place)) { + auto place = context.GetPlace(); + if (paddle::platform::is_cpu_place(place)) { out_grad = cpu_out_grad; } else { - TensorCopySync(cpu_out_grad, *place, &out_grad); + TensorCopySync(cpu_out_grad, place, &out_grad); } // construct in_grad in_grad.set_lod(lod); auto in_dims = paddle::framework::make_ddim( - {static_cast(lod[0].back()), static_cast(second_dim)}); - in_grad.mutable_data(in_dims, context->GetPlace()); + {static_cast(lod[0].back()), second_dim}); + in_grad.mutable_data(in_dims, place); // check tensor contruction result PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); @@ -57,9 +57,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { // call functor paddle::operators::math::SequencePoolGradFunctor()( - *context, "SUM", out_grad, &in_grad); + context, "SUM", out_grad, &in_grad); - if (paddle::platform::is_cpu_place(*place)) { + if (paddle::platform::is_cpu_place(place)) { cpu_in_grad = in_grad; } else { 
TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad); @@ -69,12 +69,12 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); - if (paddle::platform::is_cpu_place(*place)) { + if (paddle::platform::is_cpu_place(place)) { for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { int64_t begin = in_grad.lod()[0][i]; int64_t end = in_grad.lod()[0][i + 1]; paddle::framework::Tensor tmp = in_grad.Slice(begin, end); - for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], out_grad.data()[m + i * second_dim]); @@ -86,7 +86,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { int64_t begin = cpu_in_grad.lod()[0][i]; int64_t end = cpu_in_grad.lod()[0][i + 1]; paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); - for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], cpu_out_grad.data()[m + i * second_dim]); @@ -94,33 +94,38 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { } } } - - delete place; - delete context; } TEST(SequencePoolingGrad, CPU_SUM) { + auto place = paddle::platform::CPUPlace(); + auto *context = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)); + paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum(lod1); + TestSequencePoolingSum(*context, + lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum(lod2); + TestSequencePoolingSum(*context, + lod2, 128); } #ifdef PADDLE_WITH_CUDA TEST(SequencePoolingGrad, CUDA_SUM) { + auto place = paddle::platform::CUDAPlace(0); + auto *context = 
static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)); + paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum(lod1); + TestSequencePoolingSum(*context, + lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum(lod2); + TestSequencePoolingSum(*context, + lod2, 128); } #endif