未验证 提交 e3a64fca 编写于 作者: Q Qiyang Min 提交者: GitHub

Merge pull request #13835 from velconia/fix_reshape_op

Fix Reshape op when input is the same with output
...@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto size = src.numel() * SizeOfType(src.type()); auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
...@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) { if (platform::is_same_place(src_place, dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream); stream);
} else { } else {
...@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type()); auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
...@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) && } else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
......
...@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { ...@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
Tensor slice_tensor = src_tensor.Slice(1, 2); Tensor slice_tensor = src_tensor.Slice(1, 2);
...@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { ...@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
// Copy the same tensor
TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
gpu_ctx.Wait();
const int* dst_ptr_tmp = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr_tmp);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
}
Tensor slice_tensor = src_tensor.Slice(1, 2); Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor // CPU Slice Tensor to GPU Tensor
......
...@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of ...@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of
[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
Attr(shape) still should be set correctly to gurantee shape inference in Attr(shape) still should be set correctly to gurantee shape inference in
compile-time. compile-time.
)DOC"); )DOC");
...@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp { ...@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp {
: ReshapeOp(type, inputs, outputs, attrs) {} : ReshapeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
ReshapeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"), PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of ReshapeOp should not be null."); "Output(XShape) of ReshapeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X"); const auto &x_dims = ctx->GetInputDim("X");
...@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp { ...@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp {
} }
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape"); ctx->ShareLoD("X", /*->*/ "XShape");
ReshapeOp::InferShape(ctx);
} }
}; };
......
...@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, ...@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
paddle::framework::DefaultGradOpDescMaker<false>); paddle::framework::DefaultGradOpDescMaker<false>);
template <typename T> template <typename T>
using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>; using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>); REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
Kernel<int64_t>);
REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel,
op::SeqConcatGradShapeInferer); op::SeqConcatGradShapeInferer);
template <typename T> template <typename T>
using GradKernel = using GradKernel =
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>; op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>, REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
GradKernel<double>); GradKernel<double>, GradKernel<int64_t>);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册