diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index b20e27bebbeddf8cf4bfa868716458d8c0b11a87..f1374bc8f7bd7cacb1c96575a6b3472b1cc3d9b4 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -410,6 +410,9 @@ void Tracer::TraceOp(const std::string& type,
   VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: "
           << use_default_attr_map;
   std::map<phi::DenseTensor*, phi::DenseTensor*> need_backup_inputs2outputs;
+  std::map<phi::DenseTensor*, std::shared_ptr<phi::Allocation>>
+      need_backup_inputs2holder;
+  std::map<phi::DenseTensor*, phi::DDim> need_backup_inputs2strides;
   if (FLAGS_use_stride_kernel) {
     for (auto& iter : inplace_map) {
       auto inputs_iter = ins.find(iter.first);
@@ -426,11 +429,12 @@ void Tracer::TraceOp(const std::string& type,
                 outputs_iter->second[i]
                     ->MutableVar()
                     ->GetMutable<phi::DenseTensor>();
+            need_backup_inputs2holder[dense_tensor] = dense_tensor->Holder();
+            need_backup_inputs2strides[dense_tensor] = dense_tensor->strides();
           }
         }
       }
     }
-
     TraceOpImpl<egr::EagerVariable>(type,
                                     ins,
                                     outs,
@@ -443,7 +447,11 @@ void Tracer::TraceOp(const std::string& type,
 
     auto dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
     for (auto& iter : need_backup_inputs2outputs) {
-      paddle::experimental::TransStride(dev_ctx, iter.second, iter.first);
+      iter.first->ResetHolder(need_backup_inputs2holder[iter.first]);
+      iter.first->set_strides(need_backup_inputs2strides[iter.first]);
+      paddle::experimental::TransStrideLegacy(dev_ctx, iter.second, iter.first);
+      iter.second->ResetHolder(need_backup_inputs2holder[iter.first]);
+      iter.second->set_strides(need_backup_inputs2strides[iter.first]);
     }
   } else {
     TraceOpImpl<egr::EagerVariable>(type,
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
index d1e549c91c4df17c432ef02cc8239bf6bc96a7df..e3f58683d988a7898437c2b0ec478860a830c2d2 100644
--- a/paddle/phi/api/lib/api_gen_utils.cc
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -423,6 +423,56 @@ void TransStride(phi::DeviceContext* dev_ctx,
   }
 }
 
+void TransStrideLegacy(phi::DeviceContext* dev_ctx,
+                       phi::DenseTensor* from,
+                       phi::DenseTensor* to) {
+  if (to) {
+    auto* cpu_ctx = dynamic_cast<phi::CPUContext*>(dev_ctx);
+    if (cpu_ctx) {
+      PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] {
+                           phi::StridedCopyKernel<data_t, phi::CPUContext>(
+                               *cpu_ctx,
+                               *from,
+                               phi::vectorize<int64_t>(to->dims()),
+                               phi::vectorize<int64_t>(to->strides()),
+                               to->offset(),
+                               to);
+                         }));
+      return;
+    }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    auto* gpu_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
+    if (gpu_ctx) {
+      PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] {
+                           phi::StridedCopyKernel<data_t, phi::GPUContext>(
+                               *gpu_ctx,
+                               *from,
+                               phi::vectorize<int64_t>(to->dims()),
+                               phi::vectorize<int64_t>(to->strides()),
+                               to->offset(),
+                               to);
+                         }));
+      return;
+    }
+#endif
+#ifdef PADDLE_WITH_XPU
+    auto* xpu_ctx = dynamic_cast<phi::XPUContext*>(dev_ctx);
+    if (xpu_ctx) {
+      PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] {
+                           phi::StridedCopyKernel<data_t, phi::XPUContext>(
+                               *xpu_ctx,
+                               *from,
+                               phi::vectorize<int64_t>(to->dims()),
+                               phi::vectorize<int64_t>(to->strides()),
+                               to->offset(),
+                               to);
+                         }));
+      return;
+    }
+#endif
+  }
+}
+
 void TransStride(phi::DeviceContext* dev_ctx,
                  const std::vector<phi::DenseTensor*>& from,
                  const std::vector<phi::DenseTensor*>& to) {
diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h
index afe312b7096416a262ede2256906d69479470a24..1b552bf94eaa4d51eddd3955752a9a7f6a5a48f7 100644
--- a/paddle/phi/api/lib/api_gen_utils.h
+++ b/paddle/phi/api/lib/api_gen_utils.h
@@ -133,6 +133,10 @@ void TransStride(phi::DeviceContext* dev_ctx,
                  phi::SelectedRows* from,
                  phi::SelectedRows* to);
 
+void TransStrideLegacy(phi::DeviceContext* dev_ctx,
+                       phi::DenseTensor* from,
+                       phi::DenseTensor* to);
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 /* ------------------ for auto parallel ----------------------- */
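
Note on the tracer.cc change: before `TraceOpImpl` runs, the original allocation (`Holder()`) and strides of each inplace output are recorded; after `TransStrideLegacy` copies the result back, both tensors are reset to that saved buffer and layout. Below is a minimal, self-contained sketch of this backup/restore pattern, not Paddle code: `ToyTensor`, `Snapshot`, and `run_inplace_op` are hypothetical names standing in for `phi::DenseTensor`, the two `need_backup_*` maps, and `TraceOpImpl`.

```cpp
// Sketch of the holder/strides backup-restore pattern (hypothetical types).
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct ToyTensor {
  std::shared_ptr<std::vector<float>> holder;  // allocation, like Holder()
  std::vector<long> strides;                   // layout, like strides()
};

struct Snapshot {
  std::shared_ptr<std::vector<float>> holder;
  std::vector<long> strides;
};

// Stand-in for the traced op: may reallocate and change the layout.
void run_inplace_op(ToyTensor* t) {
  t->holder = std::make_shared<std::vector<float>>(t->holder->size());
  t->strides = {1, 2};  // some new, non-original strides
}

int main() {
  ToyTensor input{std::make_shared<std::vector<float>>(4), {2, 1}};

  // 1. Record the original allocation and strides before the op runs,
  //    mirroring need_backup_inputs2holder / need_backup_inputs2strides.
  std::map<ToyTensor*, Snapshot> backup;
  backup[&input] = {input.holder, input.strides};

  run_inplace_op(&input);

  // 2. Restore them afterwards, mirroring ResetHolder()/set_strides(),
  //    so the caller-visible tensor keeps its pre-op buffer and layout.
  input.holder = backup[&input].holder;
  input.strides = backup[&input].strides;

  assert(input.strides == (std::vector<long>{2, 1}));
  return 0;
}
```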
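Similarly, `TransStrideLegacy` selects the concrete backend at runtime by `dynamic_cast`-ing the base `phi::DeviceContext` pointer and returning after the first successful cast. A stripped-down sketch of that dispatch shape, with hypothetical `BaseCtx`/`CpuCtx`/`GpuCtx` and `copy_with_strides` standing in for the real context types and `StridedCopyKernel`:

```cpp
// Sketch of dynamic_cast-based device dispatch (hypothetical types).
#include <iostream>
#include <typeinfo>

struct BaseCtx { virtual ~BaseCtx() = default; };  // polymorphic base
struct CpuCtx : BaseCtx {};
struct GpuCtx : BaseCtx {};

// Stand-in for the templated StridedCopyKernel<data_t, Context>.
template <typename Ctx>
void copy_with_strides(const Ctx&) {
  std::cout << "strided copy on " << typeid(Ctx).name() << "\n";
}

void trans_stride(BaseCtx* ctx) {
  // Try each backend in turn; the first cast that succeeds wins,
  // mirroring the cpu_ctx / gpu_ctx / xpu_ctx chain above.
  if (auto* cpu = dynamic_cast<CpuCtx*>(ctx)) { copy_with_strides(*cpu); return; }
  if (auto* gpu = dynamic_cast<GpuCtx*>(ctx)) { copy_with_strides(*gpu); return; }
}

int main() {
  CpuCtx cpu;
  GpuCtx gpu;
  trans_stride(&cpu);  // dispatches to the CPU instantiation
  trans_stride(&gpu);  // dispatches to the GPU instantiation
  return 0;
}
```

The cast chain is needed because the device is only known at runtime, while the kernel is templated on the concrete context type; compiling each branch under its own `#if` guard keeps GPU/XPU code out of CPU-only builds.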