diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 543ac9d638594a455c9ae6a0e5c85e56773e5d10..39be7865149c70d0514e0a0cc64d0175437469d7 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -71,6 +71,7 @@ void AnakinEngine::Execute( const std::map &inputs, const std::map &outputs, cudaStream_t stream) { + cudaDeviceSynchronize(); for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 9a40cf4b60a64c3d0452a4367ccb7ac36de6b3b8..33de430fc8cf3d73bd0e290d9a4b8c1e8a33e80b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -74,6 +74,19 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return res; } +PaddleDType ZeroCopyTensor::type() { + EAGER_GET_TENSOR; + auto type = tensor->type(); + if (type == framework::proto::VarType::FP32) { + return PaddleDType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + return PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + return PaddleDType::FLOAT32; +} + template void ZeroCopyTensor::copy_from_cpu(const T *data) { EAGER_GET_TENSOR; @@ -119,6 +132,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { static_cast(pool.Get(gpu_place)); memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, t_data, ele_num * sizeof(T), dev_ctx->stream()); + cudaDeviceSynchronize(); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 703fd18069474f28b29c6f16c6308fc19bd3527f..52c5cb34b5378cea1788bb8845f42cfe2f6590b4 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -177,6 +177,8 @@ class ZeroCopyTensor { device_ = device; } + PaddleDType type(); + protected: explicit ZeroCopyTensor(void* scope) : scope_{scope} {} void SetName(const std::string& name) { name_ = name; } @@ -191,6 +193,7 @@ class ZeroCopyTensor { // performance. mutable void* tensor_{nullptr}; PaddlePlace place_; + PaddleDType dtype_; int device_; };