Unverified commit 2052f1e3, authored by Zhanlue Yang, committed by GitHub

[Unify Tensors PR #8] Merged Tensor into DenseTensor, test=allcases (#38914)

* Merged LoDTensor with Tensor, test=allcases

* Patched python level LoDTensor

* Patched python level LoDTensor

* Merge Tensor into DenseTensor

* Fixed namespace issues, test=allcases

* Fixed merge issues

* Fixed inference issues

* Fixed NPU test issues

* Fixed merge issues
Parent bfacd706
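The overall shape of the change, condensed into one sketch (the declarations are copied from the hunks below; includes trimmed, full LoD API omitted): framework::Tensor stops being a subclass of pten::DenseTensor and becomes a plain alias, framework::LoDTensor is re-pointed at the same type, and the former member helpers SplitLoDTensor/MergeLoDTensor become free functions, so call sites move from t.MergeLoDTensor(ptrs, place) to MergeLoDTensor(&t, ptrs, place).

// Condensed sketch of the end state after this PR (not the full patch).
#include <vector>

#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/dense_tensor.h"

namespace paddle {
namespace framework {

// Both framework names now alias the unified pten type.
using Tensor = pten::DenseTensor;
using LoDTensor = pten::DenseTensor;

// The former LoDTensor member functions become free functions that take the
// tensor explicitly (declared in paddle/fluid/framework/lod_tensor.h below).
std::vector<LoDTensor> SplitLoDTensor(
    const LoDTensor& src, const std::vector<platform::Place> places);
void MergeLoDTensor(LoDTensor* target,
                    const std::vector<const LoDTensor*>& lod_tensors,
                    platform::Place dst_place);

}  // namespace framework
}  // namespace paddle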
......@@ -36,7 +36,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
class Scope;
class SelectedRows;
class Variable;
......
......@@ -145,8 +145,8 @@ bool DistModel::LoadParameters() {
return true;
}
void DistModel::Run(const std::vector<framework::Tensor> &input_data,
std::vector<framework::Tensor> *output_data) {
void DistModel::Run(const std::vector<paddle::framework::Tensor> &input_data,
std::vector<paddle::framework::Tensor> *output_data) {
/* TODO(fleet exe dev): implement this funct */
}
......
......@@ -18,6 +18,7 @@
#include <vector>
#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
......@@ -25,7 +26,6 @@ namespace paddle {
namespace framework {
class ProgramDesc;
class Scope;
class Tensor;
}
namespace distributed {
......@@ -45,8 +45,8 @@ class DistModel {
public:
explicit DistModel(const DistModelConfig& config) : config_(config) {}
bool Init();
void Run(const std::vector<framework::Tensor>& input_data,
std::vector<framework::Tensor>* output_data);
void Run(const std::vector<paddle::framework::Tensor>& input_data,
std::vector<paddle::framework::Tensor>* output_data);
~DistModel() = default;
private:
......
......@@ -20,10 +20,13 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Variable;
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace distributed {
......
......@@ -31,11 +31,14 @@ class PSClient;
class PSServer;
} // namespace distributed
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
......
......@@ -32,11 +32,14 @@ class PSClient;
class PSServer;
} // namespace distributed
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
......
......@@ -29,7 +29,6 @@ DECLARE_bool(use_mkldnn);
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
namespace platform {
......@@ -37,6 +36,10 @@ class DeviceContext;
} // namespace platform
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace egr {
namespace legacy {
......
......@@ -68,7 +68,7 @@ static bool CopySameTensorTestMain(const DDim &dims,
if (sync_copy) {
TensorCopySync(src_tensor, dst_place, &src_tensor);
} else {
TensorCopy(src_tensor, dst_place, &src_tensor);
paddle::framework::TensorCopy(src_tensor, dst_place, &src_tensor);
platform::DeviceContextPool::Instance().Get(src_place)->Wait();
platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
}
......
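This hunk and many later ones qualify helper calls explicitly: TensorCopy, TensorCopySync, TensorToVector, TensorFromVector, and later LaunchSameDimsElementwiseCudaKernel all gain a paddle::framework:: or paddle::operators:: prefix. The likely reason (an inference, not stated in the commit message): once the argument type is pten::DenseTensor, which lives in namespace pten, unqualified calls made outside paddle::framework can no longer find these functions through argument-dependent lookup, and fully qualifying every site is the simplest mechanical fix. A minimal, self-contained toy illustration (not Paddle code):

// Toy namespaces only; demonstrates why the qualification is needed.
#include <iostream>

namespace pten {
struct DenseTensor {};  // stand-in for the unified tensor type
}  // namespace pten

namespace paddle {
namespace framework {
// A type alias does not add paddle::framework to DenseTensor's
// associated namespaces for argument-dependent lookup.
using Tensor = pten::DenseTensor;

void TensorCopy(const Tensor&) { std::cout << "copied\n"; }
}  // namespace framework
}  // namespace paddle

int main() {
  paddle::framework::Tensor t;
  // TensorCopy(t);                  // error: neither ordinary lookup (global
                                     // scope) nor ADL (::pten) finds the name
  paddle::framework::TensorCopy(t);  // OK: fully qualified, as in this PR
}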
......@@ -28,8 +28,9 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
// NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) {
TensorCopy(in, dst_place,
*platform::DeviceContextPool::Instance().Get(dst_place), out);
paddle::framework::TensorCopy(
in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
out);
return;
}
......
......@@ -21,8 +21,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
Tensor* out);
......
......@@ -50,12 +50,15 @@ DECLARE_bool(enable_slotrecord_reset_shrink);
namespace paddle {
namespace framework {
class DataFeedDesc;
class Tensor;
class Scope;
class Variable;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......
......@@ -25,7 +25,6 @@
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
} // namespace framework
} // namespace paddle
......
......@@ -31,7 +31,6 @@ namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
class Variable;
void TransformData(const OpKernelType &expected_kernel_type,
......
......@@ -25,7 +25,6 @@ namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
......
......@@ -169,7 +169,7 @@ FetchResultType AsyncSSAGraphExecutor::Run(
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&(BOOST_GET(LoDTensor, val.at(fetch_idx))));
LoDTensor var;
var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace());
ret.emplace_back(var);
} else {
auto array = BOOST_GET(LoDTensorArray, val.at(fetch_idx));
......@@ -179,7 +179,8 @@ FetchResultType AsyncSSAGraphExecutor::Run(
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&array[i]);
item_array.emplace_back();
item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
MergeLoDTensor(&(item_array.back()), lodtensor_ptrs,
platform::CPUPlace());
}
ret.emplace_back(item_array);
}
......
......@@ -22,14 +22,18 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
......
......@@ -81,7 +81,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
}
auto &val = BOOST_GET(FetchList, *data_);
LoDTensor var;
var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
MergeLoDTensor(&var, tensors_ptr, platform::CPUPlace());
val.at(offset_) = std::move(var);
} else {
auto &array = BOOST_GET_CONST(LoDTensorArray, tensors_[0]);
......@@ -99,7 +99,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
tensors_ptr.push_back(&element[i]);
}
tmp_array.emplace_back();
tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
MergeLoDTensor(&(tmp_array.back()), tensors_ptr, platform::CPUPlace());
}
auto &val = BOOST_GET(FetchList, *data_);
val.at(offset_) = std::move(tmp_array);
......
......@@ -16,11 +16,10 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......
......@@ -19,11 +19,9 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......
......@@ -275,7 +275,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
}
if (lodtensor_ptrs.size() != 0) {
LoDTensor var;
var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace());
ret.emplace_back(var);
} else {
LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
......@@ -285,7 +285,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
}
var.MergeLoDTensor(ptrs, platform::CPUPlace());
MergeLoDTensor(&var, ptrs, platform::CPUPlace());
var_array[i] = std::move(var);
}
ret.emplace_back(var_array);
......
......@@ -18,11 +18,9 @@
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......
......@@ -22,7 +22,6 @@
namespace paddle {
namespace framework {
class Scope;
class Tensor;
class Variable;
namespace ir {
......@@ -31,6 +30,10 @@ class MemOptVarInfo;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
namespace details {
......
......@@ -16,9 +16,12 @@
#include "paddle/fluid/framework/selected_rows.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
......
......@@ -19,7 +19,6 @@
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
......
......@@ -14,10 +14,13 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Scope;
void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
......
......@@ -43,7 +43,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
class ProgramDesc;
class Scope;
} // namespace framework
......
......@@ -21,8 +21,6 @@
namespace paddle {
namespace framework {
class Tensor;
class DLPackTensor {
public:
using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t
......
......@@ -15,9 +15,12 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/platform/cpu_helper.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
......
......@@ -19,10 +19,13 @@ limitations under the License. */
#include <boost/variant.hpp>
#include "glog/logging.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Variable;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
......
......@@ -20,10 +20,13 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Scope;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
......
......@@ -18,9 +18,12 @@
#include "paddle/fluid/framework/op_version_registry.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Scope;
} // namespace framework
} // namespace paddle
......
......@@ -19,9 +19,12 @@
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Scope;
} // namespace framework
} // namespace paddle
......
......@@ -15,11 +15,9 @@
#include "paddle/fluid/framework/ir/delete_dropout_op_pass.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......
......@@ -16,11 +16,9 @@
#include <string>
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......
......@@ -22,11 +22,9 @@ limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -206,9 +204,11 @@ void TestMainImpl(std::string func_name, std::string code_str,
for (int64_t i = 0; i < cpu_tensors[id].numel(); ++i) {
tmp_cpu_ptr[i] = paddle::platform::float16(cpu_ptr[i]);
}
TensorCopySync(tmp_cpu_tensors[id], place, &gpu_tensors[id]);
paddle::framework::TensorCopySync(tmp_cpu_tensors[id], place,
&gpu_tensors[id]);
} else {
TensorCopySync(cpu_tensors[id], place, &gpu_tensors[id]);
paddle::framework::TensorCopySync(cpu_tensors[id], place,
&gpu_tensors[id]);
}
args.push_back(&gpu_ptrs[id]);
}
......@@ -234,8 +234,8 @@ void TestMainImpl(std::string func_name, std::string code_str,
paddle::platform::float16* tmp_cpu_ptr =
tmp_cpu_tensors[id].mutable_data<paddle::platform::float16>(
cpu_tensors[id].dims(), paddle::platform::CPUPlace());
TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
&tmp_cpu_tensors[id]);
paddle::framework::TensorCopySync(
gpu_tensors[id], paddle::platform::CPUPlace(), &tmp_cpu_tensors[id]);
float* cpu_ptr = cpu_tensors[id].mutable_data<float>(
cpu_tensors[id].dims(), paddle::platform::CPUPlace());
......@@ -243,8 +243,8 @@ void TestMainImpl(std::string func_name, std::string code_str,
cpu_ptr[i] = static_cast<float>(tmp_cpu_ptr[i]);
}
} else {
TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
&cpu_tensors[id]);
paddle::framework::TensorCopySync(
gpu_tensors[id], paddle::platform::CPUPlace(), &cpu_tensors[id]);
}
}
}
......
......@@ -319,14 +319,47 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
const std::vector<platform::Place> places) const {
LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
LoD length_lod;
length_lod.reserve(offset_lod.size());
for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
std::vector<size_t> level;
if (offset_lod[lvl].size() > 0) {
level.reserve(offset_lod[lvl].size() - 1);
}
for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
}
length_lod.push_back(level);
}
return length_lod;
}
LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
LoD offset_lod;
offset_lod.reserve(length_lod.size());
for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
std::vector<size_t> level;
level.reserve(length_lod[lvl].size() + 1);
size_t tmp = 0;
level.push_back(tmp);
for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
tmp += length_lod[lvl][idx];
level.push_back(tmp);
}
offset_lod.push_back(level);
}
return offset_lod;
}
std::vector<LoDTensor> SplitLoDTensor(
const LoDTensor &src, const std::vector<platform::Place> places) {
PADDLE_ENFORCE_GT(places.size(), 0,
platform::errors::InvalidArgument(
"Place number cannot be empty when splitting."));
check_memory_size();
size_t batch_size =
lod().empty() ? static_cast<size_t>(dims()[0]) : lod()[0].size() - 1;
src.check_memory_size();
size_t batch_size = src.lod().empty() ? static_cast<size_t>(src.dims()[0])
: src.lod()[0].size() - 1;
// if batch_size is 0, just return #places.size() copys of empty
// tensors.
......@@ -335,10 +368,10 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
empty_results.reserve(places.size());
for (size_t i = 0; i < places.size(); ++i) {
LoDTensor dst;
dst.Resize(dims());
dst.mutable_data(places[i], type());
if (!lod().empty()) {
dst.set_lod(lod());
dst.Resize(src.dims());
dst.mutable_data(places[i], src.type());
if (!src.lod().empty()) {
dst.set_lod(src.lod());
}
empty_results.emplace_back(std::move(dst));
}
......@@ -360,17 +393,18 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
begin, end));
LoDTensor dst;
if (lod().empty()) {
auto src = Slice(begin, end);
if (src.lod().empty()) {
auto sliced_src = src.Slice(begin, end);
auto &dst_place = places[i];
framework::TensorCopy(src, dst_place, &dst);
framework::TensorCopy(sliced_src, dst_place, &dst);
} else {
auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
auto lod_and_offset =
GetSubLoDAndAbsoluteOffset(src.lod(), begin, end, 0);
auto &offset = lod_and_offset.second;
auto src = Slice(offset.first, offset.second);
auto sliced_src = src.Slice(offset.first, offset.second);
auto &dst_place = places[i];
framework::TensorCopy(src, dst_place, &dst);
framework::TensorCopy(sliced_src, dst_place, &dst);
LoD my_lod;
for (auto &l : lod_and_offset.first) {
......@@ -388,9 +422,9 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
return results;
}
void LoDTensor::MergeLoDTensor(
const std::vector<const LoDTensor *> &lod_tensors,
platform::Place dst_place) {
void MergeLoDTensor(LoDTensor *target,
const std::vector<const LoDTensor *> &lod_tensors,
platform::Place dst_place) {
PADDLE_ENFORCE_EQ(lod_tensors.empty(), false,
platform::errors::InvalidArgument(
"The LoDTensors to be merged are empty."));
......@@ -449,10 +483,10 @@ void LoDTensor::MergeLoDTensor(
}
}
}
Resize(new_dim);
set_layout(new_layout);
set_lod(new_lod);
mutable_data(dst_place, new_type);
target->Resize(new_dim);
target->set_layout(new_layout);
target->set_lod(new_lod);
target->mutable_data(dst_place, new_type);
int begin = 0;
for (auto *src : lod_tensors) {
......@@ -460,44 +494,11 @@ void LoDTensor::MergeLoDTensor(
if (end == begin) {
continue;
}
auto dst = Slice(begin, end);
auto dst = target->Slice(begin, end);
framework::TensorCopy(*src, dst_place, &dst);
begin = end;
}
}
LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
LoD length_lod;
length_lod.reserve(offset_lod.size());
for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
std::vector<size_t> level;
if (offset_lod[lvl].size() > 0) {
level.reserve(offset_lod[lvl].size() - 1);
}
for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
}
length_lod.push_back(level);
}
return length_lod;
}
LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
LoD offset_lod;
offset_lod.reserve(length_lod.size());
for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
std::vector<size_t> level;
level.reserve(length_lod[lvl].size() + 1);
size_t tmp = 0;
level.push_back(tmp);
for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
tmp += length_lod[lvl][idx];
level.push_back(tmp);
}
offset_lod.push_back(level);
}
return offset_lod;
}
} // namespace framework
} // namespace paddle
......@@ -36,7 +36,15 @@ class DeviceContext;
namespace paddle {
namespace framework {
using LoDTensor = paddle::framework::Tensor;
using LoDTensor = pten::DenseTensor;
// Split Tensor and copy to each place specified in places.
std::vector<LoDTensor> SplitLoDTensor(
const LoDTensor& src, const std::vector<platform::Place> places);
void MergeLoDTensor(LoDTensor* target,
const std::vector<const LoDTensor*>& lod_tensors,
platform::Place dst_place);
/*
* LoD is short for Level of Details.
......
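Since the two LoD conversion helpers are only relocated in lod_tensor.cc above and the header merely notes that "LoD is short for Level of Details", here is a self-contained sketch of the same conversions; it uses std::vector in place of paddle::framework::Vector, and the concrete LoD values are a made-up example, not taken from the patch.

// Standalone sketch of the LoD conversions moved in lod_tensor.cc above.
// Offset-based: each level stores cumulative boundaries, e.g. {0, 2, 5, 6}.
// Length-based: each level stores segment lengths,        e.g. {2, 3, 1}.
#include <cstddef>
#include <iostream>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

LoD ConvertToLengthBasedLoD(const LoD& offset_lod) {
  LoD length_lod;
  for (const auto& level : offset_lod) {
    std::vector<size_t> lengths;
    for (size_t i = 0; i + 1 < level.size(); ++i)
      lengths.push_back(level[i + 1] - level[i]);
    length_lod.push_back(lengths);
  }
  return length_lod;
}

LoD ConvertToOffsetBasedLoD(const LoD& length_lod) {
  LoD offset_lod;
  for (const auto& level : length_lod) {
    std::vector<size_t> offsets{0};
    for (size_t len : level) offsets.push_back(offsets.back() + len);
    offset_lod.push_back(offsets);
  }
  return offset_lod;
}

int main() {
  LoD offsets = {{0, 2, 5, 6}};                    // one level, three segments
  LoD lengths = ConvertToLengthBasedLoD(offsets);  // {{2, 3, 1}}
  LoD round_trip = ConvertToOffsetBasedLoD(lengths);
  std::cout << (round_trip == offsets ? "round trip ok" : "mismatch") << "\n";
}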
......@@ -147,7 +147,7 @@ TEST(LoD, SplitLoDTensor) {
lod1.push_back(std::vector<size_t>({0, 1, 2}));
lod1.push_back(std::vector<size_t>({0, 2, 7}));
auto lods = lod_tensor.SplitLoDTensor(places);
auto lods = SplitLoDTensor(lod_tensor, places);
EXPECT_EQ(lods[0].lod(), lod0);
EXPECT_EQ(lods[1].lod(), lod1);
}
......@@ -167,7 +167,7 @@ TEST(LoD, SplitLoDTensorWithZeroBatchSize) {
LoD lod_res;
lod_res.push_back(std::vector<size_t>({0}));
auto lods = lod_tensor.SplitLoDTensor(places);
auto lods = SplitLoDTensor(lod_tensor, places);
EXPECT_EQ(lods[0].lod(), lod_res);
EXPECT_EQ(lods[1].lod(), lod_res);
}
......@@ -213,7 +213,7 @@ TEST(LoD, MergeLoDTensor) {
std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1, &lod_tensor2};
LoDTensor lod_tensor;
lod_tensor.MergeLoDTensor(lods, place);
MergeLoDTensor(&lod_tensor, lods, place);
EXPECT_EQ(lod_tensor.lod(), lod);
}
......
......@@ -24,6 +24,10 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
......@@ -31,7 +35,6 @@ namespace framework {
* Simple, intuitive and effective. Only single thread is supported, and
* currently designed for inference.
*/
class Tensor;
class ProgramDesc;
class Scope;
......
......@@ -32,11 +32,10 @@ limitations under the License. */
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/common/scalar_array.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
......
......@@ -1048,7 +1048,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable")
<< " data (" << pair.first << "), dim:" << pair.second.dims()
<< ", place: " << pair.second.place();
auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
auto lod_tensors = SplitLoDTensor(pair.second, member_->places_);
bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
if (!is_persistable && num_places != lod_tensors.size() &&
!allow_partial_feed) {
......
......@@ -14,10 +14,13 @@ limitations under the License. */
#include <time.h>
#include "paddle/fluid/framework/device_worker.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Scope;
class Variable;
......
......@@ -30,8 +30,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
class SelectedRows {
/*
* @brief We can use the SelectedRows structure to reproduce a sparse table.
......
......@@ -18,105 +18,13 @@ limitations under the License. */
DECLARE_bool(use_stream_safe_cuda_allocator);
namespace paddle {
namespace framework {
Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0,
paddle::platform::errors::OutOfRange(
"The start row index must be greater than 0."
"But received the start index is d%.",
begin_idx));
PADDLE_ENFORCE_LE(end_idx, meta_.dims[0],
paddle::platform::errors::OutOfRange(
"The end row index is out of bound."));
PADDLE_ENFORCE_LT(
begin_idx, end_idx,
paddle::platform::errors::InvalidArgument(
"The start row index must be less than the end row index."
"But received the start index = %d, the end index = %d.",
begin_idx, end_idx));
if (meta_.dims[0] == 1) {
return *this;
} else {
size_t base = numel() / meta_.dims[0];
Tensor dst;
dst.storage_ = pten::make_intrusive<paddle::experimental::SharedStorage>(
storage_->data_shared());
dst.meta_.layout = meta_.layout;
dst.meta_.dtype = meta_.dtype;
DDim dst_dims = meta_.dims;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype());
return dst;
}
}
std::vector<Tensor> Tensor::Split(int64_t split_size, int64_t axis) const {
check_memory_size();
PADDLE_ENFORCE_GE(meta_.dims.size(), 0,
paddle::platform::errors::OutOfRange(
"split expects at least a 1-dimensional tensor"));
PADDLE_ENFORCE_GE(
split_size, 0,
paddle::platform::errors::OutOfRange(
"split expects split_size be non-negative, but got split_size is %d",
split_size));
int64_t numel_size = meta_.dims[axis];
int64_t num_splits = 1;
if (split_size != 0) {
num_splits =
std::max<int64_t>((numel_size + split_size - 1) / split_size, 1);
}
std::vector<Tensor> splits(num_splits);
int64_t last_split_size = split_size - (split_size * num_splits - numel_size);
for (int64_t i = 0; i < num_splits; ++i) {
int64_t length = i < num_splits - 1 ? split_size : last_split_size;
splits[i] = Slice(i * split_size, i * split_size + length);
}
return splits;
}
std::vector<Tensor> Tensor::Chunk(int64_t chunks, int64_t axis) const {
check_memory_size();
PADDLE_ENFORCE_GE(meta_.dims.size(), 0,
paddle::platform::errors::OutOfRange(
"split expects at least a 1-dimensional tensor"));
PADDLE_ENFORCE_GE(
chunks, 0,
paddle::platform::errors::OutOfRange(
"chunks expects to be greater than 0, but got chunks is %d", chunks));
int64_t numel_size = meta_.dims[axis];
int64_t split_size = (numel_size + chunks - 1) / chunks;
return Split(split_size, axis);
}
Tensor& Tensor::ShareDataWith(const Tensor& src) {
src.check_memory_size();
// Preserve LoD
auto lod = meta_.lod;
*this = src;
meta_.lod = lod;
return *this;
}
Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) {
PADDLE_ENFORCE_NOT_NULL(
inplace_version_counter_,
platform::errors::PreconditionNotMet(
"Tensor does not hold inplace_version_counter_."));
inplace_version_counter_ = src.inplace_version_counter_;
return *this;
}
namespace memory {
namespace allocation {
class Allocation;
} // namespace allocation
} // namespace memory
} // namespace paddle
} // namespace framework
namespace paddle {
namespace framework {} // namespace framework
} // namespace paddle
......@@ -69,35 +69,7 @@ using LoD = std::vector<paddle::framework::Vector<size_t>>;
Variable object but not a pointer.
*/
class Tensor : public pten::DenseTensor {
public:
using DenseTensor = pten::DenseTensor;
using DenseTensor::DenseTensor;
// Split Tensor and copy to each place specified in places.
std::vector<Tensor> SplitLoDTensor(
const std::vector<platform::Place> places) const;
void MergeLoDTensor(const std::vector<const Tensor*>& lod_tensors,
platform::Place place);
/*! The internal of two tensors share the same memory block. */
Tensor& ShareDataWith(const Tensor& src);
/*! The internal of two tensors share the same inplace version counter. */
Tensor& ShareInplaceVersionCounterWith(const Tensor& src);
Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
std::vector<Tensor> Split(int64_t split_size, int64_t axis) const;
std::vector<Tensor> Chunk(int64_t chunks, int64_t axis) const;
Tensor& Resize(const DDim& dims) {
meta_.dims = dims;
return *this;
}
};
using Tensor = pten::DenseTensor;
} // namespace framework
} // namespace paddle
......
......@@ -387,18 +387,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
TensorCopyImpl<Tensor>(src, dst_place, dst);
}
void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
pten::DenseTensor* dst) {
TensorCopyImpl<pten::DenseTensor>(src, dst_place, dst);
}
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
TensorCopyImpl<Tensor>(src, dst_place, ctx, dst);
}
void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, pten::DenseTensor* dst) {
TensorCopyImpl<pten::DenseTensor>(src, dst_place, ctx, dst);
}
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
......@@ -1394,45 +1386,50 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
return os;
}
std::ostream& operator<<(std::ostream& os, const Tensor& t) {
} // namespace framework
} // namespace paddle
namespace pten {
std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
if (t.lod().size() > 0) {
os << " - lod: " << t.lod() << "\n";
}
os << " - place: " << t.place() << "\n";
os << " - shape: [" << t.dims() << "]\n";
os << " - layout: " << DataLayoutToString(t.layout()) << "\n";
os << " - layout: " << paddle::framework::DataLayoutToString(t.layout())
<< "\n";
#ifdef PADDLE_WITH_MKLDNN
os << " - format: "
<< dnnl_fmt_tag2str(static_cast<dnnl_format_tag_t>(t.format())) << "\n";
#endif
Tensor tensor;
DenseTensor tensor;
tensor.Resize(t.dims());
if (platform::is_cpu_place(t.place())) {
if (paddle::platform::is_cpu_place(t.place())) {
tensor.ShareDataWith(t);
} else {
platform::CPUPlace place;
framework::TensorCopy(t, place, &tensor);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
paddle::platform::CPUPlace place;
paddle::framework::TensorCopy(t, place, &tensor);
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Get(t.place());
dev_ctx.Wait();
}
#define PrintTensorCallback(cpp_type, proto_type) \
do { \
if (tensor.type() == proto_type) { \
os << " - dtype: " << proto_type << "\n"; \
print_tensor<cpp_type>(os, tensor); \
return os; \
} \
#define PrintTensorCallback(cpp_type, proto_type) \
do { \
if (tensor.type() == proto_type) { \
os << " - dtype: " << proto_type << "\n"; \
paddle::framework::print_tensor<cpp_type>(os, tensor); \
return os; \
} \
} while (0)
_ForEachDataType_(PrintTensorCallback);
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
} // namespace framework
} // namespace paddle
}
......@@ -39,9 +39,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const Tensor& t);
class PrintOptions {
public:
static PrintOptions& Instance() {
......@@ -76,12 +73,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
// If ctx_place and src_place are the same, src_ctx.Wait() is added
// after memory::Copy; if ctx_place and dst_place are the same,
// src_ctx.Wait() is added before memory::Copy.
class Tensor;
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst);
void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, pten::DenseTensor* dst);
// NOTE(zcd): If the src.place() and dst_place are two different GPU,
// the copy operation is carried out on the dst_place's stream. This is
......@@ -92,8 +85,6 @@ void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
// not completed.
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
pten::DenseTensor* dst);
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
......@@ -469,5 +460,11 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) {
delete[] array;
}
std::ostream& operator<<(std::ostream& os, const LoD& lod);
} // namespace framework
} // namespace paddle
namespace pten {
std::ostream& operator<<(std::ostream& os, const DenseTensor& t);
}
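The stream operator moves in the opposite direction: operator<< for the tensor is now defined and declared in namespace pten (tensor_util hunks above) instead of paddle::framework. That is the flip side of the same lookup rule noted earlier: declaring the operator in the type's own namespace keeps unqualified os << t resolving through argument-dependent lookup. A toy illustration (not Paddle code):

// Why the operator is declared next to pten::DenseTensor.
#include <iostream>

namespace pten {
struct DenseTensor {};

// Living in the type's own namespace, ADL finds this from any caller.
std::ostream& operator<<(std::ostream& os, const DenseTensor&) {
  return os << "DenseTensor";
}
}  // namespace pten

int main() {
  pten::DenseTensor t;
  std::cout << t << "\n";  // unqualified streaming still works via ADL
}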
......@@ -40,7 +40,6 @@ namespace paddle {
namespace framework {
class Dataset;
class Tensor;
class ProgramDesc;
class PullDenseWorker;
class Scope;
......
......@@ -47,6 +47,10 @@
#include "xpu/bkcl.h"
#endif
namespace pten {
class DenseTensor;
} // namespace pten
// Users should add forward declarations here
namespace paddle {
......@@ -70,7 +74,6 @@ class BKCLCommunicator;
namespace framework {
class LoDRankTable;
class ScopeBase;
class Tensor;
class ReaderHolder;
class Scope;
class SelectedRows;
......
......@@ -29,9 +29,12 @@
DECLARE_bool(use_mkldnn);
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
namespace platform {
......
......@@ -16,6 +16,7 @@
#include <iostream>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -24,6 +25,7 @@
#include "paddle/fluid/imperative/parallel_context.h"
#include "paddle/pten/core/dense_tensor.h"
namespace paddle {
namespace imperative {
......@@ -975,7 +977,8 @@ void Reducer::ProcessUnusedDenseVars() {
auto *dest_grad_tensor =
grad_var_base_tmp->MutableVar()->GetMutable<framework::LoDTensor>();
const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor);
paddle::framework::TensorCopy(src_tensor, place_, *dev_ctx,
dest_grad_tensor);
dest_grad_tensor->Resize(dest_dims);
}
}
......
......@@ -90,12 +90,12 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
TensorCopySync(*t, cpu_place, &temp_tensor);
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocation the space on GPU
t->clear();
// Copy parameter data to newly allocated GPU space.
TensorCopySync(temp_tensor, place, t);
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
......
......@@ -35,7 +35,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
class Scope;
} // namespace framework
......
......@@ -21,9 +21,12 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Scope;
class SelectedRows;
} // namespace framework
......
......@@ -122,7 +122,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
}
#endif
std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result);
paddle::framework::TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector);
ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
}
......@@ -142,7 +142,7 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result);
paddle::framework::TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector);
ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
}
......
......@@ -82,10 +82,11 @@ class BatchNormOpConverter : public OpConverter {
platform::CPUPlace cpu_place;
// copy data from gpu to cpu
TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
TensorCopySync((*Variance_t), cpu_place, &variance_tensor);
paddle::framework::TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
paddle::framework::TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
paddle::framework::TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
paddle::framework::TensorCopySync((*Variance_t), cpu_place,
&variance_tensor);
auto* bias_data = bias_tensor.mutable_data<float>(platform::CPUPlace());
auto* mean_data = mean_tensor.mutable_data<float>(platform::CPUPlace());
......
......@@ -55,8 +55,8 @@ class LayerNormOpConverter : public OpConverter {
scale_tensor->Resize(Scale_t->dims());
platform::CPUPlace cpu_place;
TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor));
TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor));
paddle::framework::TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor));
paddle::framework::TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor));
auto* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
auto* scale_data = scale_tensor->mutable_data<float>(platform::CPUPlace());
......
......@@ -46,7 +46,8 @@ class PReluOpConverter : public OpConverter {
std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
new framework::LoDTensor());
alpha_tensor_temp->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
paddle::framework::TensorCopySync(*alpha_tensor, cpu_place,
alpha_tensor_temp.get());
float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
nvinfer1::ILayer* layer = nullptr;
......
......@@ -63,7 +63,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
*(temp_data + i) = random(0., 1.);
}
TensorCopySync(temp_tensor, place, tensor);
paddle::framework::TensorCopySync(temp_tensor, place, tensor);
}
/*
......
......@@ -370,7 +370,8 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
paddle::framework::TensorCopySync(*weight_tensor, cpu_place,
weight_map[name_with_suffix].get());
float *weight_data =
weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
name_suffix_counter += 1;
......
......@@ -35,12 +35,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/any.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
namespace tensorrt {
......
......@@ -41,12 +41,12 @@ class TensorRTEngineTest : public ::testing::Test {
void PrepareInputOutput(const std::vector<float> &input,
std::vector<int> output_shape) {
TensorFromVector(input, *ctx_, &input_);
paddle::framework::TensorFromVector(input, *ctx_, &input_);
output_.Resize(framework::make_ddim(output_shape));
}
void GetOutput(std::vector<float> *output) {
TensorToVector(output_, *ctx_, output);
paddle::framework::TensorToVector(output_, *ctx_, output);
}
protected:
......
......@@ -50,9 +50,9 @@ class AbsKernel<platform::CUDADeviceContext, T>
std::vector<const framework::Tensor*> ins = {x};
std::vector<framework::Tensor*> outs = {out};
auto functor = CudaAbsFunctor<T>();
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T,
math::Real<T>>(dev_ctx, ins, &outs,
functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, math::Real<T>>(dev_ctx, ins, &outs,
functor);
}
};
......
......@@ -1368,14 +1368,14 @@ class ELUGradCudaKernel : public framework::OpKernel<T> {
if (alpha > 0) {
CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
} else {
CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(x);
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
}
}
};
......@@ -1451,8 +1451,8 @@ class ActivationCudaKernel
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
}
};
......@@ -1481,17 +1481,17 @@ class ActivationGradCudaKernel
if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
// Only need forward output Out
ins.push_back(out);
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
} else if (static_cast<int>(Functor::FwdDeps()) ==
static_cast<int>(kDepX)) {
// Only need forward input X
ins.push_back(x);
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
} else {
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
}
}
};
......
......@@ -2696,8 +2696,8 @@ class PowKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
auto* factor_data = factor_tensor->data<float>();
framework::Tensor cpu_factor_tensor;
if (platform::is_gpu_place(factor_tensor->place())) {
TensorCopySync(*factor_tensor, platform::CPUPlace(),
&cpu_factor_tensor);
framework::TensorCopySync(*factor_tensor, platform::CPUPlace(),
&cpu_factor_tensor);
factor_data = cpu_factor_tensor.data<float>();
}
auto factor =
......@@ -2751,8 +2751,8 @@ class PowGradKernel
auto* factor_data = factor_tensor->data<float>();
framework::Tensor cpu_factor_tensor;
if (platform::is_gpu_place(factor_tensor->place())) {
TensorCopySync(*factor_tensor, platform::CPUPlace(),
&cpu_factor_tensor);
framework::TensorCopySync(*factor_tensor, platform::CPUPlace(),
&cpu_factor_tensor);
factor_data = cpu_factor_tensor.data<float>();
}
auto factor =
......
......@@ -50,7 +50,7 @@ void Update(const platform::NPUDeviceContext& ctx,
runner_p2.Run(stream);
std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
paddle::framework::TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
......@@ -61,7 +61,8 @@ void Update(const platform::NPUDeviceContext& ctx,
runner_p3.Run(stream);
std::vector<T> new_loss_scaling;
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
paddle::framework::TensorToVector(*updated_loss_scaling_tensor, ctx,
&new_loss_scaling);
float min_value = 1.0;
if (FLAGS_min_loss_scaling > 1) {
min_value = static_cast<float>(FLAGS_min_loss_scaling);
......@@ -98,7 +99,7 @@ void Update(const platform::NPUDeviceContext& ctx,
runner_p2.Run(stream);
std::vector<int> good_out_data;
TensorToVector(*good_out_tensor, ctx, &good_out_data);
paddle::framework::TensorToVector(*good_out_tensor, ctx, &good_out_data);
if (good_out_data[0] >= incr_every_n_steps) {
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
......@@ -109,7 +110,8 @@ void Update(const platform::NPUDeviceContext& ctx,
runner_p3.Run(stream);
std::vector<T> new_loss_scaling;
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
paddle::framework::TensorToVector(*updated_loss_scaling_tensor, ctx,
&new_loss_scaling);
if (!std::isfinite(new_loss_scaling[0])) {
// updated_loss_scaling_data = pre_loss_scaling_data
const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
......@@ -209,7 +211,8 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel<T> {
"FoundInfinite must has only one element."));
std::vector<bool> found_inf_vec;
TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec);
paddle::framework::TensorToVector(*found_inf, ctx.device_context(),
&found_inf_vec);
LazyZerosNPU<T>{}(dev_ctx, found_inf_vec, xs, outs);
const bool stop_update = ctx.Attr<bool>("stop_update");
......
......@@ -16,10 +16,13 @@
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/tensor_formatter.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class InferShapeContext;
class Tensor;
class OpDesc;
class Scope;
class Variable;
......
......@@ -25,9 +25,12 @@ class DeviceContext;
} // namespace platform
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
......@@ -76,7 +79,7 @@ class AssignFunctor {
framework::LoDTensor *out) const {
if (lod_tensor.numel() == 0) return;
auto &out_tensor = *out;
TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor);
paddle::framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor);
out_tensor.set_lod(lod_tensor.lod());
}
......
......@@ -47,7 +47,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
init.push_back(static_cast<T>(3.0));
init.push_back(static_cast<T>(4.0));
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({4});
ctx.Wait();
......@@ -62,7 +62,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
op->Run(*scope, place);
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
......
......@@ -382,7 +382,8 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
......
......@@ -86,7 +86,8 @@ class NPUBatchNormOpKernel : public framework::OpKernel<T> {
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
......
......@@ -87,7 +87,8 @@ class BatchNormXPUKernel : public framework::OpKernel<T> {
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_tensor->data<float>()[0];
}
......
......@@ -91,8 +91,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel<T> {
std::vector<framework::Tensor*> outs = {dx};
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto functor = BCELossGradFunctor<T>();
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kTernary, T, T>(
dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor);
}
};
......
......@@ -308,7 +308,7 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
}
if (!platform::is_cpu_place(place_)) {
TensorCopySync(cpu_tensor, place_, tensor);
paddle::framework::TensorCopySync(cpu_tensor, place_, tensor);
}
}
......
......@@ -77,8 +77,10 @@ void BincountCUDAInner(const framework::ExecutionContext& context) {
input_min_scala.device(*place) = input_x.minimum();
Tensor input_min_cpu, input_max_cpu;
TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu);
TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu);
paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(),
&input_max_cpu);
paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(),
&input_min_cpu);
InputT input_min = input_min_cpu.data<InputT>()[0];
......
......@@ -100,7 +100,7 @@ void CopyInputDataToPlace(const framework::Scope& scope,
for (const auto& var_name : scope.LocalVarNames()) {
const auto& src_tensor = scope.GetVar(var_name)->Get<LoDTensor>();
auto* dst_tensor = dst_scope->Var(var_name)->GetMutable<LoDTensor>();
TensorCopySync(src_tensor, dst_place, dst_tensor);
paddle::framework::TensorCopySync(src_tensor, dst_place, dst_tensor);
}
}
......@@ -135,10 +135,12 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
elementwise_add_op->Run(scope, run_place);
LoDTensor test_out, expected_out;
TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
platform::CPUPlace(), &test_out);
TensorCopySync(scope.Var(expected_out_name)->Get<LoDTensor>(),
platform::CPUPlace(), &expected_out);
paddle::framework::TensorCopySync(
scope.Var(test_out_name)->Get<LoDTensor>(), platform::CPUPlace(),
&test_out);
paddle::framework::TensorCopySync(
scope.Var(expected_out_name)->Get<LoDTensor>(), platform::CPUPlace(),
&expected_out);
ASSERT_TRUE(test_out.IsInitialized());
ASSERT_TRUE(expected_out.IsInitialized());
......
......@@ -64,7 +64,8 @@ class ClipKernel : public framework::OpKernel<T> {
auto* max_t = context.Input<Tensor>("Max");
auto* max_data = max_t->data<T>();
if (platform::is_gpu_place(max_t->place())) {
TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu);
paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(),
&max_cpu);
max_data = max_cpu.data<T>();
}
max = max_data[0];
......@@ -77,7 +78,8 @@ class ClipKernel : public framework::OpKernel<T> {
auto* min_t = context.Input<Tensor>("Min");
auto* min_data = min_t->data<T>();
if (platform::is_gpu_place(min_t->place())) {
TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu);
paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(),
&min_cpu);
min_data = min_cpu.data<T>();
}
min = min_data[0];
......@@ -101,7 +103,8 @@ class ClipKernel : public framework::OpKernel<T> {
std::vector<const framework::Tensor*> ins = {x};
std::vector<framework::Tensor*> outs = {out};
auto functor = ClipFunctor<T>(min, max);
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(
context.template device_context<platform::CUDADeviceContext>(), ins,
&outs, functor);
#endif
......@@ -141,7 +144,8 @@ class ClipGradKernel : public framework::OpKernel<T> {
auto* max_t = context.Input<Tensor>("Max");
auto* max_data = max_t->data<T>();
if (platform::is_gpu_place(max_t->place())) {
TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu);
paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(),
&max_cpu);
max_data = max_cpu.data<T>();
}
max = max_data[0];
......@@ -154,7 +158,8 @@ class ClipGradKernel : public framework::OpKernel<T> {
auto* min_t = context.Input<Tensor>("Min");
auto* min_data = min_t->data<T>();
if (platform::is_gpu_place(min_t->place())) {
TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu);
paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(),
&min_cpu);
min_data = min_cpu.data<T>();
}
min = min_data[0];
......
......@@ -36,7 +36,8 @@ class ClipXPUKernel : public framework::OpKernel<T> {
auto* max_t = ctx.Input<Tensor>("Max");
auto* max_data = max_t->data<T>();
if (platform::is_xpu_place(max_t->place())) {
TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu);
paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(),
&max_cpu);
max_data = max_cpu.data<T>();
}
max = max_data[0];
......@@ -48,7 +49,8 @@ class ClipXPUKernel : public framework::OpKernel<T> {
auto* min_t = ctx.Input<Tensor>("Min");
auto* min_data = min_t->data<T>();
if (platform::is_xpu_place(min_t->place())) {
TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu);
paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(),
&min_cpu);
min_data = min_cpu.data<T>();
}
min = min_data[0];
......
......@@ -139,7 +139,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
......@@ -165,7 +165,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
......
......@@ -139,7 +139,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
......@@ -164,7 +164,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
......
......@@ -144,7 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
TensorToVector(mean, dev_ctx, &vec);
paddle::framework::TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
return true;
......
......@@ -146,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
auto place = ctx.GetPlace();
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
......@@ -170,7 +170,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
ctx.Wait();
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
......
......@@ -133,7 +133,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
......@@ -159,7 +159,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
......
......@@ -71,8 +71,8 @@ void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx,
#if (CANN_VERSION_CODE >= 503003)
Tensor factor_tensor(ids_t.type());
factor_tensor.mutable_data<T>({1}, context.GetPlace());
TensorFromVector(std::vector<T>{static_cast<T>(start_idx)},
context.device_context(), &factor_tensor);
paddle::framework::TensorFromVector(std::vector<T>{static_cast<T>(start_idx)},
context.device_context(), &factor_tensor);
sub_runner.SetType("Sub")
.AddInput(ids_t)
.AddInput(factor_tensor)
......
......@@ -48,7 +48,7 @@ class CIdentityOpKernel : public framework::OpKernel<T> {
"The ring_id (%d) for c_identity op must be non-negative.", rid));
out->mutable_data<T>(ctx.GetPlace());
TensorCopy(*x, out->place(), out);
paddle::framework::TensorCopy(*x, out->place(), out);
}
};
......
......@@ -137,7 +137,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
auto place = ctx.GetPlace();
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
......@@ -161,7 +161,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
......
......@@ -137,7 +137,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
......@@ -166,7 +166,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
......
......@@ -56,9 +56,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
init_y.push_back(static_cast<T>(2.0));
}
TensorFromVector(init_x, ctx, tensor_x);
paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({10, 10});
TensorFromVector(init_y, ctx, tensor_y);
paddle::framework::TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});
f::AttributeMap attrs;
......@@ -85,7 +85,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
sync_op->Run(*scope, place);
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
// sync op copy
auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}},
......
......@@ -136,7 +136,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
std::cout << std::endl;
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
......@@ -169,7 +169,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
// ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
......
......@@ -64,7 +64,7 @@ bool Check(T value, int size = 2 * 512 * 8192) {
init.push_back(static_cast<T>(value));
}
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
return result;
}
......
......@@ -145,7 +145,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
VLOG(3) << "Run op recv_v2";
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
EXPECT_EQ(out_vec == init, true);
......
......@@ -119,7 +119,7 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
int rank_id = atoi(getenv("RANK_ID"));
VLOG(3) << "rank id:" << rank_id;
TensorFromVector(init, ctx, tensor_x);
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
auto place = ctx.GetPlace();
......
......@@ -35,8 +35,9 @@ class BinaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
std::vector<framework::Tensor*> outs = {out};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, -1, functor);
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
T>(cuda_ctx, ins, &outs, -1,
functor);
}
};
......@@ -56,8 +57,8 @@ class UnaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
std::vector<framework::Tensor*> outs = {out};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
cuda_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor);
}
};
......
......@@ -55,8 +55,8 @@ class CompareReduceOpKernel
context.template device_context<platform::CUDADeviceContext>();
std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {&tmp};
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, bool>(
cuda_ctx, ins, &outs, Functor());
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor());
// Reduce by 'bitwise and' operator
std::vector<int> reduce_dims;
......
......@@ -35,7 +35,8 @@ class CompareOpKernel<platform::CUDADeviceContext, Functor, InverseFunctor>
ctx.template device_context<platform::CUDADeviceContext>();
int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, InT, OutT>(
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
}
};
......
......@@ -39,12 +39,13 @@ static void DataCopy(const framework::LoDTensor &src_item,
: paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(),
src_item, &out, platform::CPUPlace());
TensorCopySync(out, platform::CPUPlace(), dst_item);
paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item);
} else {
TensorCopySync(src_item, platform::CPUPlace(), dst_item);
paddle::framework::TensorCopySync(src_item, platform::CPUPlace(),
dst_item);
}
#else
TensorCopySync(src_item, platform::CPUPlace(), dst_item);
paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
// Not copy, if the src tensor is empty.
......
......@@ -50,12 +50,13 @@ static void DeepCopy(const framework::LoDTensor &src_item,
: paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(),
src_item, &out, platform::CPUPlace());
TensorCopySync(out, platform::CPUPlace(), dst_item);
paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item);
} else {
TensorCopySync(src_item, platform::CPUPlace(), dst_item);
paddle::framework::TensorCopySync(src_item, platform::CPUPlace(),
dst_item);
}
#else
TensorCopySync(src_item, platform::CPUPlace(), dst_item);
paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
// Not copy, if the src tensor is empty.
......
......@@ -34,10 +34,12 @@ class BinaryLogicalOpKernel<platform::CUDADeviceContext, Functor>
int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
if (ins.size() == 1) {
LaunchElementwiseCudaKernel<ElementwiseType::kUnary, InT, OutT>(
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kUnary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
} else {
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, InT, OutT>(
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
}
}
......
......@@ -55,7 +55,7 @@ class WriteToArrayOp : public ArrayOp {
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
TensorCopy(x_tensor, place, dev_ctx, out_tensor);
paddle::framework::TensorCopy(x_tensor, place, dev_ctx, out_tensor);
} else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
......
......@@ -22,9 +22,12 @@
#include "paddle/fluid/operators/controlflow/op_variant.h"
#include "paddle/fluid/platform/variant.h"
namespace pten {
class DenseTensor;
} // namespace pten
namespace paddle {
namespace framework {
class Tensor;
class ProgramDesc;
} // namespace framework
} // namespace paddle
......
(28 more file diffs in this commit are collapsed and not shown here.)