未验证 提交 48f41a7f 编写于 作者: G guofei 提交者: GitHub

Support LoDTensorArray in fetch (#23645) (#23968)

cherry-pick #23645
上级 5bcf1632
......@@ -197,13 +197,27 @@ FetchResultType AsyncSSAGraphExecutor::Run(
HandleException();
FeedFetchList ret;
auto &val = boost::get<FeedFetchList>(fetch_data);
FetchList ret;
auto &val = boost::get<FetchList>(fetch_data);
for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&val.at(fetch_idx));
ret.emplace_back();
ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
if (data_is_lod_tensor(val.at(fetch_idx))) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&(boost::get<LoDTensor>(val.at(fetch_idx))));
LoDTensor var;
var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
ret.emplace_back(var);
} else {
auto array = boost::get<LoDTensorArray>(val.at(fetch_idx));
LoDTensorArray item_array;
item_array.reserve(array.size());
for (size_t i = 0; i < array.size(); ++i) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&array[i]);
item_array.emplace_back();
item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
}
ret.emplace_back(item_array);
}
}
return ret;
}
......
......@@ -63,7 +63,7 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
FetchResultType fetches;
if (return_merged) {
fetches = FeedFetchList(fetch_tensors.size());
fetches = FetchList(fetch_tensors.size());
} else {
fetches = FetchUnmergedList(fetch_tensors.size());
}
......
......@@ -39,51 +39,98 @@ void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
}
void FetchOpHandle::WaitAndMergeCPUTensors() const {
// Checks that two fetched shapes are compatible with each other.
//
// tensor_dims: shape used as the reference.
// ele_dims:    shape compared against the reference.
// offset:      index of the fetched variable; only used in the error text.
//
// Raises a Fatal error through PADDLE_ENFORCE_EQ when the two shapes have a
// different number of dimensions, or when any axis other than axis 0 has a
// different extent.
static void CheckDims(const framework::DDim &tensor_dims,
                      const framework::DDim &ele_dims, const size_t offset) {
  // The ranks have to agree before the axes can be compared one by one.
  PADDLE_ENFORCE_EQ(
      tensor_dims.size(), ele_dims.size(),
      platform::errors::Fatal("The dimension sizes of fetched Tensors or "
                              "the items of fetched LoDTensorArray are "
                              "different from each other on different "
                              "devices. And the error is caused by the %zu "
                              "(th) fetched variable. Please set the "
                              "parameter `return_merged = False` when you "
                              "call the `Executor.run()` method.",
                              offset));

  // Axis 0 is deliberately skipped: the comparison starts at axis 1.
  // NOTE(review): presumably because per-device results are merged along
  // axis 0 and may legitimately differ there -- confirm against
  // MergeLoDTensor.
  const auto rank = tensor_dims.size();
  for (int axis = 1; axis < rank; ++axis) {
    PADDLE_ENFORCE_EQ(
        tensor_dims[axis], ele_dims[axis],
        platform::errors::Fatal("The dimensions of fetched Tensors or "
                                "the items of fetched LoDTensorArray are "
                                "different from each other on different "
                                "devices. And the error is caused by the "
                                "%zu (th) fetched variable. Please set the "
                                "parameter `return_merged = False` when "
                                "you call the `Executor.run()` method.",
                                offset));
  }
}
void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
if (return_merged_) {
const auto &tensor_dims = tensors_[0].dims();
for (size_t i = 1; i < tensors_.size(); i++) {
const auto &ele_dims = tensors_[i].dims();
PADDLE_ENFORCE_EQ(
tensor_dims.size(), ele_dims.size(),
platform::errors::Fatal("The dimension sizes of fetched Tensors are "
"different from each other on different "
"devices. And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
offset_));
for (int j = 1; j < tensor_dims.size(); j++) {
PADDLE_ENFORCE_EQ(
tensor_dims[j], ele_dims[j],
platform::errors::Fatal("The dimensions of fetched Tensors are "
"different from each other on different "
"devices. And the error is caused by the "
"%zu (th) fetched variable. Please set the "
"parameter `return_merged = False` when "
"you call the `Executor.run()` method.",
offset_));
if (data_is_lod_tensor(tensors_[0])) {
const auto &tensor_dims = boost::get<LoDTensor>(tensors_[0]).dims();
for (size_t i = 1; i < tensors_.size(); i++) {
const auto &ele_dims = boost::get<LoDTensor>(tensors_[i]).dims();
CheckDims(tensor_dims, ele_dims, offset_);
}
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
for (auto &t : tensors_) {
tensors_ptr.emplace_back(&boost::get<LoDTensor>(t));
}
auto &val = boost::get<FetchList>(*data_);
LoDTensor var;
var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
val.at(offset_) = std::move(var);
} else {
auto &array = boost::get<LoDTensorArray>(tensors_[0]);
LoDTensorArray tmp_array;
tmp_array.reserve(array.size());
for (size_t i = 0; i < array.size(); ++i) {
const auto &tensor_dims = array[i].dims();
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
tensors_ptr.push_back(&array[i]);
for (size_t j = 1; j < tensors_.size(); ++j) {
auto &element = boost::get<LoDTensorArray>(tensors_[j]);
const auto &ele_dims = element[i].dims();
CheckDims(tensor_dims, ele_dims, offset_);
tensors_ptr.push_back(&element[i]);
}
tmp_array.emplace_back();
tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
}
auto &val = boost::get<FetchList>(*data_);
val.at(offset_) = std::move(tmp_array);
}
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
for (auto &t : tensors_) {
tensors_ptr.emplace_back(&t);
}
auto &val = boost::get<FeedFetchList>(*data_);
val.at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
} else {
auto &val = boost::get<FetchUnmergedList>(*data_);
val.at(offset_) = std::move(tensors_);
}
}
// Fills `dst_item` from `src_item` and copies the source LoD onto it.
//
// - Source uninitialized or with no elements: `dst_item` is cleared and
//   resized to {0}.
// - Source not on a GPU place: `dst_item` shares the source's data.
// - Source on a GPU place: the data is copied to CPUPlace, but only when
//   compiled with PADDLE_WITH_CUDA; otherwise the data of `dst_item` is left
//   as it was.
//
// In every case the LoD of `src_item` is set on `dst_item` at the end.
static void TransData(const framework::LoDTensor &src_item,
                      framework::LoDTensor *dst_item) {
  const bool has_data = src_item.IsInitialized() && src_item.numel() > 0;

  if (!has_data) {
    // Nothing to transfer: leave an empty tensor behind.
    dst_item->clear();
    dst_item->Resize({0});
  } else if (!platform::is_gpu_place(src_item.place())) {
    dst_item->ShareDataWith(src_item);
  } else {
#ifdef PADDLE_WITH_CUDA
    TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
  }

  dst_item->set_lod(src_item.lod());
}
void FetchOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated(platform::CPUPlace());
tensors_.resize(inputs_.size());
platform::CPUPlace cpu;
auto &scopes = *local_exec_scopes_;
for (size_t i = 0; i < inputs_.size(); ++i) {
......@@ -93,23 +140,21 @@ void FetchOpHandle::RunImpl() {
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_handle->name());
auto &t = var->Get<framework::LoDTensor>();
if (t.IsInitialized() && t.numel() > 0) {
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, &tensors_[i]);
#endif
} else {
tensors_[i].ShareDataWith(t);
}
if (var->IsType<LoDTensor>()) {
auto &t = var->Get<framework::LoDTensor>();
auto &item = boost::get<LoDTensor>(tensors_[i]);
TransData(t, &item);
} else {
tensors_[i].clear();
tensors_[i].Resize({0});
auto &t = var->Get<framework::LoDTensorArray>();
LoDTensorArray tmp(t.size());
tensors_[i] = tmp;
auto &item = boost::get<LoDTensorArray>(tensors_[i]);
for (size_t j = 0; j < t.size(); ++j) {
TransData(t[j], &item[j]);
}
}
tensors_[i].set_lod(t.lod());
}
this->WaitAndMergeCPUTensors();
this->WaitAndMergeCPUFetchVars();
}
void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
......
......@@ -36,7 +36,7 @@ struct FetchOpHandle : public OpHandleBase {
void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
void WaitAndMergeCPUTensors() const;
void WaitAndMergeCPUFetchVars() const;
std::string Name() const override;
......@@ -54,7 +54,7 @@ struct FetchOpHandle : public OpHandleBase {
size_t offset_;
std::vector<Scope *> *local_scopes_;
std::vector<Scope *> *local_exec_scopes_;
std::vector<LoDTensor> tensors_;
std::vector<FetchType> tensors_;
bool return_merged_;
};
......
......@@ -179,7 +179,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
}
if (return_merged) {
return FeedFetchList();
return FetchList();
} else {
return FetchUnmergedList();
}
......@@ -245,22 +245,43 @@ FetchResultType ParallelSSAGraphExecutor::Run(
}
if (return_merged) {
FeedFetchList ret;
FetchList ret;
ret.reserve(fetch_tensors.size());
for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.reserve(place_num);
std::vector<const LoDTensorArray *> lodtensorarray_ptrs;
lodtensorarray_ptrs.reserve(place_num);
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
if (!is_valid[scope_idx]) {
continue;
}
const auto &fetch_list =
boost::get<FeedFetchList>(fetch_data[scope_idx]);
lodtensor_ptrs.push_back(&fetch_list[fetch_idx]);
const auto &fetch_list = boost::get<FetchList>(fetch_data[scope_idx]);
if (data_is_lod_tensor(fetch_list[fetch_idx])) {
lodtensor_ptrs.push_back(
&(boost::get<LoDTensor>(fetch_list[fetch_idx])));
} else {
lodtensorarray_ptrs.push_back(
&(boost::get<LoDTensorArray>(fetch_list[fetch_idx])));
}
}
if (lodtensor_ptrs.size() != 0) {
LoDTensor var;
var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
ret.emplace_back(var);
} else {
LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) {
LoDTensor var;
std::vector<const LoDTensor *> ptrs;
for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
}
var.MergeLoDTensor(ptrs, platform::CPUPlace());
var_array[i] = std::move(var);
}
ret.emplace_back(var_array);
}
ret.emplace_back();
ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
}
return ret;
} else {
......@@ -277,8 +298,8 @@ FetchResultType ParallelSSAGraphExecutor::Run(
boost::get<FetchUnmergedList>(fetch_data[scope_idx]);
PADDLE_ENFORCE_EQ(
fetch_list[fetch_idx].size(), 1,
platform::errors::Fatal(
"Each place must have only one fetched LoDTensor!"));
platform::errors::Fatal("Each place must have only one fetched "
"LoDTensor/LoDTensorArray!"));
ret.back().emplace_back(fetch_list[fetch_idx][0]);
}
}
......
......@@ -72,7 +72,7 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
std::unordered_set<VarHandleBase *> fetch_dependencies;
FetchResultType fetch_data;
if (return_merged) {
fetch_data = FeedFetchList(fetch_tensors.size());
fetch_data = FetchList(fetch_tensors.size());
} else {
fetch_data = FetchUnmergedList(fetch_tensors.size());
}
......
......@@ -256,7 +256,7 @@ static bool has_feed_operators(
// Return true if the block has fetch operators and holder of matching info.
static bool has_fetch_operators(
const BlockDesc& block,
const std::map<std::string, LoDTensor*>& fetch_targets,
const std::map<std::string, FetchType*>& fetch_targets,
const std::string& fetch_holder_name) {
size_t fetch_count = 0;
for (auto* op : block.AllOps()) {
......@@ -306,7 +306,7 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
std::map<std::string, FetchType*>* fetch_targets,
bool create_local_scope, bool create_vars,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
......@@ -504,7 +504,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
void Executor::RunPreparedContext(
ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
std::map<std::string, FetchType*>* fetch_targets, bool create_local_scope,
bool create_vars, const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
auto& global_block = ctx->prog_.Block(ctx->block_id_);
......
......@@ -87,7 +87,7 @@ class Executor {
// This API is very slow.
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
std::map<std::string, FetchType*>* fetch_targets,
bool create_local_scope = true, bool create_vars = true,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
......@@ -95,7 +95,7 @@ class Executor {
// This API is very slow.
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
std::map<std::string, FetchType*>* fetch_targets,
bool create_local_scope = true,
bool create_vars = true,
const std::string& feed_holder_name = "feed",
......
......@@ -29,7 +29,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
......@@ -39,27 +39,35 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
feed_inputs[index].set_lod(input.lod());
}
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index) {
// Since we want to fetch LodTensor from a variable, the variable must
// Since we want to fetch FetchType from a variable, the variable must
// be created already.
Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
PADDLE_ENFORCE_NOT_NULL(g_fetch_value,
platform::errors::NotFound(
"Variable %s is not found in scope.", var_name));
PADDLE_ENFORCE_EQ(g_fetch_value->IsType<FetchList>(), true,
platform::errors::InvalidArgument(
"Only %s can be invoked by GetFetchVariable",
typeid(FetchList).name()));
auto& fetch_outputs = *g_fetch_value->GetMutable<FetchList>();
auto& tensor = fetch_outputs[index];
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
VLOG(3) << "Fetch " << var_name << " with index " << index;
PADDLE_ENFORCE_LT(index, fetch_outputs.size(),
platform::errors::InvalidArgument(
"index must less than fetch_outputs size."));
return tensor;
}
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
Variable* var = scope.FindVar(var_name);
PADDLE_ENFORCE(var, "%s no in scope", var_name);
PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
platform::errors::InvalidArgument(
"Only support lod tensor in GetVariableTensor now."));
return *var->GetMutable<LoDTensor>();
}
......
......@@ -24,7 +24,7 @@ namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
......
......@@ -15,14 +15,33 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>;
using FetchUnmergedList = std::vector<std::vector<FeedFetchType>>;
using FetchResultType = boost::variant<FeedFetchList, FetchUnmergedList>;
using FeedType = LoDTensor;
using FeedList = std::vector<FeedType>;
using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
using FetchList = std::vector<FetchType>;
using FetchUnmergedList = std::vector<std::vector<FetchType>>;
using FetchResultType = boost::variant<FetchList, FetchUnmergedList>;
// Returns true when the variant `data` currently holds a LoDTensor,
// false otherwise.
inline bool data_is_lod_tensor(const FetchType &data) {
  return data.type() == typeid(LoDTensor);
}
// Returns true when the variant `data` currently holds a LoDTensorArray,
// false otherwise.
inline bool data_is_lod_tensor_array(const FetchType &data) {
  return data.type() == typeid(LoDTensorArray);
}
static const char kFeedOpType[] = "feed";
static const char kFetchOpType[] = "fetch";
......
......@@ -20,7 +20,6 @@ namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
using LoDTensor2DArray = std::vector<std::vector<LoDTensor>>;
} // namespace framework
} // namespace paddle
......@@ -36,6 +36,7 @@ inline proto::VarType::Type ToVarType(int type) {
case proto::VarType::SELECTED_ROWS:
case proto::VarType::LOD_RANK_TABLE:
case proto::VarType::LOD_TENSOR_ARRAY:
case proto::VarType::FETCH_LIST:
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
......@@ -61,6 +62,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
case proto::VarType::READER:
visitor(var.Get<ReaderHolder>());
return;
case proto::VarType::FETCH_LIST:
visitor(var.Get<FetchList>());
return;
default:
PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
}
......
......@@ -19,6 +19,7 @@
#include <tuple>
#include <typeindex>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h"
......@@ -139,7 +140,7 @@ struct VarTypeRegistryImpl {
using VarTypeRegistry = detail::VarTypeRegistryImpl<
Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder,
operators::reader::LoDTensorBlockingQueueHolder, FetchList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_NCCL)
......@@ -178,6 +179,7 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
......
......@@ -34,9 +34,9 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
var->GetMutable<FeedList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
var->GetMutable<FetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
......
......@@ -383,8 +383,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
for (size_t i = 0; i < fetches_.size(); ++i) {
int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::FetchType &fetch_var =
framework::GetFetchVariable(*scope, "fetch", idx);
auto &fetch = boost::get<framework::LoDTensor>(fetch_var);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetches_[idx]->Input("X")[0];
......@@ -583,9 +584,9 @@ void AnalysisPredictor::PrepareFeedFetch() {
void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
PADDLE_ENFORCE_NOT_NULL(scope);
auto *var = scope->Var("feed");
var->GetMutable<framework::FeedFetchList>();
var->GetMutable<framework::FeedList>();
var = scope->Var("fetch");
var->GetMutable<framework::FeedFetchList>();
var->GetMutable<framework::FetchList>();
}
std::vector<std::string> AnalysisPredictor::GetInputNames() {
......
......@@ -286,8 +286,9 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::FetchType &fetch_var =
framework::GetFetchVariable(*scope, "fetch", idx);
auto fetch = boost::get<framework::LoDTensor>(fetch_var);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
......
......@@ -102,14 +102,15 @@ void MainWord2Vec(bool use_gpu) {
cpu_feeds.push_back(&third_word);
cpu_feeds.push_back(&fourth_word);
framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
float* lod_data = output1.data<float>();
for (int i = 0; i < output1.numel(); ++i) {
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
float* lod_data = output1_tensor.data<float>();
for (int i = 0; i < output1_tensor.numel(); ++i) {
EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
}
......@@ -137,8 +138,8 @@ void MainImageClassification(bool use_gpu) {
std::vector<framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
framework::LoDTensor output1;
std::vector<framework::LoDTensor*> cpu_fetchs1;
framework::FetchType output1;
std::vector<framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace, false, true>(
......@@ -153,7 +154,8 @@ void MainImageClassification(bool use_gpu) {
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>();
float* lod_data =
boost::get<paddle::framework::LoDTensor>(output1).data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
}
......@@ -168,7 +170,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
constexpr int num_jobs = 3;
std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
std::vector<framework::LoDTensor> refs(num_jobs);
std::vector<framework::FetchType> refs(num_jobs);
for (size_t i = 0; i < jobs.size(); ++i) {
// each job has 4 words
jobs[i].resize(4);
......@@ -181,7 +183,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
// get reference result of each job
std::vector<paddle::framework::LoDTensor*> ref_feeds;
std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
std::vector<paddle::framework::FetchType*> ref_fetches(1, &refs[i]);
for (auto& word : jobs[i]) {
ref_feeds.push_back(&word);
}
......@@ -207,9 +209,10 @@ void MainThreadsWord2Vec(bool use_gpu) {
}
// check outputs correctness
float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < refs[tid].numel(); ++i) {
auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
float* ref_data = ref_tensor.data<float>();
EXPECT_EQ(ref_tensor.numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < ref_tensor.numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 2e-3);
}
});
......@@ -230,7 +233,7 @@ void MainThreadsImageClassification(bool use_gpu) {
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs);
std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
std::vector<framework::LoDTensor> refs(num_jobs);
std::vector<framework::FetchType> refs(num_jobs);
for (size_t i = 0; i < jobs.size(); ++i) {
// prepare inputs
std::vector<std::vector<int64_t>> feed_target_shapes =
......@@ -242,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
// get reference result of each job
std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
std::vector<framework::FetchType*> ref_fetches(1, &refs[i]);
TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
}
......@@ -259,9 +262,10 @@ void MainThreadsImageClassification(bool use_gpu) {
ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data<float>();
EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) {
auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
float* ref_data = ref_tensor.data<float>();
EXPECT_EQ((size_t)ref_tensor.numel(), len / sizeof(float));
for (int i = 0; i < ref_tensor.numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
}
});
......
......@@ -40,10 +40,10 @@ TEST(inference, fit_a_line) {
cpu_feeds[i].push_back(input);
}
std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs1;
cpu_fetchs1.resize(num_threads);
for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
auto* output = new paddle::framework::FetchType();
cpu_fetchs1[i].push_back(output);
}
......@@ -58,10 +58,10 @@ TEST(inference, fit_a_line) {
}
#ifdef PADDLE_WITH_CUDA
std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs2;
cpu_fetchs2.resize(num_threads);
for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
auto* output = new paddle::framework::FetchType();
cpu_fetchs2[i].push_back(output);
}
......@@ -76,7 +76,9 @@ TEST(inference, fit_a_line) {
}
for (int i = 0; i < num_threads; ++i) {
CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
CheckError<float>(
boost::get<paddle::framework::LoDTensor>(*cpu_fetchs1[i][0]),
boost::get<paddle::framework::LoDTensor>(*cpu_fetchs2[i][0]));
delete cpu_fetchs2[i][0];
}
#endif
......
......@@ -50,9 +50,9 @@ TEST(inference, image_classification) {
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
paddle::framework::FetchType output1;
if (!FLAGS_skip_cpu) {
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
......@@ -60,12 +60,12 @@ TEST(inference, image_classification) {
LOG(INFO) << "Batch size is " << FLAGS_batch_size;
TestInference<paddle::platform::CPUPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output1).dims();
}
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
......@@ -73,17 +73,18 @@ TEST(inference, image_classification) {
LOG(INFO) << "Batch size is " << FLAGS_batch_size;
TestInference<paddle::platform::CUDAPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output2).dims();
if (!FLAGS_skip_cpu) {
CheckError<float>(output1, output2);
CheckError<float>(boost::get<paddle::framework::LoDTensor>(output1),
boost::get<paddle::framework::LoDTensor>(output2));
}
// float16 inference requires cuda GPUs with >= 5.3 compute capability
if (!FLAGS_fp16_dirname.empty() &&
paddle::platform::GetCUDAComputeCapability(0) >= 53) {
paddle::framework::LoDTensor output3;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
paddle::framework::FetchType output3;
std::vector<paddle::framework::FetchType*> cpu_fetchs3;
cpu_fetchs3.push_back(&output3);
LOG(INFO) << "--- GPU Runs in float16 mode: ---";
......@@ -92,7 +93,8 @@ TEST(inference, image_classification) {
TestInference<paddle::platform::CUDAPlace, false, true>(
FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
CheckError<float>(output2, output3);
CheckError<float>(boost::get<paddle::framework::LoDTensor>(output2),
boost::get<paddle::framework::LoDTensor>(output3));
}
#endif
}
......@@ -63,25 +63,27 @@ TEST(inference, label_semantic_roles) {
cpu_feeds.push_back(&ctx_p2);
cpu_feeds.push_back(&mark);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.lod();
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.lod();
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
......@@ -118,8 +118,8 @@ void ThreadRunInfer(
inference_program->GetFetchTargetNames();
PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor outtensor;
std::map<std::string, paddle::framework::FetchType*> fetch_targets;
paddle::framework::FetchType outtensor;
fetch_targets[fetch_target_names[0]] = &outtensor;
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
......@@ -150,7 +150,8 @@ void ThreadRunInfer(
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
paddle::framework::GetFetchVariable(*scope, "fetch", idx);
boost::get<paddle::framework::LoDTensor>(
paddle::framework::GetFetchVariable(*scope, "fetch", idx));
}
}
......@@ -215,8 +216,8 @@ TEST(inference, nlp) {
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor outtensor;
std::map<std::string, paddle::framework::FetchType*> fetch_targets;
paddle::framework::FetchType outtensor;
fetch_targets[fetch_target_names[0]] = &outtensor;
// prepare feed
......
......@@ -41,28 +41,30 @@ TEST(inference, recognize_digits) {
cpu_feeds.push_back(&input);
for (auto is_combined : {false, true}) {
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
}
......@@ -65,23 +65,25 @@ TEST(inference, recommender_system) {
cpu_feeds.push_back(&category_id);
cpu_feeds.push_back(&movie_title);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
......@@ -41,25 +41,27 @@ TEST(inference, rnn_encoder_decoder) {
cpu_feeds.push_back(&word_data);
cpu_feeds.push_back(&trg_word);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.lod();
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.lod();
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
......@@ -39,25 +39,27 @@ TEST(inference, understand_sentiment) {
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&words);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.lod();
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.lod();
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
......@@ -44,25 +44,27 @@ TEST(inference, word2vec) {
cpu_feeds.push_back(&third_word);
cpu_feeds.push_back(&fourth_word);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.lod();
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.lod();
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <random>
#include <string>
#include <vector>
......@@ -142,7 +143,7 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const std::vector<paddle::framework::FetchType*>& cpu_fetchs,
const int repeat = 1, const bool is_combined = false) {
// 1. Define place, executor, scope
auto place = Place();
......@@ -194,7 +195,7 @@ void TestInference(const std::string& dirname,
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
std::map<std::string, paddle::framework::FetchType*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
......
......@@ -58,7 +58,7 @@ class FeedOp : public framework::OperatorBase {
VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
<< " column to variable " << out_name;
auto &feed_list = feed_var->Get<framework::FeedFetchList>();
auto &feed_list = feed_var->Get<framework::FeedList>();
PADDLE_ENFORCE_LT(
static_cast<size_t>(col), feed_list.size(),
platform::errors::InvalidArgument(
......@@ -68,7 +68,7 @@ class FeedOp : public framework::OperatorBase {
col, feed_list.size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
auto *out_item = out_var->GetMutable<framework::FeedType>();
if (platform::is_same_place(feed_item.place(), place)) {
out_item->ShareDataWith(feed_item);
......
......@@ -21,6 +21,39 @@ limitations under the License. */
namespace paddle {
namespace operators {
// Copies one fetched tensor into CPU memory.
//
//   src_item        tensor read from the fetched variable.
//   fetch_var_name  name of the fetched variable; only consulted in MKL-DNN
//                   builds, to choose the layout the data is converted to.
//   dst_item        destination tensor; receives the data (or is reset to
//                   shape {0} when the source is empty) and src_item's LoD.
//
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
static void DataCopy(const framework::LoDTensor &src_item,
                     const std::string &fetch_var_name,
                     framework::LoDTensor *dst_item) {
  if (!src_item.IsInitialized() || src_item.numel() <= 0) {
    // Source is empty: skip the copy and reset the destination instead.
    dst_item->clear();
    dst_item->Resize({0});
  } else {
#ifdef PADDLE_WITH_MKLDNN
    if (src_item.layout() != framework::DataLayout::kMKLDNN) {
      TensorCopySync(src_item, platform::CPUPlace(), dst_item);
    } else {
      // Conversion from MKL-DNN to Paddle.
      // Grads of filter are converted to NCHW because params are not
      // subject to paddle's data_format; everything else follows the
      // current paddle data layout.
      const auto target_layout =
          fetch_var_name == framework::GradVarName("Filter")
              ? framework::DataLayout::kNCHW
              : paddle::platform::get_cur_paddle_data_layout();
      framework::Tensor converted;
      framework::innerTransDataLayoutFromMKLDNN(src_item.layout(),
                                                target_layout, src_item,
                                                &converted,
                                                platform::CPUPlace());
      TensorCopySync(converted, platform::CPUPlace(), dst_item);
    }
#else
    TensorCopySync(src_item, platform::CPUPlace(), dst_item);
#endif
  }
  // The LoD is propagated in every case, including the empty-source one.
  dst_item->set_lod(src_item.lod());
}
class FetchOp : public framework::OperatorBase {
public:
FetchOp(const std::string &type, const framework::VariableNameMap &inputs,
......@@ -66,42 +99,26 @@ class FetchOp : public framework::OperatorBase {
VLOG(3) << "Fetch variable " << fetch_var_name << " to variable "
<< out_name << "'s " << col << " column.";
auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();
auto &src_item = fetch_var->Get<framework::FeedFetchType>();
auto *fetch_list = out_var->GetMutable<framework::FetchList>();
if (static_cast<size_t>(col) >= fetch_list->size()) {
fetch_list->resize(col + 1);
}
auto &dst_item = fetch_list->at(col);
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
if (src_item.IsInitialized() && src_item.numel() > 0) {
#ifdef PADDLE_WITH_MKLDNN
// Conversion from MKL-DNN to Paddle
if (src_item.layout() == framework::DataLayout::kMKLDNN) {
framework::Tensor out;
// Convert to desired Paddle layout, apart from grads of filter
// as params are not a subject to paddle's data_format
framework::innerTransDataLayoutFromMKLDNN(
src_item.layout(),
fetch_var_name == framework::GradVarName("Filter")
? framework::DataLayout::kNCHW
: paddle::platform::get_cur_paddle_data_layout(),
src_item, &out, platform::CPUPlace());
TensorCopySync(out, platform::CPUPlace(), &dst_item);
} else {
TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
}
#else
TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
#endif
if (fetch_var->IsType<framework::LoDTensor>()) {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
auto *dst_item = &(boost::get<framework::LoDTensor>(fetch_list->at(col)));
DataCopy(src_item, fetch_var_name, dst_item);
} else {
// Not copy, if the src tensor is empty.
dst_item.clear();
dst_item.Resize({0});
auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
framework::LoDTensorArray tmp(src_item.size());
fetch_list->at(col) = tmp;
auto &dst_item =
boost::get<framework::LoDTensorArray>(fetch_list->at(col));
for (size_t i = 0; i < src_item.size(); ++i) {
DataCopy(src_item[i], fetch_var_name, &dst_item[i]);
}
}
dst_item.set_lod(src_item.lod());
}
};
......
......@@ -97,7 +97,9 @@ DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensor2DArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
......@@ -966,6 +968,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("get_lod_tensor_array",
[](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
py::return_value_policy::reference)
.def("get_fetch_list",
[](Variable &self) { return self.GetMutable<FetchList>(); },
py::return_value_policy::reference)
#if (defined(PADDLE_WITH_NCCL))
.def("get_communicator",
[](Variable &self) -> platform::Communicator * {
......@@ -1443,7 +1448,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
std::map<std::string, const LoDTensor *> *feed_targets,
std::map<std::string, LoDTensor *> *fetch_targets,
std::map<std::string, FetchType *> *fetch_targets,
bool create_local_scope = true, bool create_vars = true,
const std::string &feed_holder_name = "feed",
const std::string &fetch_holder_name = "fetch") {
......@@ -1503,7 +1508,16 @@ All parameter, weight, gradient are variables in Paddle.
#endif
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable);
// Python binding: looks up entry `index` of the fetch variable `var_name` in
// `scope` and returns it to Python as whichever type the entry currently
// holds (LoDTensor or LoDTensorArray).
m.def("get_fetch_variable",
      [](const Scope &scope, const std::string &var_name,
         size_t index) -> py::object {
        auto &fetched = framework::GetFetchVariable(scope, var_name, index);
        if (!data_is_lod_tensor(fetched)) {
          return py::cast(boost::get<LoDTensorArray>(fetched));
        }
        return py::cast(boost::get<LoDTensor>(fetched));
      });
m.def("get_variable_tensor", framework::GetVariableTensor);
m.def("_is_program_version_supported", IsProgramVersionSupported);
......@@ -1583,16 +1597,70 @@ All parameter, weight, gradient are variables in Paddle.
},
py::return_value_policy::take_ownership);
py::class_<LoDTensor2DArray>(m, "LoDTensor2DArray", R"DOC(
LoDTensor2DArray is 2-D array of LoDTensor.
// Python binding for FetchList. Each entry is either a LoDTensor or a
// LoDTensorArray (see the class docstring below).
py::class_<FetchList>(m, "FetchList", R"DOC( FetchList is a
vector of boost::variant<LoDTensor, LoDTensorArray>.
)DOC")
    // Moves every entry out into a Python list (a nested list for array
    // entries) and leaves the FetchList empty afterwards.
    .def("_move_to_list",
         [](FetchList &self) -> py::list {
           py::list res(self.size());
           for (size_t i = 0; i < self.size(); ++i) {
             if (data_is_lod_tensor(self[i])) {
               auto &data = boost::get<LoDTensor>(self[i]);
               res[i] = py::cast(std::move(data));
             } else {
               auto &data = boost::get<LoDTensorArray>(self[i]);
               py::list tmp(data.size());
               for (size_t j = 0; j < data.size(); ++j) {
                 tmp[j] = py::cast(std::move(data[j]));
               }
               res[i] = std::move(tmp);
             }
           }
           self.clear();
           return res;
         },
         py::return_value_policy::take_ownership)
    // Appends a LoDTensor entry that shares its data with `t`.
    .def("append",
         [](FetchList &self, const LoDTensor &t) {
           // The default-constructed entry already holds a LoDTensor.
           self.emplace_back();
           auto &lod_tensor = boost::get<LoDTensor>(self.back());
           lod_tensor.ShareDataWith(t);
           lod_tensor.set_lod(t.lod());
         },
         py::arg("var"))
    // Appends a LoDTensorArray entry whose elements share data with `t`.
    .def("append",
         [](FetchList &self, const LoDTensorArray &t) {
           // A default-constructed entry holds a LoDTensor, so extracting a
           // LoDTensorArray from it would fail. Create the entry as an array
           // already sized to `t` so the indexed writes below stay in bounds.
           self.emplace_back(LoDTensorArray(t.size()));
           auto &lod_tensor_array = boost::get<LoDTensorArray>(self.back());
           for (size_t i = 0; i < t.size(); ++i) {
             lod_tensor_array[i].ShareDataWith(t[i]);
             lod_tensor_array[i].set_lod(t[i].lod());
           }
         },
         py::arg("var"));
py::class_<FetchUnmergedList>(m, "FetchUnmergedList", R"DOC(
FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)).
)DOC")
.def("_move_to_list",
[](LoDTensor2DArray &self) -> py::list {
[](FetchUnmergedList &self) -> py::list {
py::list res(self.size());
for (size_t i = 0; i < self.size(); ++i) {
py::list tmp(self[i].size());
for (size_t j = 0; j < self[i].size(); ++j) {
tmp[j] = py::cast(std::move(self[i][j]));
if (data_is_lod_tensor(self[i][j])) {
auto &var = boost::get<LoDTensor>(self[i][j]);
tmp[j] = py::cast(std::move(var));
} else {
auto &var = boost::get<LoDTensorArray>(self[i][j]);
py::list tmp_array(var.size());
for (size_t k = 0; k < var.size(); ++k) {
tmp_array[k] = std::move(var[k]);
}
tmp[j] = std::move(tmp_array);
}
}
res[i] = std::move(tmp);
self[i].clear();
......@@ -2326,8 +2394,8 @@ All parameter, weight, gradient are variables in Paddle.
ret = self.Run(fetch_tensors, return_merged);
}
if (return_merged) {
return py::cast(std::move(
boost::get<paddle::framework::FeedFetchList>(ret)));
return py::cast(
std::move(boost::get<paddle::framework::FetchList>(ret)));
} else {
return py::cast(std::move(
boost::get<paddle::framework::FetchUnmergedList>(ret)));
......
......@@ -931,14 +931,14 @@ class Executor(object):
return_merged(bool): This parameter indicates whether fetched variables (the variables
specified in the fetch list) should be merged according to the execution device dimension.
If :code:`return_merged` is False, the type of the return value is a two-dimensional list
of :code:`Tensor` ( :code:`return_numpy` is False) or a two-dimensional list of
:code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
the type of the return value is an one-dimensional list of :code:`Tensor` ( :code:`return_numpy`
is False) or an one-dimensional list of :code:`numpy.ndarray` ( :code:`return_numpy` is True).
Please see Examples 2 for more details. If the lengths of fetched results are variant, please
set :code:`return_merged` as False, which denotes that the fetched results will not be merged.
The default is True, but it is just for the compatibility, and may use False as default value
in the future version.
of :code:`Tensor` / :code:`LoDTensorArray` ( :code:`return_numpy` is False) or a two-dimensional
list of :code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
the type of the return value is a one-dimensional list of :code:`Tensor` / :code:`LoDTensorArray`
( :code:`return_numpy` is False) or a one-dimensional list of :code:`numpy.ndarray`
( :code:`return_numpy` is True). Please see Examples 2 for more details. If the lengths of fetched
results vary, please set :code:`return_merged` as False, which denotes that the fetched
results will not be merged. The default is True, but only for compatibility; the default may
change to False in a future version.
use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned.
If the parameter is True, the program will be pruned according to the given feed and fetch_list,
which means the operators and variables in program that generate :code:`feed` and are not
......@@ -980,13 +980,17 @@ class Executor(object):
loss = fluid.layers.mean(hidden)
adam = fluid.optimizer.Adam()
adam.minimize(loss)
i = fluid.layers.zeros(shape=[1], dtype='int64')
array = fluid.layers.array_write(x=loss, i=i)
# Run the startup program once and only once.
exe.run(fluid.default_startup_program())
x = numpy.random.random(size=(10, 1)).astype('float32')
outs = exe.run(feed={'X': x},
fetch_list=[loss.name])
loss_val, array_val = exe.run(feed={'X': x},
fetch_list=[loss.name, array.name])
print(array_val)
# [array([0.02153828], dtype=float32)]
Examples 2:
.. code-block:: python
......@@ -1226,7 +1230,7 @@ class Executor(object):
else:
self._default_executor.run_prepared_ctx(ctx, scope, False, False,
False)
arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
arr = scope.find_var(fetch_var_name).get_fetch_list()
tensors = arr._move_to_list()
if return_numpy:
return as_numpy(tensors)
......
......@@ -58,8 +58,11 @@ def convolutional_neural_network(use_py_reader):
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
acc = fluid.layers.accuracy(input=prediction, label=label)
return img, label, prediction, avg_loss, acc, py_reader
i = fluid.layers.zeros(shape=[1], dtype='int64')
array = fluid.layers.array_write(x=prediction, i=i)
fluid.layers.increment(i)
fluid.layers.array_write(x=acc, i=i, array=array)
return array, img, label, prediction, avg_loss, acc, py_reader
def test():
......@@ -69,7 +72,7 @@ def test():
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
use_py_reader=False)
feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
......@@ -102,7 +105,7 @@ def train(use_cuda, thread_num, cpu_num):
print("paddle is not compiled with cuda, exit!")
return
img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
use_py_reader=True)
print("build convolutional neural network done.")
......@@ -150,7 +153,12 @@ def train(use_cuda, thread_num, cpu_num):
py_reader.start()
try:
while True:
loss_val = pe.run(fetch_list=[avg_loss.name])
array_v, acc_v, prediction_v, loss_val = pe.run(
fetch_list=[array, acc, prediction, avg_loss.name])
assert numpy.allclose(array_v[0], prediction_v) == True
assert numpy.allclose(array_v[1], acc_v) == True
loss_val = numpy.mean(loss_val)
if step % 10 == 0:
print("Pass %d, Batch %d, Cost %f, queue size %d" %
......
......@@ -19,25 +19,40 @@ import unittest
import numpy
import paddle.fluid.core as core
from paddle.fluid.executor import Executor
from paddle.fluid.layers import mul, data
from paddle.fluid.layers import mul, data, zeros, array_write, increment
class TestExecutor(unittest.TestCase):
def test_mul(self):
i = zeros(shape=[1], dtype='int64')
a = data(name='a', shape=[784], dtype='float32')
array = array_write(x=a, i=i)
i = increment(i)
b = data(
name='b',
shape=[784, 100],
dtype='float32',
append_batch_size=False)
array_write(x=b, i=i, array=array)
i = increment(i)
out = mul(x=a, y=b)
array_write(x=out, i=i, array=array)
a_np = numpy.random.random((100, 784)).astype('float32')
b_np = numpy.random.random((784, 100)).astype('float32')
exe = Executor()
outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
out = outs[0]
self.assertEqual((100, 100), out.shape)
self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
res, res_array = exe.run(feed={'a': a_np,
'b': b_np},
fetch_list=[out, array])
self.assertEqual((100, 100), res.shape)
self.assertTrue(numpy.allclose(res, numpy.dot(a_np, b_np)))
self.assertTrue(numpy.allclose(res_array[0], a_np))
self.assertTrue(numpy.allclose(res_array[1], b_np))
self.assertTrue(numpy.allclose(res_array[2], res))
if __name__ == '__main__':
......
......@@ -31,7 +31,9 @@ class TestFeedFetch(unittest.TestCase):
core.set_feed_variable(scope, input_tensor, "feed", 0)
output_tensor = core.get_fetch_variable(scope, "feed", 0)
output = scope.var("fetch").get_fetch_list()
output.append(input_tensor)
output_tensor = core.get_fetch_variable(scope, "fetch", 0)
output_lod = output_tensor.recursive_sequence_lengths()
self.assertEqual(2, output_lod[0][0])
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import unittest
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from simple_nets import simple_fc_net_with_inputs, simple_fc_net
class TestFetchLoDTensorArray(unittest.TestCase):
    """Fetches a LoDTensorArray through a data-parallel CompiledProgram and
    checks the result shapes for both unmerged and merged fetch results."""

    def build_program(self, main_program, startup_program):
        """Build the network and a tensor array holding img, label and loss.

        Args:
            main_program: program that receives the network ops.
            startup_program: program that receives the initialization ops.

        Returns:
            Tuple ``(loss, array)`` where ``array`` has ``img`` at index 0,
            ``label`` at index 1 and ``loss`` at index 2.
        """
        with fluid.unique_name.guard():
            with fluid.program_guard(main_program, startup_program):
                i = layers.zeros(shape=[1], dtype='int64')
                img = fluid.data(name='image', shape=[-1, 784], dtype='float32')
                label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
                # NOTE(review): this result is immediately overwritten by the
                # next assignment; only the loss of simple_fc_net() is used
                # below. Confirm which of the two networks is intended.
                loss = simple_fc_net_with_inputs(img, label, class_num=10)
                loss = simple_fc_net()
                opt = fluid.optimizer.SGD(learning_rate=0.001)
                opt.minimize(loss)
                array = layers.array_write(x=img, i=i)
                i = layers.increment(i)
                layers.array_write(x=label, i=i, array=array)
                i = layers.increment(i)
                layers.array_write(x=loss, i=i, array=array)
                return loss, array

    def check_network(self, use_cuda=True):
        """Run the program three times unmerged and three times merged.

        Args:
            use_cuda: run on ``CUDAPlace(0)`` when True, else on ``CPUPlace``.
        """
        # Two CPU devices are requested; this matches device_num below for
        # the CPU case.
        os.environ["CPU_NUM"] = str(2)
        main_program = fluid.Program()
        startup_program = fluid.Program()
        loss, array = self.build_program(main_program, startup_program)
        batch_size = 32
        image = np.random.normal(size=(batch_size, 784)).astype('float32')
        label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)
        feed_dict = {'image': image, 'label': label}
        build_strategy = fluid.BuildStrategy()
        binary = fluid.CompiledProgram(main_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        device_num = fluid.core.get_cuda_device_count() if use_cuda else 2

        # Unmerged: results keep a leading per-device dimension.
        for _ in range(3):
            loss_v, array_v = exe.run(binary,
                                      feed=feed_dict,
                                      fetch_list=[loss, array],
                                      return_merged=False)
            self.assertEqual(np.array(loss_v).shape, (device_num, 1))
            self.assertEqual(
                np.array(array_v[0][0]).shape, (batch_size / device_num, 784))
            self.assertEqual(
                np.array(array_v[0][1]).shape, (batch_size / device_num, 1))
            self.assertEqual(np.array(array_v[0][2]).shape, (1, ))

        # Merged: per-device results are combined into full-batch tensors.
        for _ in range(3):
            loss_v, array_v = exe.run(binary,
                                      feed=feed_dict,
                                      fetch_list=[loss, array],
                                      return_merged=True)
            self.assertEqual(np.array(loss_v).shape, (device_num, ))
            self.assertEqual(np.array(array_v[0]).shape, (batch_size, 784))
            self.assertEqual(np.array(array_v[1]).shape, (batch_size, 1))
            self.assertTrue(np.allclose(loss_v, array_v[2]))

    def test_fetch_lod_tensor_array(self):
        """Run the check on CUDA (when compiled in) and on CPU."""
        if fluid.core.is_compiled_with_cuda():
            self.check_network(use_cuda=True)
        self.check_network(use_cuda=False)

    def test_fetch_unmerged_parallel_graph(self):
        """Same check with FLAGS_enable_parallel_graph switched on."""
        fluid.core.globals()['FLAGS_enable_parallel_graph'] = True
        try:
            if fluid.core.is_compiled_with_cuda():
                self.check_network(use_cuda=True)
            self.check_network(use_cuda=False)
        finally:
            # Reset the process-wide flag even when an assertion fails, so
            # it does not leak into tests that run afterwards.
            fluid.core.globals()['FLAGS_enable_parallel_graph'] = False
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册