Unverified · Commit 2b896c1f authored by: G guofei, committed by: GitHub

Support LoDTensorArray in fetch (#23645)

* Support LoDTensorArray in fetch op

test=develop
Parent commit: 0b0adbf9
@@ -197,13 +197,27 @@ FetchResultType AsyncSSAGraphExecutor::Run(
   HandleException();

-  FeedFetchList ret;
-  auto &val = boost::get<FeedFetchList>(fetch_data);
+  FetchList ret;
+  auto &val = boost::get<FetchList>(fetch_data);
   for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
-    std::vector<const LoDTensor *> lodtensor_ptrs;
-    lodtensor_ptrs.push_back(&val.at(fetch_idx));
-    ret.emplace_back();
-    ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+    if (data_is_lod_tensor(val.at(fetch_idx))) {
+      std::vector<const LoDTensor *> lodtensor_ptrs;
+      lodtensor_ptrs.push_back(&(boost::get<LoDTensor>(val.at(fetch_idx))));
+      LoDTensor var;
+      var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      ret.emplace_back(var);
+    } else {
+      auto array = boost::get<LoDTensorArray>(val.at(fetch_idx));
+      LoDTensorArray item_array;
+      item_array.reserve(array.size());
+      for (size_t i = 0; i < array.size(); ++i) {
+        std::vector<const LoDTensor *> lodtensor_ptrs;
+        lodtensor_ptrs.push_back(&array[i]);
+        item_array.emplace_back();
+        item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      }
+      ret.emplace_back(item_array);
+    }
   }
   return ret;
 }
...
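A minimal, self-contained sketch of the per-fetch merge logic above, using stand-in types rather than the Paddle classes (Slice, SliceArray and MergeSlices are hypothetical; merging is modeled as concatenation of per-device slices): each fetched variable is either a single tensor or an array of tensors, and array fetches are merged item by item.

// stand-in sketch, not Paddle code
#include <iostream>
#include <variant>
#include <vector>

using Slice = std::vector<float>;
using SliceArray = std::vector<Slice>;
using Fetched = std::variant<Slice, SliceArray>;

// Analogue of MergeLoDTensor: concatenate one variable's per-device slices.
Slice MergeSlices(const std::vector<const Slice*>& parts) {
  Slice merged;
  for (const Slice* p : parts) merged.insert(merged.end(), p->begin(), p->end());
  return merged;
}

int main() {
  // One device's results for two fetched variables: a tensor and an array.
  Fetched tensor_fetch = Slice{1.f, 2.f};
  Fetched array_fetch = SliceArray{{3.f}, {4.f, 5.f}};

  std::vector<Fetched> ret;
  if (std::holds_alternative<Slice>(tensor_fetch)) {
    ret.emplace_back(MergeSlices({&std::get<Slice>(tensor_fetch)}));
  }
  // For an array fetch, merge each item separately (here: a single device).
  SliceArray merged_items;
  for (const Slice& item : std::get<SliceArray>(array_fetch)) {
    merged_items.push_back(MergeSlices({&item}));
  }
  ret.emplace_back(merged_items);
  std::cout << "fetched " << ret.size() << " variables\n";
  return 0;
}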
@@ -63,7 +63,7 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
   FetchResultType fetches;
   if (return_merged) {
-    fetches = FeedFetchList(fetch_tensors.size());
+    fetches = FetchList(fetch_tensors.size());
   } else {
     fetches = FetchUnmergedList(fetch_tensors.size());
   }
...
@@ -39,51 +39,98 @@ void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
 }

-void FetchOpHandle::WaitAndMergeCPUTensors() const {
+static void CheckDims(const framework::DDim &tensor_dims,
+                      const framework::DDim &ele_dims, const size_t offset) {
+  PADDLE_ENFORCE_EQ(
+      tensor_dims.size(), ele_dims.size(),
+      platform::errors::Fatal("The dimension sizes of fetched Tensors or "
+                              "the items of fetched LoDTensorArray are "
+                              "different from each other on different "
+                              "devices. And the error is caused by the %zu "
+                              "(th) fetched variable. Please set the "
+                              "parameter `return_merged = False` when you "
+                              "call the `Executor.run()` method.",
+                              offset));
+  for (int j = 1; j < tensor_dims.size(); j++) {
+    PADDLE_ENFORCE_EQ(
+        tensor_dims[j], ele_dims[j],
+        platform::errors::Fatal("The dimensions of fetched Tensors or "
+                                "the items of fetched LoDTensorArray are "
+                                "different from each other on different "
+                                "devices. And the error is caused by the "
+                                "%zu (th) fetched variable. Please set the "
+                                "parameter `return_merged = False` when "
+                                "you call the `Executor.run()` method.",
+                                offset));
+  }
+}
+
+void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
   if (return_merged_) {
-    const auto &tensor_dims = tensors_[0].dims();
-    for (size_t i = 1; i < tensors_.size(); i++) {
-      const auto &ele_dims = tensors_[i].dims();
-      PADDLE_ENFORCE_EQ(
-          tensor_dims.size(), ele_dims.size(),
-          platform::errors::Fatal("The dimension sizes of fetched Tensors are "
-                                  "different from each other on different "
-                                  "devices. And the error is caused by the %zu "
-                                  "(th) fetched variable. Please set the "
-                                  "parameter `return_merged = False` when you "
-                                  "call the `Executor.run()` method.",
-                                  offset_));
-      for (int j = 1; j < tensor_dims.size(); j++) {
-        PADDLE_ENFORCE_EQ(
-            tensor_dims[j], ele_dims[j],
-            platform::errors::Fatal("The dimensions of fetched Tensors are "
-                                    "different from each other on different "
-                                    "devices. And the error is caused by the "
-                                    "%zu (th) fetched variable. Please set the "
-                                    "parameter `return_merged = False` when "
-                                    "you call the `Executor.run()` method.",
-                                    offset_));
-      }
+    if (data_is_lod_tensor(tensors_[0])) {
+      const auto &tensor_dims = boost::get<LoDTensor>(tensors_[0]).dims();
+      for (size_t i = 1; i < tensors_.size(); i++) {
+        const auto &ele_dims = boost::get<LoDTensor>(tensors_[i]).dims();
+        CheckDims(tensor_dims, ele_dims, offset_);
+      }
+      std::vector<const LoDTensor *> tensors_ptr;
+      tensors_ptr.reserve(tensors_.size());
+      for (auto &t : tensors_) {
+        tensors_ptr.emplace_back(&boost::get<LoDTensor>(t));
+      }
+      auto &val = boost::get<FetchList>(*data_);
+      LoDTensor var;
+      var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      val.at(offset_) = std::move(var);
+    } else {
+      auto &array = boost::get<LoDTensorArray>(tensors_[0]);
+      LoDTensorArray tmp_array;
+      tmp_array.reserve(array.size());
+      for (size_t i = 0; i < array.size(); ++i) {
+        const auto &tensor_dims = array[i].dims();
+        std::vector<const LoDTensor *> tensors_ptr;
+        tensors_ptr.reserve(tensors_.size());
+        tensors_ptr.push_back(&array[i]);
+        for (size_t j = 1; j < tensors_.size(); ++j) {
+          auto &element = boost::get<LoDTensorArray>(tensors_[j]);
+          const auto &ele_dims = element[i].dims();
+          CheckDims(tensor_dims, ele_dims, offset_);
+          tensors_ptr.push_back(&element[i]);
+        }
+        tmp_array.emplace_back();
+        tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      }
+      auto &val = boost::get<FetchList>(*data_);
+      val.at(offset_) = std::move(tmp_array);
     }
-    std::vector<const LoDTensor *> tensors_ptr;
-    tensors_ptr.reserve(tensors_.size());
-    for (auto &t : tensors_) {
-      tensors_ptr.emplace_back(&t);
-    }
-    auto &val = boost::get<FeedFetchList>(*data_);
-    val.at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
   } else {
     auto &val = boost::get<FetchUnmergedList>(*data_);
     val.at(offset_) = std::move(tensors_);
   }
 }

+static void TransData(const framework::LoDTensor &src_item,
+                      framework::LoDTensor *dst_item) {
+  if (src_item.IsInitialized() && src_item.numel() > 0) {
+    if (platform::is_gpu_place(src_item.place())) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
+#endif
+    } else {
+      dst_item->ShareDataWith(src_item);
+    }
+  } else {
+    dst_item->clear();
+    dst_item->Resize({0});
+  }
+  dst_item->set_lod(src_item.lod());
+}
+
 void FetchOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());

   tensors_.resize(inputs_.size());
-  platform::CPUPlace cpu;
   auto &scopes = *local_exec_scopes_;

   for (size_t i = 0; i < inputs_.size(); ++i) {
@@ -93,23 +140,21 @@ void FetchOpHandle::RunImpl() {
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
                             var_handle->name());

-    auto &t = var->Get<framework::LoDTensor>();
-    if (t.IsInitialized() && t.numel() > 0) {
-      if (platform::is_gpu_place(t.place())) {
-#ifdef PADDLE_WITH_CUDA
-        TensorCopy(t, cpu, &tensors_[i]);
-#endif
-      } else {
-        tensors_[i].ShareDataWith(t);
-      }
+    if (var->IsType<LoDTensor>()) {
+      auto &t = var->Get<framework::LoDTensor>();
+      auto &item = boost::get<LoDTensor>(tensors_[i]);
+      TransData(t, &item);
     } else {
-      tensors_[i].clear();
-      tensors_[i].Resize({0});
+      auto &t = var->Get<framework::LoDTensorArray>();
+      LoDTensorArray tmp(t.size());
+      tensors_[i] = tmp;
+      auto &item = boost::get<LoDTensorArray>(tensors_[i]);
+      for (size_t j = 0; j < t.size(); ++j) {
+        TransData(t[j], &item[j]);
+      }
     }
-    tensors_[i].set_lod(t.lod());
   }

-  this->WaitAndMergeCPUTensors();
+  this->WaitAndMergeCPUFetchVars();
 }

 void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
...
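The CheckDims helper above enforces that per-device results agree in rank and in every dimension except the batch dimension before they are merged. A self-contained sketch of that invariant, with hypothetical names and shapes modeled as plain vectors:

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

using Shape = std::vector<long>;

// Analogue of CheckDims: same rank, and same extent on every axis except axis 0.
void CheckShapesMatch(const Shape& a, const Shape& b, size_t offset) {
  if (a.size() != b.size()) {
    throw std::runtime_error("rank mismatch for fetched variable " +
                             std::to_string(offset));
  }
  for (size_t j = 1; j < a.size(); ++j) {
    if (a[j] != b[j]) {
      throw std::runtime_error("dim mismatch for fetched variable " +
                               std::to_string(offset));
    }
  }
}

int main() {
  CheckShapesMatch({4, 3, 2}, {8, 3, 2}, 0);    // ok: only the batch dim differs
  try {
    CheckShapesMatch({4, 3, 2}, {4, 5, 2}, 1);  // throws: axis 1 differs
  } catch (const std::runtime_error&) {
    // Mirrors the framework behaviour of rejecting inconsistent shapes.
  }
  return 0;
}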
@@ -36,7 +36,7 @@ struct FetchOpHandle : public OpHandleBase {
   void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;

-  void WaitAndMergeCPUTensors() const;
+  void WaitAndMergeCPUFetchVars() const;

   std::string Name() const override;
@@ -54,7 +54,7 @@ struct FetchOpHandle : public OpHandleBase {
   size_t offset_;
   std::vector<Scope *> *local_scopes_;
   std::vector<Scope *> *local_exec_scopes_;
-  std::vector<LoDTensor> tensors_;
+  std::vector<FetchType> tensors_;
   bool return_merged_;
 };
...
@@ -179,7 +179,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }

   if (return_merged) {
-    return FeedFetchList();
+    return FetchList();
   } else {
     return FetchUnmergedList();
   }
@@ -245,22 +245,43 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }

   if (return_merged) {
-    FeedFetchList ret;
+    FetchList ret;
     ret.reserve(fetch_tensors.size());
     for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
       std::vector<const LoDTensor *> lodtensor_ptrs;
       lodtensor_ptrs.reserve(place_num);
+      std::vector<const LoDTensorArray *> lodtensorarray_ptrs;
+      lodtensorarray_ptrs.reserve(place_num);
       for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
         if (!is_valid[scope_idx]) {
           continue;
         }
-        const auto &fetch_list =
-            boost::get<FeedFetchList>(fetch_data[scope_idx]);
-        lodtensor_ptrs.push_back(&fetch_list[fetch_idx]);
+        const auto &fetch_list = boost::get<FetchList>(fetch_data[scope_idx]);
+        if (data_is_lod_tensor(fetch_list[fetch_idx])) {
+          lodtensor_ptrs.push_back(
+              &(boost::get<LoDTensor>(fetch_list[fetch_idx])));
+        } else {
+          lodtensorarray_ptrs.push_back(
+              &(boost::get<LoDTensorArray>(fetch_list[fetch_idx])));
+        }
+      }
+      if (lodtensor_ptrs.size() != 0) {
+        LoDTensor var;
+        var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        ret.emplace_back(var);
+      } else {
+        LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
+        for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) {
+          LoDTensor var;
+          std::vector<const LoDTensor *> ptrs;
+          for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
+            ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
+          }
+          var.MergeLoDTensor(ptrs, platform::CPUPlace());
+          var_array[i] = std::move(var);
+        }
+        ret.emplace_back(var_array);
       }
-      ret.emplace_back();
-      ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
     }
     return ret;
   } else {
@@ -277,8 +298,8 @@ FetchResultType ParallelSSAGraphExecutor::Run(
           boost::get<FetchUnmergedList>(fetch_data[scope_idx]);
       PADDLE_ENFORCE_EQ(
           fetch_list[fetch_idx].size(), 1,
-          platform::errors::Fatal(
-              "Each place must have only one fetched LoDTensor!"));
+          platform::errors::Fatal("Each place must have only one fetched "
+                                  "LoDTensor/LoDTensorArray!"));
       ret.back().emplace_back(fetch_list[fetch_idx][0]);
     }
   }
...
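A small sketch of the difference between the merged and unmerged return paths above, with stand-in types (analogous to FetchList vs FetchUnmergedList, but not the Paddle classes): the merged path yields one entry per fetched variable, while the unmerged path keeps one entry per device for every variable.

#include <iostream>
#include <variant>
#include <vector>

using Tensor = std::vector<float>;
using TensorArray = std::vector<Tensor>;
using FetchedVar = std::variant<Tensor, TensorArray>;

using MergedList = std::vector<FetchedVar>;                 // one entry per variable
using UnmergedList = std::vector<std::vector<FetchedVar>>;  // per variable, per device

int main() {
  const size_t num_vars = 2, num_devices = 2;

  // return_merged == false: keep each device's copy.
  UnmergedList unmerged(num_vars);
  for (size_t v = 0; v < num_vars; ++v) {
    for (size_t d = 0; d < num_devices; ++d) {
      unmerged[v].emplace_back(Tensor{float(d)});
    }
  }

  // return_merged == true: concatenate the per-device copies into one tensor.
  MergedList merged(num_vars);
  for (size_t v = 0; v < num_vars; ++v) {
    Tensor all;
    for (const auto& per_dev : unmerged[v]) {
      const Tensor& t = std::get<Tensor>(per_dev);
      all.insert(all.end(), t.begin(), t.end());
    }
    merged[v] = all;
  }
  std::cout << unmerged[0].size() << " device copies vs "
            << std::get<Tensor>(merged[0]).size() << " merged elements\n";
  return 0;
}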
@@ -72,7 +72,7 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
   std::unordered_set<VarHandleBase *> fetch_dependencies;
   FetchResultType fetch_data;
   if (return_merged) {
-    fetch_data = FeedFetchList(fetch_tensors.size());
+    fetch_data = FetchList(fetch_tensors.size());
   } else {
     fetch_data = FetchUnmergedList(fetch_tensors.size());
   }
...
@@ -256,7 +256,7 @@ static bool has_feed_operators(
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
     const BlockDesc& block,
-    const std::map<std::string, LoDTensor*>& fetch_targets,
+    const std::map<std::string, FetchType*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
   for (auto* op : block.AllOps()) {
@@ -306,7 +306,7 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                    std::map<std::string, const LoDTensor*>* feed_targets,
-                   std::map<std::string, LoDTensor*>* fetch_targets,
+                   std::map<std::string, FetchType*>* fetch_targets,
                    bool create_local_scope, bool create_vars,
                    const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {
@@ -504,7 +504,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 void Executor::RunPreparedContext(
     ExecutorPrepareContext* ctx, Scope* scope,
     std::map<std::string, const LoDTensor*>* feed_targets,
-    std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
+    std::map<std::string, FetchType*>* fetch_targets, bool create_local_scope,
     bool create_vars, const std::string& feed_holder_name,
     const std::string& fetch_holder_name) {
   auto& global_block = ctx->prog_.Block(ctx->block_id_);
...
@@ -87,7 +87,7 @@ class Executor {
   // This API is very slow.
   void Run(const ProgramDesc& program, Scope* scope,
            std::map<std::string, const LoDTensor*>* feed_targets,
-           std::map<std::string, LoDTensor*>* fetch_targets,
+           std::map<std::string, FetchType*>* fetch_targets,
            bool create_local_scope = true, bool create_vars = true,
            const std::string& feed_holder_name = "feed",
            const std::string& fetch_holder_name = "fetch");
@@ -95,7 +95,7 @@ class Executor {
   // This API is very slow.
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           std::map<std::string, const LoDTensor*>* feed_targets,
-                          std::map<std::string, LoDTensor*>* fetch_targets,
+                          std::map<std::string, FetchType*>* fetch_targets,
                           bool create_local_scope = true,
                           bool create_vars = true,
                           const std::string& feed_holder_name = "feed",
...
@@ -29,7 +29,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
   // be created.
   VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
   Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
   if (index >= feed_inputs.size()) {
     feed_inputs.resize(index + 1);
   }
@@ -39,27 +39,35 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
   feed_inputs[index].set_lod(input.lod());
 }

-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index) {
-  // Since we want to fetch LodTensor from a variable, the variable must
+  // Since we want to fetch FetchType from a variable, the variable must
   // be created alreadly.
   Variable* g_fetch_value = scope.FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
-  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
-                 "Only %s can be invoked by GetFetchVariable",
-                 typeid(FeedFetchList).name());
-  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+  PADDLE_ENFORCE_NOT_NULL(g_fetch_value,
+                          platform::errors::NotFound(
+                              "Variable %s is not found in scope.", var_name));
+  PADDLE_ENFORCE_EQ(g_fetch_value->IsType<FetchList>(), true,
+                    platform::errors::InvalidArgument(
+                        "Only %s can be invoked by GetFetchVariable",
+                        typeid(FetchList).name()));
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FetchList>();
   auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
-  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+  VLOG(3) << "Fetch " << var_name << " with index " << index;
+  PADDLE_ENFORCE_LT(index, fetch_outputs.size(),
+                    platform::errors::InvalidArgument(
+                        "index must less than fetch_outputs size."));
   return tensor;
 }

 LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
   Variable* var = scope.FindVar(var_name);
-  PADDLE_ENFORCE(var, "%s no in scope", var_name);
-  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                      var_name));
+  PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
+                    platform::errors::InvalidArgument(
+                        "Only support lod tensor in GetVariableTensor now."));
   return *var->GetMutable<LoDTensor>();
 }
...
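A self-contained sketch of the lookup-and-validate pattern used by GetFetchVariable above, with a hypothetical map-based scope standing in for the framework Scope (FakeScope, GetFetched and the element types are illustrative only):

#include <map>
#include <stdexcept>
#include <string>
#include <variant>
#include <vector>

using Tensor = std::vector<float>;
using TensorArray = std::vector<Tensor>;
using Fetched = std::variant<Tensor, TensorArray>;
using FetchedList = std::vector<Fetched>;

// Hypothetical scope: variable name -> fetch list.
using FakeScope = std::map<std::string, FetchedList>;

Fetched& GetFetched(FakeScope& scope, const std::string& var_name, size_t index) {
  auto it = scope.find(var_name);
  if (it == scope.end()) {
    throw std::runtime_error("Variable " + var_name + " is not found in scope.");
  }
  FetchedList& outputs = it->second;
  if (index >= outputs.size()) {
    throw std::out_of_range("index must be less than the fetch list size.");
  }
  return outputs[index];
}

int main() {
  FakeScope scope;
  scope["fetch"].emplace_back(Tensor{1.f, 2.f});
  Tensor& t = std::get<Tensor>(GetFetched(scope, "fetch", 0));
  return t.size() == 2 ? 0 : 1;
}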
@@ -24,7 +24,7 @@ namespace framework {
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index);

-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);

 LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
...
@@ -15,14 +15,33 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/variant.h"

 namespace paddle {
 namespace framework {
-using FeedFetchType = LoDTensor;
-using FeedFetchList = std::vector<FeedFetchType>;
-using FetchUnmergedList = std::vector<std::vector<FeedFetchType>>;
-using FetchResultType = boost::variant<FeedFetchList, FetchUnmergedList>;
+using FeedType = LoDTensor;
+using FeedList = std::vector<FeedType>;
+
+using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
+using FetchList = std::vector<FetchType>;
+using FetchUnmergedList = std::vector<std::vector<FetchType>>;
+
+using FetchResultType = boost::variant<FetchList, FetchUnmergedList>;
+
+inline bool data_is_lod_tensor(const FetchType &data) {
+  if (data.type() == typeid(LoDTensor)) {
+    return true;
+  }
+  return false;
+}
+
+inline bool data_is_lod_tensor_array(const FetchType &data) {
+  if (data.type() == typeid(LoDTensorArray)) {
+    return true;
+  }
+  return false;
+}

 static const char kFeedOpType[] = "feed";
 static const char kFetchOpType[] = "fetch";
...
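For illustration, a minimal compilable sketch of the same typeid-based check on a boost::variant, with stand-in element types (DenseBlob and BlobArray are hypothetical; the real code uses LoDTensor and LoDTensorArray):

#include <iostream>
#include <typeinfo>
#include <vector>
#include <boost/variant.hpp>

struct DenseBlob { std::vector<float> data; };
using BlobArray = std::vector<DenseBlob>;
using FetchLike = boost::variant<DenseBlob, BlobArray>;

// Same shape as data_is_lod_tensor(): compare the held type's typeid.
inline bool holds_blob(const FetchLike& data) {
  return data.type() == typeid(DenseBlob);
}

int main() {
  FetchLike a = DenseBlob{{1.f, 2.f}};
  FetchLike b = BlobArray{DenseBlob{{3.f}}};
  std::cout << holds_blob(a) << " " << holds_blob(b) << "\n";  // 1 0
  // boost::get<> extracts the held alternative, as in the executors above.
  std::cout << boost::get<DenseBlob>(a).data.size() << "\n";   // 2
  return 0;
}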
@@ -20,7 +20,6 @@ namespace paddle {
 namespace framework {

 using LoDTensorArray = std::vector<LoDTensor>;
-using LoDTensor2DArray = std::vector<std::vector<LoDTensor>>;

 }  // namespace framework
 }  // namespace paddle
@@ -36,6 +36,7 @@ inline proto::VarType::Type ToVarType(int type) {
     case proto::VarType::SELECTED_ROWS:
     case proto::VarType::LOD_RANK_TABLE:
     case proto::VarType::LOD_TENSOR_ARRAY:
+    case proto::VarType::FETCH_LIST:
     case proto::VarType::READER:
       return static_cast<proto::VarType::Type>(type);
     default:
@@ -61,6 +62,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarType::READER:
       visitor(var.Get<ReaderHolder>());
       return;
+    case proto::VarType::FETCH_LIST:
+      visitor(var.Get<FetchList>());
+      return;
     default:
       PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
   }
...
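The VisitVarType switch above dispatches a visitor to the concrete payload held by a Variable, now including FetchList. A rough stand-alone analogue of that pattern using std::variant and std::visit (the payload types here are hypothetical stand-ins):

#include <iostream>
#include <variant>
#include <vector>

using Tensor = std::vector<float>;
using TensorArray = std::vector<Tensor>;
using FetchListLike = std::vector<std::variant<Tensor, TensorArray>>;

// A variable can hold one of several payload types, including a fetch list.
using VarPayload = std::variant<Tensor, TensorArray, FetchListLike>;

struct PrintVisitor {
  void operator()(const Tensor& t) const { std::cout << "tensor(" << t.size() << ")\n"; }
  void operator()(const TensorArray& a) const { std::cout << "array(" << a.size() << ")\n"; }
  void operator()(const FetchListLike& f) const { std::cout << "fetch_list(" << f.size() << ")\n"; }
};

int main() {
  VarPayload var = FetchListLike{Tensor{1.f}, TensorArray{{2.f, 3.f}}};
  std::visit(PrintVisitor{}, var);  // dispatches to the FetchListLike overload
  return 0;
}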
@@ -19,6 +19,7 @@
 #include <tuple>
 #include <typeindex>
 #include <vector>
+#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"
@@ -139,7 +140,7 @@ struct VarTypeRegistryImpl {
 using VarTypeRegistry = detail::VarTypeRegistryImpl<
     Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
-    operators::reader::LoDTensorBlockingQueueHolder,
+    operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
 #ifdef PADDLE_WITH_CUDA
 #if defined(PADDLE_WITH_NCCL)
@@ -178,6 +179,7 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
 REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
 REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
+REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
 REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
 REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
...
@@ -34,9 +34,9 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   } else if (var_type == proto::VarType::SELECTED_ROWS) {
     var->GetMutable<SelectedRows>();
   } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
+    var->GetMutable<FeedList>();
   } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
+    var->GetMutable<FetchList>();
   } else if (var_type == proto::VarType::STEP_SCOPES) {
     var->GetMutable<std::vector<framework::Scope *>>();
   } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
...
@@ -383,8 +383,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   for (size_t i = 0; i < fetches_.size(); ++i) {
     int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
-        framework::GetFetchVariable(*scope, "fetch", idx);
+    framework::FetchType &fetch_var =
+        framework::GetFetchVariable(*scope, "fetch", idx);
+    auto &fetch = boost::get<framework::LoDTensor>(fetch_var);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetches_[idx]->Input("X")[0];
@@ -583,9 +584,9 @@ void AnalysisPredictor::PrepareFeedFetch() {
 void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
   PADDLE_ENFORCE_NOT_NULL(scope);
   auto *var = scope->Var("feed");
-  var->GetMutable<framework::FeedFetchList>();
+  var->GetMutable<framework::FeedList>();
   var = scope->Var("fetch");
-  var->GetMutable<framework::FeedFetchList>();
+  var->GetMutable<framework::FetchList>();
 }

 std::vector<std::string> AnalysisPredictor::GetInputNames() {
...
@@ -286,8 +286,9 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
-        framework::GetFetchVariable(*scope, "fetch", idx);
+    framework::FetchType &fetch_var =
+        framework::GetFetchVariable(*scope, "fetch", idx);
+    auto fetch = boost::get<framework::LoDTensor>(fetch_var);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
...
@@ -102,14 +102,15 @@ void MainWord2Vec(bool use_gpu) {
   cpu_feeds.push_back(&third_word);
   cpu_feeds.push_back(&fourth_word);

-  framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);

-  float* lod_data = output1.data<float>();
-  for (int i = 0; i < output1.numel(); ++i) {
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  float* lod_data = output1_tensor.data<float>();
+  for (int i = 0; i < output1_tensor.numel(); ++i) {
     EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
     EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
   }
@@ -137,8 +138,8 @@ void MainImageClassification(bool use_gpu) {
   std::vector<framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);

-  framework::LoDTensor output1;
-  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  framework::FetchType output1;
+  std::vector<framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   TestInference<platform::CPUPlace, false, true>(
@@ -153,7 +154,8 @@ void MainImageClassification(bool use_gpu) {
   ASSERT_EQ(outputs.size(), 1UL);
   size_t len = outputs[0].data.length();
   float* data = static_cast<float*>(outputs[0].data.data());
-  float* lod_data = output1.data<float>();
+  float* lod_data =
+      boost::get<paddle::framework::LoDTensor>(output1).data<float>();
   for (size_t j = 0; j < len / sizeof(float); ++j) {
     EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
   }
@@ -168,7 +170,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
   constexpr int num_jobs = 3;
   std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
   std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
+  std::vector<framework::FetchType> refs(num_jobs);
   for (size_t i = 0; i < jobs.size(); ++i) {
     // each job has 4 words
     jobs[i].resize(4);
@@ -181,7 +183,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
     // get reference result of each job
     std::vector<paddle::framework::LoDTensor*> ref_feeds;
-    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    std::vector<paddle::framework::FetchType*> ref_fetches(1, &refs[i]);
     for (auto& word : jobs[i]) {
       ref_feeds.push_back(&word);
     }
@@ -207,9 +209,10 @@ void MainThreadsWord2Vec(bool use_gpu) {
      }

      // check outputs correctness
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
+      auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
+      float* ref_data = ref_tensor.data<float>();
+      EXPECT_EQ(ref_tensor.numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < ref_tensor.numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], 2e-3);
      }
    });
@@ -230,7 +233,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
   std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
+  std::vector<framework::FetchType> refs(num_jobs);
   for (size_t i = 0; i < jobs.size(); ++i) {
     // prepare inputs
     std::vector<std::vector<int64_t>> feed_target_shapes =
@@ -242,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
     // get reference result of each job
     std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
-    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    std::vector<framework::FetchType*> ref_fetches(1, &refs[i]);
     TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
   }
@@ -259,9 +262,10 @@ void MainThreadsImageClassification(bool use_gpu) {
      ASSERT_EQ(local_outputs.size(), 1UL);
      const size_t len = local_outputs[0].data.length();
      float* data = static_cast<float*>(local_outputs[0].data.data());
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
+      auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
+      float* ref_data = ref_tensor.data<float>();
+      EXPECT_EQ((size_t)ref_tensor.numel(), len / sizeof(float));
+      for (int i = 0; i < ref_tensor.numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
      }
    });
...
@@ -40,10 +40,10 @@ TEST(inference, fit_a_line) {
     cpu_feeds[i].push_back(input);
   }

-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+  std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs1;
   cpu_fetchs1.resize(num_threads);
   for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
+    auto* output = new paddle::framework::FetchType();
     cpu_fetchs1[i].push_back(output);
   }
@@ -58,10 +58,10 @@ TEST(inference, fit_a_line) {
   }

 #ifdef PADDLE_WITH_CUDA
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+  std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs2;
   cpu_fetchs2.resize(num_threads);
   for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
+    auto* output = new paddle::framework::FetchType();
     cpu_fetchs2[i].push_back(output);
   }
@@ -76,7 +76,9 @@ TEST(inference, fit_a_line) {
   }

   for (int i = 0; i < num_threads; ++i) {
-    CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
+    CheckError<float>(
+        boost::get<paddle::framework::LoDTensor>(*cpu_fetchs1[i][0]),
+        boost::get<paddle::framework::LoDTensor>(*cpu_fetchs2[i][0]));
     delete cpu_fetchs2[i][0];
   }
 #endif
...
@@ -50,9 +50,9 @@ TEST(inference, image_classification) {
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);

-  paddle::framework::LoDTensor output1;
+  paddle::framework::FetchType output1;
   if (!FLAGS_skip_cpu) {
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);

     // Run inference on CPU
@@ -60,12 +60,12 @@ TEST(inference, image_classification) {
     LOG(INFO) << "Batch size is " << FLAGS_batch_size;
     TestInference<paddle::platform::CPUPlace, false, true>(
         dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
+    LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output1).dims();
   }

 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);

   // Run inference on CUDA GPU
@@ -73,17 +73,18 @@ TEST(inference, image_classification) {
   LOG(INFO) << "Batch size is " << FLAGS_batch_size;
   TestInference<paddle::platform::CUDAPlace, false, true>(
       dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
-  LOG(INFO) << output2.dims();
+  LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output2).dims();

   if (!FLAGS_skip_cpu) {
-    CheckError<float>(output1, output2);
+    CheckError<float>(boost::get<paddle::framework::LoDTensor>(output1),
+                      boost::get<paddle::framework::LoDTensor>(output2));
   }

   // float16 inference requires cuda GPUs with >= 5.3 compute capability
   if (!FLAGS_fp16_dirname.empty() &&
       paddle::platform::GetCUDAComputeCapability(0) >= 53) {
-    paddle::framework::LoDTensor output3;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
+    paddle::framework::FetchType output3;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs3;
     cpu_fetchs3.push_back(&output3);

     LOG(INFO) << "--- GPU Runs in float16 mode: ---";
@@ -92,7 +93,8 @@ TEST(inference, image_classification) {
     TestInference<paddle::platform::CUDAPlace, false, true>(
         FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);

-    CheckError<float>(output2, output3);
+    CheckError<float>(boost::get<paddle::framework::LoDTensor>(output2),
+                      boost::get<paddle::framework::LoDTensor>(output3));
   }
 #endif
 }
@@ -63,25 +63,27 @@ TEST(inference, label_semantic_roles) {
   cpu_feeds.push_back(&ctx_p2);
   cpu_feeds.push_back(&mark);

-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();

 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);

   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();

-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
@@ -118,8 +118,8 @@ void ThreadRunInfer(
       inference_program->GetFetchTargetNames();
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);

-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
+  paddle::framework::FetchType outtensor;
   fetch_targets[fetch_target_names[0]] = &outtensor;

   std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
@@ -150,7 +150,8 @@ void ThreadRunInfer(
       std::string fetch_target_name = op->Input("X")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
       *fetch_targets[fetch_target_name] =
-          paddle::framework::GetFetchVariable(*scope, "fetch", idx);
+          boost::get<paddle::framework::LoDTensor>(
+              paddle::framework::GetFetchVariable(*scope, "fetch", idx));
     }
   }
@@ -215,8 +216,8 @@ TEST(inference, nlp) {
   const std::vector<std::string>& fetch_target_names =
       inference_program->GetFetchTargetNames();
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
+  paddle::framework::FetchType outtensor;
   fetch_targets[fetch_target_names[0]] = &outtensor;

   // prepare feed
...
@@ -41,28 +41,30 @@ TEST(inference, recognize_digits) {
   cpu_feeds.push_back(&input);

   for (auto is_combined : {false, true}) {
-    paddle::framework::LoDTensor output1;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    paddle::framework::FetchType output1;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);

     // Run inference on CPU
     LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
     TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
                                               FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
+    auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+    LOG(INFO) << output1_tensor.dims();

 #ifdef PADDLE_WITH_CUDA
-    paddle::framework::LoDTensor output2;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+    paddle::framework::FetchType output2;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs2;
     cpu_fetchs2.push_back(&output2);

     // Run inference on CUDA GPU
     LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
     TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
                                                FLAGS_repeat, is_combined);
-    LOG(INFO) << output2.dims();
+    auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+    LOG(INFO) << output2_tensor.dims();

-    CheckError<float>(output1, output2);
+    CheckError<float>(output1_tensor, output2_tensor);
 #endif
   }
 }
@@ -65,23 +65,25 @@ TEST(inference, recommender_system) {
   cpu_feeds.push_back(&category_id);
   cpu_feeds.push_back(&movie_title);

-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.dims();

 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);

   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.dims();

-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
@@ -41,25 +41,27 @@ TEST(inference, rnn_encoder_decoder) {
   cpu_feeds.push_back(&word_data);
   cpu_feeds.push_back(&trg_word);

-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();

 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);

   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();

-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
@@ -39,25 +39,27 @@ TEST(inference, understand_sentiment) {
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&words);

-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();

 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);

   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();

-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
@@ -44,25 +44,27 @@ TEST(inference, word2vec) {
   cpu_feeds.push_back(&third_word);
   cpu_feeds.push_back(&fourth_word);

-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);

   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();

 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);

   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();

-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once

 #include <map>
+#include <memory>
 #include <random>
 #include <string>
 #include <vector>
@@ -142,7 +143,7 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const std::vector<paddle::framework::FetchType*>& cpu_fetchs,
                    const int repeat = 1, const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();
@@ -194,7 +195,7 @@ void TestInference(const std::string& dirname,
   }

   // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
   for (size_t i = 0; i < fetch_target_names.size(); ++i) {
     fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
   }
...
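The inference tests above compare CPU and GPU results through CheckError<float> after unpacking the FetchType. A self-contained sketch of that kind of tolerance-based element-wise comparison (gtest-free, with hypothetical names):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Count elements whose absolute difference exceeds the tolerance,
// mirroring what an elementwise CheckError-style helper verifies.
size_t CountMismatches(const std::vector<float>& a, const std::vector<float>& b,
                       float tol) {
  if (a.size() != b.size()) return a.size() > b.size() ? a.size() : b.size();
  size_t bad = 0;
  for (size_t i = 0; i < a.size(); ++i) {
    if (std::fabs(a[i] - b[i]) > tol) ++bad;
  }
  return bad;
}

int main() {
  std::vector<float> cpu_out{0.10f, 0.20f, 0.30f};
  std::vector<float> gpu_out{0.10f, 0.20f, 0.31f};
  std::cout << CountMismatches(cpu_out, gpu_out, 1e-3f) << " mismatches\n";  // 1
  return 0;
}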
...@@ -58,7 +58,7 @@ class FeedOp : public framework::OperatorBase { ...@@ -58,7 +58,7 @@ class FeedOp : public framework::OperatorBase {
VLOG(3) << "Feed variable " << feed_var_name << "'s " << col VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
<< " column to variable " << out_name; << " column to variable " << out_name;
auto &feed_list = feed_var->Get<framework::FeedFetchList>(); auto &feed_list = feed_var->Get<framework::FeedList>();
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
static_cast<size_t>(col), feed_list.size(), static_cast<size_t>(col), feed_list.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -68,7 +68,7 @@ class FeedOp : public framework::OperatorBase { ...@@ -68,7 +68,7 @@ class FeedOp : public framework::OperatorBase {
col, feed_list.size())); col, feed_list.size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col)); auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>(); auto *out_item = out_var->GetMutable<framework::FeedType>();
if (platform::is_same_place(feed_item.place(), place)) { if (platform::is_same_place(feed_item.place(), place)) {
out_item->ShareDataWith(feed_item); out_item->ShareDataWith(feed_item);
......
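With feed_var now read as a framework::FeedList and the output written as a framework::FeedType, the Python-side feed path itself is unchanged. A minimal sketch of that path (toy shape, hypothetical values), using only bindings already present in paddle.fluid.core:

import numpy as np
import paddle.fluid.core as core

scope = core.Scope()
place = core.CPUPlace()

t = core.LoDTensor()
t.set(np.ones((2, 3), dtype=np.float32), place)

# Store the tensor into column 0 of the "feed" variable's FeedList;
# feed_op later copies that column into its output variable.
core.set_feed_variable(scope, t, "feed", 0)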
...@@ -21,6 +21,39 @@ limitations under the License. */ ...@@ -21,6 +21,39 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
static void DataCopy(const framework::LoDTensor &src_item,
const std::string &fetch_var_name,
framework::LoDTensor *dst_item) {
if (src_item.IsInitialized() && src_item.numel() > 0) {
#ifdef PADDLE_WITH_MKLDNN
// Conversion from MKL-DNN to Paddle
if (src_item.layout() == framework::DataLayout::kMKLDNN) {
framework::Tensor out;
// Convert to desired Paddle layout, apart from grads of filter
// as params are not a subject to paddle's data_format
framework::innerTransDataLayoutFromMKLDNN(
src_item.layout(),
fetch_var_name == framework::GradVarName("Filter")
? framework::DataLayout::kNCHW
: paddle::platform::get_cur_paddle_data_layout(),
src_item, &out, platform::CPUPlace());
TensorCopySync(out, platform::CPUPlace(), dst_item);
} else {
TensorCopySync(src_item, platform::CPUPlace(), dst_item);
}
#else
TensorCopySync(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
// Not copy, if the src tensor is empty.
dst_item->clear();
dst_item->Resize({0});
}
dst_item->set_lod(src_item.lod());
}
class FetchOp : public framework::OperatorBase { class FetchOp : public framework::OperatorBase {
public: public:
FetchOp(const std::string &type, const framework::VariableNameMap &inputs, FetchOp(const std::string &type, const framework::VariableNameMap &inputs,
...@@ -66,42 +99,26 @@ class FetchOp : public framework::OperatorBase { ...@@ -66,42 +99,26 @@ class FetchOp : public framework::OperatorBase {
VLOG(3) << "Fetch variable " << fetch_var_name << " to variable " VLOG(3) << "Fetch variable " << fetch_var_name << " to variable "
<< out_name << "'s " << col << " column."; << out_name << "'s " << col << " column.";
auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>(); auto *fetch_list = out_var->GetMutable<framework::FetchList>();
auto &src_item = fetch_var->Get<framework::FeedFetchType>();
if (static_cast<size_t>(col) >= fetch_list->size()) { if (static_cast<size_t>(col) >= fetch_list->size()) {
fetch_list->resize(col + 1); fetch_list->resize(col + 1);
} }
auto &dst_item = fetch_list->at(col);
// FIXME(yuyang18): Should we assume the fetch operator always generate if (fetch_var->IsType<framework::LoDTensor>()) {
// CPU outputs? auto &src_item = fetch_var->Get<framework::LoDTensor>();
if (src_item.IsInitialized() && src_item.numel() > 0) { auto *dst_item = &(boost::get<framework::LoDTensor>(fetch_list->at(col)));
#ifdef PADDLE_WITH_MKLDNN DataCopy(src_item, fetch_var_name, dst_item);
// Conversion from MKL-DNN to Paddle
if (src_item.layout() == framework::DataLayout::kMKLDNN) {
framework::Tensor out;
// Convert to desired Paddle layout, apart from grads of filter
// as params are not a subject to paddle's data_format
framework::innerTransDataLayoutFromMKLDNN(
src_item.layout(),
fetch_var_name == framework::GradVarName("Filter")
? framework::DataLayout::kNCHW
: paddle::platform::get_cur_paddle_data_layout(),
src_item, &out, platform::CPUPlace());
TensorCopySync(out, platform::CPUPlace(), &dst_item);
} else {
TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
}
#else
TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
#endif
} else { } else {
// Not copy, if the src tensor is empty. auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
dst_item.clear(); framework::LoDTensorArray tmp(src_item.size());
dst_item.Resize({0}); fetch_list->at(col) = tmp;
auto &dst_item =
boost::get<framework::LoDTensorArray>(fetch_list->at(col));
for (size_t i = 0; i < src_item.size(); ++i) {
DataCopy(src_item[i], fetch_var_name, &dst_item[i]);
}
} }
dst_item.set_lod(src_item.lod());
} }
}; };
......
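Because FetchOp now dispatches on the fetched variable's type, a LoDTensorArray can appear directly in fetch_list and each of its items is copied to CPU through DataCopy. A minimal, hedged sketch of the user-visible effect (hypothetical name 'X', toy shapes), mirroring the docstring example updated further below:

import numpy
import paddle.fluid as fluid

x = fluid.data(name='X', shape=[-1, 10], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=10))

i = fluid.layers.zeros(shape=[1], dtype='int64')
array = fluid.layers.array_write(x=loss, i=i)   # wrap `loss` in a LoDTensorArray

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

x_np = numpy.random.random(size=(8, 10)).astype('float32')
loss_val, array_val = exe.run(feed={'X': x_np},
                              fetch_list=[loss.name, array.name])
# array_val comes back as a Python list with one numpy.ndarray per array item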
...@@ -97,7 +97,9 @@ DECLARE_bool(use_mkldnn); ...@@ -97,7 +97,9 @@ DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python // disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensor2DArray); PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -966,6 +968,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -966,6 +968,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("get_lod_tensor_array", .def("get_lod_tensor_array",
[](Variable &self) { return self.GetMutable<LoDTensorArray>(); }, [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("get_fetch_list",
[](Variable &self) { return self.GetMutable<FetchList>(); },
py::return_value_policy::reference)
#if (defined(PADDLE_WITH_NCCL)) #if (defined(PADDLE_WITH_NCCL))
.def("get_communicator", .def("get_communicator",
[](Variable &self) -> platform::Communicator * { [](Variable &self) -> platform::Communicator * {
...@@ -1443,7 +1448,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1443,7 +1448,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("run_prepared_ctx", .def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
std::map<std::string, const LoDTensor *> *feed_targets, std::map<std::string, const LoDTensor *> *feed_targets,
std::map<std::string, LoDTensor *> *fetch_targets, std::map<std::string, FetchType *> *fetch_targets,
bool create_local_scope = true, bool create_vars = true, bool create_local_scope = true, bool create_vars = true,
const std::string &feed_holder_name = "feed", const std::string &feed_holder_name = "feed",
const std::string &fetch_holder_name = "fetch") { const std::string &fetch_holder_name = "fetch") {
...@@ -1503,7 +1508,16 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1503,7 +1508,16 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable",
[](const Scope &scope, const std::string &var_name,
size_t index) -> py::object {
auto &var = framework::GetFetchVariable(scope, var_name, index);
if (data_is_lod_tensor(var)) {
return py::cast(boost::get<LoDTensor>(var));
} else {
return py::cast(boost::get<LoDTensorArray>(var));
}
});
m.def("get_variable_tensor", framework::GetVariableTensor); m.def("get_variable_tensor", framework::GetVariableTensor);
m.def("_is_program_version_supported", IsProgramVersionSupported); m.def("_is_program_version_supported", IsProgramVersionSupported);
...@@ -1583,16 +1597,70 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1583,16 +1597,70 @@ All parameter, weight, gradient are variables in Paddle.
}, },
py::return_value_policy::take_ownership); py::return_value_policy::take_ownership);
py::class_<LoDTensor2DArray>(m, "LoDTensor2DArray", R"DOC( py::class_<FetchList>(m, "FetchList", R"DOC( FetchList is a
LoDTensor2DArray is 2-D array of LoDTensor. vector of boost::variant<LoDTensor, LoDTensorArray>.
)DOC")
.def("_move_to_list",
[](FetchList &self) -> py::list {
py::list res(self.size());
for (size_t i = 0; i < self.size(); ++i) {
if (data_is_lod_tensor(self[i])) {
auto &data = boost::get<LoDTensor>(self[i]);
res[i] = py::cast(std::move(data));
} else {
auto &data = boost::get<LoDTensorArray>(self[i]);
py::list tmp(data.size());
for (size_t j = 0; j < data.size(); ++j) {
tmp[j] = py::cast(std::move(data[j]));
}
res[i] = std::move(tmp);
}
}
self.clear();
return res;
},
py::return_value_policy::take_ownership)
.def("append",
[](FetchList &self, const LoDTensor &t) {
self.emplace_back();
auto &lod_tensor = boost::get<LoDTensor>(self.back());
lod_tensor.ShareDataWith(t);
lod_tensor.set_lod(t.lod());
},
py::arg("var"))
.def("append",
[](FetchList &self, const LoDTensorArray &t) {
self.emplace_back();
auto &lod_tensor_array = boost::get<LoDTensorArray>(self.back());
for (size_t i = 0; i < t.size(); ++i) {
lod_tensor_array[i].ShareDataWith(t[i]);
lod_tensor_array[i].set_lod(t[i].lod());
}
},
py::arg("var"));
py::class_<FetchUnmergedList>(m, "FetchUnmergedList", R"DOC(
FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)).
)DOC") )DOC")
.def("_move_to_list", .def("_move_to_list",
[](LoDTensor2DArray &self) -> py::list { [](FetchUnmergedList &self) -> py::list {
py::list res(self.size()); py::list res(self.size());
for (size_t i = 0; i < self.size(); ++i) { for (size_t i = 0; i < self.size(); ++i) {
py::list tmp(self[i].size()); py::list tmp(self[i].size());
for (size_t j = 0; j < self[i].size(); ++j) { for (size_t j = 0; j < self[i].size(); ++j) {
tmp[j] = py::cast(std::move(self[i][j])); if (data_is_lod_tensor(self[i][j])) {
auto &var = boost::get<LoDTensor>(self[i][j]);
tmp[j] = py::cast(std::move(var));
} else {
auto &var = boost::get<LoDTensorArray>(self[i][j]);
py::list tmp_array(var.size());
for (size_t k = 0; k < var.size(); ++k) {
tmp_array[k] = std::move(var[k]);
}
tmp[j] = std::move(tmp_array);
}
} }
res[i] = std::move(tmp); res[i] = std::move(tmp);
self[i].clear(); self[i].clear();
...@@ -2326,8 +2394,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2326,8 +2394,8 @@ All parameter, weight, gradient are variables in Paddle.
ret = self.Run(fetch_tensors, return_merged); ret = self.Run(fetch_tensors, return_merged);
} }
if (return_merged) { if (return_merged) {
return py::cast(std::move( return py::cast(
boost::get<paddle::framework::FeedFetchList>(ret))); std::move(boost::get<paddle::framework::FetchList>(ret)));
} else { } else {
return py::cast(std::move( return py::cast(std::move(
boost::get<paddle::framework::FetchUnmergedList>(ret))); boost::get<paddle::framework::FetchUnmergedList>(ret)));
......
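The new pybind bindings can also be exercised directly from Python, which is essentially what the feed/fetch unit test below does. A rough sketch (hypothetical data) of that low-level flow:

import numpy as np
import paddle.fluid.core as core

scope = core.Scope()
place = core.CPUPlace()

t = core.LoDTensor()
t.set(np.arange(6).reshape(2, 3).astype('float32'), place)

fetch_list = scope.var("fetch").get_fetch_list()   # new get_fetch_list binding
fetch_list.append(t)                               # FetchList.append(LoDTensor)

fetched = core.get_fetch_variable(scope, "fetch", 0)
# `fetched` is a LoDTensor here; a LoDTensorArray entry would instead come back
# as a plain Python list of LoDTensors.
print(np.array(fetched))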
...@@ -931,14 +931,14 @@ class Executor(object): ...@@ -931,14 +931,14 @@ class Executor(object):
return_merged(bool): This parameter indicates whether fetched variables (the variables return_merged(bool): This parameter indicates whether fetched variables (the variables
specified in the fetch list) should be merged according to the execution device dimension. specified in the fetch list) should be merged according to the execution device dimension.
If :code:`return_merged` is False, the type of the return value is a two-dimensional list If :code:`return_merged` is False, the type of the return value is a two-dimensional list
of :code:`Tensor` ( :code:`return_numpy` is False) or a two-dimensional list of of :code:`Tensor` / :code:`LoDTensorArray` ( :code:`return_numpy` is False) or a two-dimensional
:code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True, list of :code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
the type of the return value is an one-dimensional list of :code:`Tensor` ( :code:`return_numpy` the type of the return value is an one-dimensional list of :code:`Tensor` / :code:`LoDTensorArray`
is False) or an one-dimensional list of :code:`numpy.ndarray` ( :code:`return_numpy` is True). ( :code:`return_numpy` is False) or an one-dimensional list of :code:`numpy.ndarray`
Please see Examples 2 for more details. If the lengths of fetched results are variant, please ( :code:`return_numpy` is True). Please see Examples 2 for more details. If the lengths of fetched
set :code:`return_merged` as False, which denotes that the fetched results will not be merged. results are variant, please set :code:`return_merged` as False, which denotes that the fetched
The default is True, but it is just for the compatibility, and may use False as default value results will not be merged. The default is True, but it is just for the compatibility, and may
in the future version. use False as default value in the future version.
use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned. use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned.
If the parameter is True, the program will be pruned according to the given feed and fetch_list, If the parameter is True, the program will be pruned according to the given feed and fetch_list,

which means the operators and variables in program that generate :code:`feed` and are not which means the operators and variables in program that generate :code:`feed` and are not
...@@ -980,13 +980,17 @@ class Executor(object): ...@@ -980,13 +980,17 @@ class Executor(object):
loss = fluid.layers.mean(hidden) loss = fluid.layers.mean(hidden)
adam = fluid.optimizer.Adam() adam = fluid.optimizer.Adam()
adam.minimize(loss) adam.minimize(loss)
i = fluid.layers.zeros(shape=[1], dtype='int64')
array = fluid.layers.array_write(x=loss, i=i)
# Run the startup program once and only once. # Run the startup program once and only once.
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
x = numpy.random.random(size=(10, 1)).astype('float32') x = numpy.random.random(size=(10, 1)).astype('float32')
outs = exe.run(feed={'X': x}, loss_val, array_val = exe.run(feed={'X': x},
fetch_list=[loss.name]) fetch_list=[loss.name, array.name])
print(array_val)
# [array([0.02153828], dtype=float32)]
Examples 2: Examples 2:
.. code-block:: python .. code-block:: python
...@@ -1226,7 +1230,7 @@ class Executor(object): ...@@ -1226,7 +1230,7 @@ class Executor(object):
else: else:
self._default_executor.run_prepared_ctx(ctx, scope, False, False, self._default_executor.run_prepared_ctx(ctx, scope, False, False,
False) False)
arr = scope.find_var(fetch_var_name).get_lod_tensor_array() arr = scope.find_var(fetch_var_name).get_fetch_list()
tensors = arr._move_to_list() tensors = arr._move_to_list()
if return_numpy: if return_numpy:
return as_numpy(tensors) return as_numpy(tensors)
......
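A rough, untested sketch of how return_merged interacts with a fetched LoDTensorArray under data-parallel execution (CPU_NUM=2, toy shapes; the authoritative coverage is the new unit test added further down):

import os
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

os.environ["CPU_NUM"] = "2"
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    img = fluid.data(name='image', shape=[-1, 4], dtype='float32')
    loss = layers.mean(layers.fc(input=img, size=1))
    fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)
    i = layers.zeros(shape=[1], dtype='int64')
    array = layers.array_write(x=img, i=i)          # item 0: the input batch
    i = layers.increment(i)
    layers.array_write(x=loss, i=i, array=array)    # item 1: the scalar loss

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
binary = fluid.CompiledProgram(main_prog).with_data_parallel(loss_name=loss.name)
feed = {'image': np.random.random((8, 4)).astype('float32')}

# return_merged=False: one result per device, indexed as array_v[device][item]
loss_v, array_v = exe.run(binary, feed=feed,
                          fetch_list=[loss, array], return_merged=False)

# return_merged=True (the default): per-device items are merged back together
loss_v, array_v = exe.run(binary, feed=feed,
                          fetch_list=[loss, array], return_merged=True)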
...@@ -58,8 +58,11 @@ def convolutional_neural_network(use_py_reader): ...@@ -58,8 +58,11 @@ def convolutional_neural_network(use_py_reader):
loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.mean(loss)
acc = fluid.layers.accuracy(input=prediction, label=label) acc = fluid.layers.accuracy(input=prediction, label=label)
i = fluid.layers.zeros(shape=[1], dtype='int64')
return img, label, prediction, avg_loss, acc, py_reader array = fluid.layers.array_write(x=prediction, i=i)
fluid.layers.increment(i)
fluid.layers.array_write(x=acc, i=i, array=array)
return array, img, label, prediction, avg_loss, acc, py_reader
def test(): def test():
...@@ -69,7 +72,7 @@ def test(): ...@@ -69,7 +72,7 @@ def test():
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
use_py_reader=False) use_py_reader=False)
feeder = fluid.DataFeeder(feed_list=[img, label], place=place) feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
...@@ -102,7 +105,7 @@ def train(use_cuda, thread_num, cpu_num): ...@@ -102,7 +105,7 @@ def train(use_cuda, thread_num, cpu_num):
print("paddle is not compiled with cuda, exit!") print("paddle is not compiled with cuda, exit!")
return return
img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
use_py_reader=True) use_py_reader=True)
print("build convolutional neural network done.") print("build convolutional neural network done.")
...@@ -150,7 +153,12 @@ def train(use_cuda, thread_num, cpu_num): ...@@ -150,7 +153,12 @@ def train(use_cuda, thread_num, cpu_num):
py_reader.start() py_reader.start()
try: try:
while True: while True:
loss_val = pe.run(fetch_list=[avg_loss.name]) array_v, acc_v, prediction_v, loss_val = pe.run(
fetch_list=[array, acc, prediction, avg_loss.name])
assert numpy.allclose(array_v[0], prediction_v) == True
assert numpy.allclose(array_v[1], acc_v) == True
loss_val = numpy.mean(loss_val) loss_val = numpy.mean(loss_val)
if step % 10 == 0: if step % 10 == 0:
print("Pass %d, Batch %d, Cost %f, queue size %d" % print("Pass %d, Batch %d, Cost %f, queue size %d" %
......
...@@ -19,25 +19,40 @@ import unittest ...@@ -19,25 +19,40 @@ import unittest
import numpy import numpy
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.layers import mul, data from paddle.fluid.layers import mul, data, zeros, array_write, increment
class TestExecutor(unittest.TestCase): class TestExecutor(unittest.TestCase):
def test_mul(self): def test_mul(self):
i = zeros(shape=[1], dtype='int64')
a = data(name='a', shape=[784], dtype='float32') a = data(name='a', shape=[784], dtype='float32')
array = array_write(x=a, i=i)
i = increment(i)
b = data( b = data(
name='b', name='b',
shape=[784, 100], shape=[784, 100],
dtype='float32', dtype='float32',
append_batch_size=False) append_batch_size=False)
array_write(x=b, i=i, array=array)
i = increment(i)
out = mul(x=a, y=b) out = mul(x=a, y=b)
array_write(x=out, i=i, array=array)
a_np = numpy.random.random((100, 784)).astype('float32') a_np = numpy.random.random((100, 784)).astype('float32')
b_np = numpy.random.random((784, 100)).astype('float32') b_np = numpy.random.random((784, 100)).astype('float32')
exe = Executor() exe = Executor()
outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out]) res, res_array = exe.run(feed={'a': a_np,
out = outs[0] 'b': b_np},
self.assertEqual((100, 100), out.shape) fetch_list=[out, array])
self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
self.assertEqual((100, 100), res.shape)
self.assertTrue(numpy.allclose(res, numpy.dot(a_np, b_np)))
self.assertTrue(numpy.allclose(res_array[0], a_np))
self.assertTrue(numpy.allclose(res_array[1], b_np))
self.assertTrue(numpy.allclose(res_array[2], res))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -31,7 +31,9 @@ class TestFeedFetch(unittest.TestCase): ...@@ -31,7 +31,9 @@ class TestFeedFetch(unittest.TestCase):
core.set_feed_variable(scope, input_tensor, "feed", 0) core.set_feed_variable(scope, input_tensor, "feed", 0)
output_tensor = core.get_fetch_variable(scope, "feed", 0) output = scope.var("fetch").get_fetch_list()
output.append(input_tensor)
output_tensor = core.get_fetch_variable(scope, "fetch", 0)
output_lod = output_tensor.recursive_sequence_lengths() output_lod = output_tensor.recursive_sequence_lengths()
self.assertEqual(2, output_lod[0][0]) self.assertEqual(2, output_lod[0][0])
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import unittest
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from simple_nets import simple_fc_net_with_inputs, simple_fc_net
class TestFetchLoDTensorArray(unittest.TestCase):
def build_program(self, main_program, startup_program):
with fluid.unique_name.guard():
with fluid.program_guard(main_program, startup_program):
i = layers.zeros(shape=[1], dtype='int64')
img = fluid.data(name='image', shape=[-1, 784], dtype='float32')
label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
loss = simple_fc_net_with_inputs(img, label, class_num=10)
loss = simple_fc_net()
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
array = layers.array_write(x=img, i=i)
i = layers.increment(i)
layers.array_write(x=label, i=i, array=array)
i = layers.increment(i)
layers.array_write(x=loss, i=i, array=array)
return loss, array
def check_network(self, use_cuda=True):
os.environ["CPU_NUM"] = str(2)
main_program = fluid.Program()
startup_program = fluid.Program()
loss, array = self.build_program(main_program, startup_program)
batch_size = 32
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_program)
feed_dict = {'image': image, 'label': label}
build_strategy = fluid.BuildStrategy()
binary = fluid.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
device_num = fluid.core.get_cuda_device_count() if use_cuda else 2
for _ in range(3):
loss_v, array_v = exe.run(binary,
feed=feed_dict,
fetch_list=[loss, array],
return_merged=False)
self.assertEqual(np.array(loss_v).shape, (device_num, 1))
self.assertEqual(
np.array(array_v[0][0]).shape, (batch_size / device_num, 784))
self.assertEqual(
np.array(array_v[0][1]).shape, (batch_size / device_num, 1))
self.assertEqual(np.array(array_v[0][2]).shape, (1, ))
for _ in range(3):
loss_v, array_v = exe.run(binary,
feed=feed_dict,
fetch_list=[loss, array],
return_merged=True)
self.assertEqual(np.array(loss_v).shape, (device_num, ))
self.assertEqual(np.array(array_v[0]).shape, (batch_size, 784))
self.assertEqual(np.array(array_v[1]).shape, (batch_size, 1))
self.assertTrue(np.allclose(loss_v, array_v[2]))
def test_fetch_lod_tensor_array(self):
if fluid.core.is_compiled_with_cuda():
self.check_network(use_cuda=True)
self.check_network(use_cuda=False)
def test_fetch_unmerged_parallel_graph(self):
fluid.core.globals()['FLAGS_enable_parallel_graph'] = True
if fluid.core.is_compiled_with_cuda():
self.check_network(use_cuda=True)
self.check_network(use_cuda=False)
fluid.core.globals()['FLAGS_enable_parallel_graph'] = False
if __name__ == '__main__':
unittest.main()