Unverified · Commit b5af9575 authored by wanghuancoder, committed by GitHub

fix some bug in new executor (#36822)

* fix some bug in new executor, test=develop

* fix error message, test=develop
Parent be55bac3
@@ -79,12 +79,13 @@ void InterpreterCore::AddFetch(const std::vector<std::string>& fetch_names) {
 }
 
 paddle::framework::FetchList InterpreterCore::Run(
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto FeedInput = [&] {
     for (size_t i = 0; i < feed_names_.size(); ++i) {
       auto* feed_var = global_scope_->Var(feed_names_[i]);
       auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
+      feed_tensor->set_lod(feed_tensors[i].lod());
     }
   };
@@ -495,7 +496,7 @@ void InterpreterCore::CheckGC(const Instruction& instr) {
 }
 
 void InterpreterCore::DryRunPrepare(
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto FeedInput = [&] {
     for (size_t i = 0; i < feed_names_.size(); ++i) {
       auto* feed_var = global_scope_->FindVar(feed_names_[i]);
@@ -504,6 +505,7 @@ void InterpreterCore::DryRunPrepare(
       auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
+      feed_tensor->set_lod(feed_tensors[i].lod());
     }
   };
@@ -525,7 +527,7 @@ void InterpreterCore::DryRunPrepare(
 }
 
 const CostInfo& InterpreterCore::DryRun(
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   DryRunPrepare(feed_tensors);
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
...
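What the interpretercore.cc hunks amount to: feeds are now typed as LoDTensor, and because ShareDataWith only shares the underlying allocation, the LoD (nested sequence-offset metadata) has to be carried over explicitly, which is what the added set_lod calls in Run and DryRunPrepare do. A minimal sketch of that feed semantics; the helper name FeedOne is hypothetical, the framework types are the ones used in the diff:

#include <string>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"

// Hypothetical helper mirroring the FeedInput lambdas above: share the
// buffer, then copy the LoD, since ShareDataWith alone does not carry it.
void FeedOne(paddle::framework::Scope* scope, const std::string& name,
             const paddle::framework::LoDTensor& src) {
  auto* dst = scope->Var(name)->GetMutable<paddle::framework::LoDTensor>();
  dst->ShareDataWith(src);  // shares the allocation, not the LoD
  dst->set_lod(src.lod());  // without this, sequence structure is dropped
}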
@@ -46,9 +46,9 @@ class InterpreterCore {
                 const std::vector<std::string>& fetch_names);
 
   paddle::framework::FetchList Run(
-      const std::vector<framework::Tensor>& feed_tensors);
+      const std::vector<framework::LoDTensor>& feed_tensors);
 
-  const CostInfo& DryRun(const std::vector<framework::Tensor>& feed_tensors);
+  const CostInfo& DryRun(const std::vector<framework::LoDTensor>& feed_tensors);
 
  private:
   void Convert();
@@ -65,7 +65,7 @@ class InterpreterCore {
   void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
 
-  void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
+  void DryRunPrepare(const std::vector<framework::LoDTensor>& feed_tensors);
 
   void CheckGC(const Instruction& instr);
...
@@ -287,7 +287,7 @@ void build_op_func_list(const platform::Place& place,
         for (size_t i = 0; i < var_name_item.second.size(); ++i) {
           auto var = var_name_item.second[i];
           auto& var_name = inputs_names[var_name_item.first].at(i);
-          auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
+          auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
           if (!tensor_in->IsInitialized()) {
             continue;
           }
@@ -296,7 +296,9 @@ void build_op_func_list(const platform::Place& place,
                   ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
                                         expected_kernel_key);
           if (platform::is_same_place(kernel_type_for_var.place_,
-                                      expected_kernel_key.place_)) {
+                                      expected_kernel_key.place_) ||
+              (is_cuda_pinned_place(kernel_type_for_var.place_) &&
+               is_cpu_place(expected_kernel_key.place_))) {
             // record no need data transformer input var_id
             VLOG(3) << op->Type() << " found no data_transform var: " << var_name
                     << " with id: " << var_name;
...
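The second hunk above widens the "no data transform needed" test: a tensor living in CUDA pinned (page-locked host) memory is directly addressable by CPU kernels, so inserting a transform copy for it would be wasted work. Condensed into a predicate, using the same platform helpers the diff calls (the function name is hypothetical):

#include "paddle/fluid/platform/place.h"

// Sketch: no data transform when places match exactly, or when the input
// sits in CUDA pinned memory and the kernel expects plain CPU memory
// (pinned memory is host-visible, so the CPU kernel can read it as-is).
bool NeedsNoTransform(const paddle::platform::Place& var_place,
                      const paddle::platform::Place& kernel_place) {
  return paddle::platform::is_same_place(var_place, kernel_place) ||
         (paddle::platform::is_cuda_pinned_place(var_place) &&
          paddle::platform::is_cpu_place(kernel_place));
}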
@@ -47,7 +47,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
 
 paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::Tensor>& feed_tensors,
+    const std::vector<framework::LoDTensor>& feed_tensors,
     const std::vector<std::string>& fetch_names) {
   auto core = GetInterpreterCore(feed_names, fetch_names);
@@ -56,7 +56,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
 
 const CostInfo& StandaloneExecutor::DryRun(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto core = GetInterpreterCore(feed_names, {});
 
   auto& cost_info = core->DryRun(feed_tensors);
...
@@ -28,7 +28,7 @@ class ExecutorBase {
   virtual ~ExecutorBase() {}
 
   virtual paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
-      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<framework::LoDTensor>& feed_tensors,
       const std::vector<std::string>& fetch_names) = 0;
 };
@@ -42,11 +42,11 @@ class StandaloneExecutor : public ExecutorBase {
   virtual paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
-      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<framework::LoDTensor>& feed_tensors,
       const std::vector<std::string>& fetch_names);
 
   const CostInfo& DryRun(const std::vector<std::string>& feed_names,
-                         const std::vector<framework::Tensor>& feed_tensors);
+                         const std::vector<framework::LoDTensor>& feed_tensors);
 
  private:
   void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
...
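For orientation, a hedged usage sketch of the updated executor surface; the constructor argument list is inferred from this header and the pybind bindings further down, so treat the spellings as assumptions rather than the exact API:

// Assumes `place`, `startup_desc`, `main_desc`, and `scope` were built
// elsewhere; the point is only that feed lists are now LoDTensor vectors.
std::vector<paddle::framework::LoDTensor> feeds(1);
// ... fill feeds[0], including its LoD ...
paddle::framework::StandaloneExecutor exec(place, startup_desc, main_desc,
                                           scope);
auto fetch_list = exec.Run({"x"}, feeds, {"out"});
const auto& cost = exec.DryRun({"x"}, feeds);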
@@ -128,9 +128,12 @@ class FetchV2Kernel {
     if (fetch_var->IsType<framework::LoDTensor>()) {
       auto &src_item = fetch_var->Get<framework::LoDTensor>();
       auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
-      PADDLE_ENFORCE_EQ(platform::is_cpu_place(src_item.place()), true,
-                        platform::errors::InvalidArgument(
-                            "Tensor's place of input(X) must be CPUPlace."));
+      bool check_place = platform::is_cpu_place(src_item.place()) ||
+                         platform::is_cuda_pinned_place(src_item.place());
+      PADDLE_ENFORCE_EQ(
+          check_place, true,
+          platform::errors::InvalidArgument("Tensor's place of input(X) must "
+                                            "be CPUPlace or CUDAPinnedPlace."));
       if (deepcopy) {
         DeepCopy(src_item, fetch_var_name, dst_item);
       } else {
@@ -188,8 +191,11 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(fetch_v2, float, ops::FetchV2Kernel, double,
-                               ops::FetchV2Kernel, int, ops::FetchV2Kernel,
-                               int64_t, ops::FetchV2Kernel, bool,
-                               ops::FetchV2Kernel, plat::float16,
-                               ops::FetchV2Kernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    fetch_v2, float, ops::FetchV2Kernel, double, ops::FetchV2Kernel, int8_t,
+    ops::FetchV2Kernel, uint8_t, ops::FetchV2Kernel, int, ops::FetchV2Kernel,
+    int64_t, ops::FetchV2Kernel, bool, ops::FetchV2Kernel,
+    paddle::platform::bfloat16, ops::FetchV2Kernel,
+    paddle::platform::complex<float>, ops::FetchV2Kernel,
+    paddle::platform::complex<double>, ops::FetchV2Kernel, plat::float16,
+    ops::FetchV2Kernel, int16_t, ops::FetchV2Kernel);
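This is the fetch-side counterpart of the pinned-memory change: a result staged in CUDAPinnedPlace is CPU-addressable, so the kernel's place guard now admits it alongside CPUPlace, and the dtype coverage grows to match the memcpy ops below. The relaxed guard reduces to a predicate like this (hypothetical name, same platform helpers):

#include "paddle/fluid/platform/place.h"

// Hypothetical restatement of the relaxed FetchV2Kernel guard: plain CPU
// memory and CUDA pinned host memory are both valid sources for a fetch.
bool IsFetchablePlace(const paddle::platform::Place& p) {
  return paddle::platform::is_cpu_place(p) ||
         paddle::platform::is_cuda_pinned_place(p);
}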
@@ -125,24 +125,33 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                               ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                               int64_t, ops::MemcpyD2HKernel, bool,
-                               ops::MemcpyD2HKernel, plat::float16,
-                               ops::MemcpyD2HKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                                ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                                int64_t, ops::MemcpyD2HKernel, bool,
-                                ops::MemcpyD2HKernel, plat::float16,
-                                ops::MemcpyD2HKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                               ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                               int64_t, ops::MemcpyD2HKernel, bool,
-                               ops::MemcpyD2HKernel, plat::float16,
-                               ops::MemcpyD2HKernel);
+REGISTER_OP_NPU_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 #endif
@@ -125,24 +125,33 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                               ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                               int64_t, ops::MemcpyH2DKernel, bool,
-                               ops::MemcpyH2DKernel, plat::float16,
-                               ops::MemcpyH2DKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                                ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                                int64_t, ops::MemcpyH2DKernel, bool,
-                                ops::MemcpyH2DKernel, plat::float16,
-                                ops::MemcpyH2DKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                               ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                               int64_t, ops::MemcpyH2DKernel, bool,
-                               ops::MemcpyH2DKernel, plat::float16,
-                               ops::MemcpyH2DKernel);
+REGISTER_OP_NPU_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 #endif
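memcpy_h2d gets the same dtype widening as memcpy_d2h across the CPU, CUDA, and NPU registrations. The kernel functors are dtype-agnostic (the copy is assumed here to bottom out in framework::TensorCopy, which dispatches on the tensor's dtype internally), so supporting int8_t, uint8_t, int16_t, bfloat16, and the complex types is purely a registration change. A sketch of the device-to-host direction under that assumption:

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

// Hedged sketch: TensorCopy handles any registered dtype, which is why the
// widened lists above touch only the registration macros, not copy logic.
void CopyToHost(const paddle::framework::LoDTensor& src,
                paddle::framework::LoDTensor* dst) {
  paddle::framework::TensorCopy(src, paddle::platform::CPUPlace(), dst);
  dst->set_lod(src.lod());
}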
@@ -2046,7 +2046,7 @@ All parameter, weight, gradient are variables in Paddle.
            [](StandaloneExecutor &self,
               const std::unordered_map<std::string, py::array> &input_dict,
               std::vector<std::string> fetch_names) {
-             std::vector<framework::Tensor> feed_tensors;
+             std::vector<framework::LoDTensor> feed_tensors;
              std::vector<std::string> feed_names;
 
              for (auto &item : input_dict) {
@@ -2066,10 +2066,10 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("run",
            [](StandaloneExecutor &self,
-              const std::unordered_map<std::string, framework::Tensor>
+              const std::unordered_map<std::string, framework::LoDTensor>
                   &input_dict,
               std::vector<std::string> fetch_names) {
-             std::vector<framework::Tensor> feed_tensors;
+             std::vector<framework::LoDTensor> feed_tensors;
              std::vector<std::string> feed_names;
 
              for (auto &item : input_dict) {
@@ -2087,7 +2087,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("dry_run",
            [](StandaloneExecutor &self,
               const std::unordered_map<std::string, py::array> &input_dict) {
-             std::vector<framework::Tensor> feed_tensors;
+             std::vector<framework::LoDTensor> feed_tensors;
              std::vector<std::string> feed_names;
 
              for (auto &item : input_dict) {
...
@@ -485,10 +485,11 @@ handler = FetchHandlerExample(var_dict=var_dict)
 
 class _StandaloneExecutor(object):
-    def __init__(self, place, main_program):
+    def __init__(self, place, main_program, scope):
         self._place = core.Place()
         self._place.set_place(place)
         self._main_program = main_program
+        self._scope = scope
         self._new_exe = self._create_new_executor()
 
     def run(self, feed, fetch_list, return_numpy=True):
@@ -522,9 +523,8 @@ class _StandaloneExecutor(object):
     def _create_new_executor(self):
         # NOTE: It's a trick to set empty start_up program.
         startup_program = Program()
-        outer_scope = global_scope()
         new_exe = core.StandaloneExecutor(self._place, startup_program.desc,
-                                          self._main_program.desc, outer_scope)
+                                          self._main_program.desc, self._scope)
 
         return new_exe
@@ -585,11 +585,11 @@ class _ExecutorCache(object):
         self._place = place
         self._cached_executors = {}
 
-    def run(self, program, feed, fetch_list, return_numpy=True):
-        new_exe = self._get_exe_from_cache(program)
+    def run(self, program, scope, feed, fetch_list, return_numpy=True):
+        new_exe = self._get_exe_from_cache(program, scope)
         return new_exe.run(feed, fetch_list, return_numpy)
 
-    def _get_exe_from_cache(self, program):
+    def _get_exe_from_cache(self, program, scope):
         """
         Return cached _StandaloneExecutor instance. If not found, create associated
         _StandaloneExecutor instance with given program and cache it.
@@ -598,7 +598,7 @@ class _ExecutorCache(object):
             program, Program), "Required type(Program), but received {}".format(
                 type(program).__name__)
         if program not in self._cached_executors:
-            new_exe = _StandaloneExecutor(self._place, program)
+            new_exe = _StandaloneExecutor(self._place, program, scope)
             self._cached_executors[program] = new_exe
 
         return self._cached_executors[program]
@@ -1297,7 +1297,7 @@ class Executor(object):
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `,
         # use StandaloneExecutor to run the program.
         if self._enable_interpreter_core and not program._is_start_up_program_:
-            return self._executor_cache.run(program, feed, fetch_list,
+            return self._executor_cache.run(program, scope, feed, fetch_list,
                                             return_numpy)
 
         # use_prune can be overrided by putting optimize_ops in fetch_list
...