operator.cc 49.2 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Q
Qiao Longfei 已提交
2 3 4 5 6 7 8 9 10 11 12 13

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
D
dzhwinter 已提交
14

15 16
#include "paddle/fluid/framework/operator.h"

17 18
#include <gflags/gflags.h>
#include <glog/logging.h>
19

20
#include <algorithm>
P
peizhilin 已提交
21 22
#include <sstream>
#include <string>
S
sneaxiy 已提交
23
#include <unordered_set>
P
peizhilin 已提交
24
#include <vector>
25

Y
Yi Wang 已提交
26
#include "paddle/fluid/framework/data_transform.h"
W
WangXi 已提交
27
#include "paddle/fluid/framework/details/nan_inf_utils.h"
Y
Yi Wang 已提交
28
#include "paddle/fluid/framework/executor.h"
29
#include "paddle/fluid/framework/lod_tensor.h"
30
#include "paddle/fluid/framework/op_call_stack.h"
31
#include "paddle/fluid/framework/op_proto_maker.h"
Y
Yi Wang 已提交
32
#include "paddle/fluid/framework/shape_inference.h"
33
#include "paddle/fluid/framework/transfer_scope_cache.h"
34
#include "paddle/fluid/framework/unused_var_check.h"
Y
Yi Wang 已提交
35
#include "paddle/fluid/framework/var_type.h"
36
#include "paddle/fluid/platform/profiler.h"
37 38 39
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
Q
Qiao Longfei 已提交
40

41 42 43 44
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

D
dzhwinter 已提交
45
DECLARE_bool(benchmark);
46
DECLARE_bool(check_nan_inf);
47
DECLARE_bool(enable_unused_var_check);
Q
Qiao Longfei 已提交
48
DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
P
pkpk 已提交
49 50 51
DEFINE_bool(fast_check_nan_inf, false,
            "Fast checking NAN/INF after each operation. It will be a little"
            "bit slow, much faster than check_nan_inf");
D
dzhwinter 已提交
52

Q
Qiao Longfei 已提交
53 54 55
namespace paddle {
namespace framework {

56 57 58 59 60 61
// Global kernel-selection priority list: (place, library) pairs ordered from
// most to least preferred when choosing an op kernel implementation.
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
    std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN),
    std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
D
dzhwinter 已提交
62

63 64
// Returns the dims of variable `name` in `scope` for debug printing.
// For SelectedRows, `get_actual_dim` selects the value tensor's dims over
// GetCompleteDims(). Missing or unsupported variables yield DDim({-1}).
static DDim GetDimsDebug(const Scope& scope, const std::string& name,
                         bool get_actual_dim = false) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) return DDim({-1});

  if (var->IsType<LoDTensor>()) {
    return var->Get<LoDTensor>().dims();
  }
  if (var->IsType<SelectedRows>()) {
    const auto& selected_rows = var->Get<SelectedRows>();
    return get_actual_dim ? selected_rows.value().dims()
                          : selected_rows.GetCompleteDims();
  }
  return DDim({-1});
}

Q
Qiao Longfei 已提交
84 85 86 87 88 89
static bool VarInited(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) return false;
  return var->IsInitialized();
}

D
dzhwinter 已提交
90 91 92 93 94
// Returns a human-readable dtype string for variable `name` in `scope`;
// used only when building operator debug strings.
static std::string GetDtype(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
    // Variable not present in this scope.
    return "";
  }

  if (var->IsType<LoDTensor>()) {
    const LoDTensor& tensor = var->Get<LoDTensor>();
    if (UNLIKELY(!tensor.IsInitialized())) {
      return "";
    }
    return DataTypeToString(tensor.type());
  } else if (var->IsType<SelectedRows>()) {
    // NOTE(review): binds by value; consider `const auto&` to avoid a copy.
    auto tensor = var->Get<SelectedRows>().value();
    if (UNLIKELY(!tensor.IsInitialized())) {
      // NOTE(review): the LoDTensor branch returns "" for an uninitialized
      // tensor while this branch returns "uninited" -- consider unifying.
      return "uninited";
    } else {
      return DataTypeToString(tensor.type());
    }
  } else {
    // Neither LoDTensor nor SelectedRows: no dtype to report.
    return "";
  }
}

114 115 116 117 118 119
static int GetRowSize(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
    return -1;
  }

M
minqiyang 已提交
120 121
  if (var->IsType<SelectedRows>()) {
    return var->Get<SelectedRows>().rows().size();
122 123 124 125 126
  }

  return -1;
}

127
// LoD of variable `name` for debug printing; missing or non-LoDTensor
// variables yield an empty single-level LoD.
static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
  const Variable* var = scope.FindVar(name);
  if (var != nullptr && var->IsType<LoDTensor>()) {
    return var->Get<LoDTensor>().lod();
  }
  return LoD({{}});
}

X
Xin Pan 已提交
143 144 145 146 147
// Resolves every input/output variable name to its Variable* in `scope`,
// preserving the per-slot ordering of the name maps. Unresolved names are
// stored as nullptr (FindVar's result is taken as-is).
RuntimeContext::RuntimeContext(const VariableNameMap& innames,
                               const VariableNameMap& outnames,
                               const Scope& scope) {
  for (const auto& name_pair : innames) {
    std::vector<Variable*>& input_vars = inputs[name_pair.first];
    input_vars.reserve(name_pair.second.size());
    for (const auto& var_name : name_pair.second) {
      input_vars.push_back(scope.FindVar(var_name));
    }
  }
  for (const auto& name_pair : outnames) {
    std::vector<Variable*>& output_vars = outputs[name_pair.first];
    output_vars.reserve(name_pair.second.size());
    for (const auto& var_name : name_pair.second) {
      output_vars.push_back(scope.FindVar(var_name));
    }
  }
}

162
// Entry point for running an operator on a given place: binds the device,
// profiles the run, executes RunImpl(), and translates/annotates exceptions.
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  try {
    VLOG(4) << place << " " << DebugStringEx(&scope);
    if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
      // Built without CUDA: a GPU place cannot be serviced.
      PADDLE_THROW("Cannot run operator on place %s", place);
#else
      // Make the requested GPU current for this thread before running.
      auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
      platform::SetDeviceId(dev_id);
#endif
    } else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
      PADDLE_THROW(platform::errors::Unimplemented(
          "Cannot run operator on place %s", place));
#else
      auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
      platform::SetXPUDeviceId(dev_id);
#endif
    }

    {
      // TODO(wangchaochaohu) : refine code to use only one RecordEvent)
      // in order to record different op type cost time
      // and different op name cost time,we set two event.
      platform::RecordEvent op_type_record_event(Type());
      auto op_name = platform::OpName(outputs_, Type());
      platform::RecordEvent op_name_record_event(
          op_name, platform::EventRole::kUniqueOp);
      RunImpl(scope, place);
    }

    VLOG(3) << GetExecutionPlace(place) << " " << DebugStringEx(&scope);
  } catch (platform::EnforceNotMet& exception) {
    // Attach the op's call-stack info before rethrowing enforce failures.
    framework::InsertCallStackInfo(Type(), Attrs(), &exception);
    throw std::move(exception);
  } catch (platform::EOFException&) {
    // EOF is an expected control signal (e.g. data feeding); pass it through
    // without logging.
    std::rethrow_exception(std::current_exception());
  } catch (std::exception& ex) {
    // Log the concrete exception type and message, then rethrow unchanged.
    LOG(WARNING) << Type() << " raises an exception "
                 << platform::demangle(typeid(ex).name()) << ", " << ex.what();
    std::rethrow_exception(std::current_exception());
  } catch (...) {
    LOG(WARNING) << Type() << " raises an unknown exception";
    std::rethrow_exception(std::current_exception());
  }
}

209
bool OperatorBase::HasInputs(const std::string& name) const {
M
minqiyang 已提交
210
  return inputs_.find(name) != inputs_.end();
211 212
}

213
std::string OperatorBase::Input(const std::string& name) const {
Y
Yu Yang 已提交
214
  auto& ins = Inputs(name);
215 216 217 218 219
  PADDLE_ENFORCE_LE(
      ins.size(), 1UL,
      platform::errors::AlreadyExists(
          "Operator %s's input %s should contain only one variable.", type_,
          name));
Y
Yu Yang 已提交
220
  return ins.empty() ? kEmptyVarName : ins[0];
Y
Yan Chunwei 已提交
221 222
}

Y
Yu Yang 已提交
223 224
const std::vector<std::string>& OperatorBase::Inputs(
    const std::string& name) const {
Y
Yu Yang 已提交
225
  auto it = inputs_.find(name);
226 227
  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
                 type_, name);
Y
Yu Yang 已提交
228
  return it->second;
Y
Yan Chunwei 已提交
229 230
}

231
bool OperatorBase::HasOutputs(const std::string& name) const {
232
  if (outputs_.find(name) != outputs_.end()) {
233 234 235 236 237 238
    return true;
  } else {
    return false;
  }
}

239
std::string OperatorBase::Output(const std::string& name) const {
Y
Yu Yang 已提交
240
  auto& outs = Outputs(name);
Y
Yu Yang 已提交
241
  PADDLE_ENFORCE_LE(outs.size(), 1UL,
242 243
                    "Operator %s's output %s should contain only one variable.",
                    type_, name);
Y
Yu Yang 已提交
244
  return outs.empty() ? kEmptyVarName : outs[0];
Y
Yan Chunwei 已提交
245 246
}

Y
Yu Yang 已提交
247 248
const std::vector<std::string>& OperatorBase::Outputs(
    const std::string& name) const {
Y
Yu Yang 已提交
249
  auto it = outputs_.find(name);
250 251
  PADDLE_ENFORCE(it != outputs_.end(),
                 "Operator %s does not have an output called %s.", type_, name);
Y
Yu Yang 已提交
252
  return it->second;
Y
Yan Chunwei 已提交
253 254
}

255
// Builds a one-line description of the operator and (when `scope` is
// non-null) per-variable debug info: init state, row size, dtype, dims and
// LoD. Inputs flagged as "no need buffer" report "unknown_dtype" since their
// contents may have been released.
std::string OperatorBase::DebugStringEx(const Scope* scope) const {
  std::stringstream ss;
  ss << "Op(" << type_ << "), inputs:{";

  // Collect the set of inputs whose buffers this op does not need, if the
  // op registered such an inferer; nullptr means "none".
  const std::unordered_set<std::string>* no_need_buffer_vars = nullptr;
  if (info_ && info_->NoNeedBufferVarsInferer()) {
    no_need_buffer_vars =
        &(Info().NoNeedBufferVarsInferer()(Inputs(), Outputs(), Attrs()));
    if (no_need_buffer_vars->empty()) no_need_buffer_vars = nullptr;
  }

  // Explicit-iterator loop so a ", " separator is emitted between slots but
  // not after the last one.
  for (auto it = inputs_.begin(); it != inputs_.end();) {
    auto& input = *it;
    bool is_no_need_buffer_var =
        (no_need_buffer_vars && no_need_buffer_vars->count(input.first) > 0);
    ss << input.first << "[";
    for (size_t i = 0; i < input.second.size(); ++i) {
      auto var_name = input.second[i];
      ss << var_name;
      if (scope) {
        if (!VarInited(*scope, var_name)) {
          ss << "[uninited]";
        } else {
          // row_size >= 0 only for SelectedRows variables.
          int row_size = GetRowSize(*scope, var_name);
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
          // No-need-buffer inputs may already be freed; don't touch their
          // data to compute the dtype.
          std::string dtype = is_no_need_buffer_var
                                  ? "unknown_dtype"
                                  : GetDtype(*scope, var_name);
          ss << ":" << dtype;
          ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
          ss << "(" << GetLoDDebug(*scope, var_name) << ")";
        }
      }
      if (i != input.second.size() - 1) {
        ss << ", ";
      }
    }
    ss << "]";
    ++it;
    if (it != inputs_.end()) {
      ss << ", ";
    }
  }
  ss << "}, outputs:{";
  // Same formatting for outputs, minus the no-need-buffer handling (it only
  // applies to inputs).
  for (auto it = outputs_.begin(); it != outputs_.end();) {
    auto& output = *it;
    ss << output.first << "[";
    for (size_t i = 0; i < output.second.size(); ++i) {
      auto var_name = output.second[i];
      ss << var_name;
      if (scope) {
        if (!VarInited(*scope, var_name)) {
          ss << "[uninited]";
        } else {
          int row_size = GetRowSize(*scope, output.second[i]);
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
          std::string dtype = GetDtype(*scope, output.second[i]);
          ss << ":" << dtype;
          ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
          ss << "(" << GetLoDDebug(*scope, var_name) << ")";
        }
      }
      if (i != output.second.size() - 1) {
        ss << ", ";
      }
    }
    ss << "]";
    ++it;
    if (it != outputs_.end()) {
      ss << ", ";
    }
  }
  ss << "}.";
  return ss.str();
}

Y
Yu Yang 已提交
335
// Constructs an operator from its type string, input/output variable-name
// maps and attribute map. info_ is fetched with GetNullable, so it may be
// nullptr for unregistered types.
OperatorBase::OperatorBase(const std::string& type,
                           const VariableNameMap& inputs,
                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
    : type_(type),
      inputs_(inputs),
      outputs_(outputs),
      attrs_(attrs),
      // NOTE(zjl): why op_info may be nullptr?
      info_(OpInfoMap::Instance().GetNullable(type)) {
  // In dygraph mode, all the OperatorBase will be constructed by function:
  // framework::OpRegistry::CreateOp(type, {}, {}, {}, false).
  // Inputs, outputs and attrs will be set to empty map
  // to improve the execution efficiency of dygraph.
  if (inputs_.size() > 0 || outputs_.size() > 0) {
    GenerateTemporaryNames();
    CheckAllInputOutputSet();
  }
}
354

Q
qijun 已提交
355 356
std::vector<std::string> OperatorBase::InputVars() const {
  std::vector<std::string> ret_val;
Y
Yu Yang 已提交
357
  for (auto& o : inputs_) {
Q
qijun 已提交
358 359 360 361 362 363
    ret_val.reserve(ret_val.size() + o.second.size());
    ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
  }
  return ret_val;
}

Y
Yu Yang 已提交
364 365 366 367 368 369 370 371 372 373
std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
  std::vector<std::string> ret_val;
  if (has_intermediate) {
    // push all outputs into ret_val
    for (auto& o : outputs_) {
      ret_val.reserve(ret_val.size() + o.second.size());
      ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
    }
    return ret_val;
  }
S
sneaxiy 已提交
374
  auto& info = Info();
Y
Yu Yang 已提交
375 376

  // get all OpProto::Var for outputs
Y
Yu Yang 已提交
377
  for (auto& o : info.Proto().outputs()) {
Y
Yu Yang 已提交
378 379 380 381 382 383 384 385 386
    // ignore all intermediate output
    if (o.intermediate()) continue;
    auto out = outputs_.find(o.name());
    if (out != outputs_.end()) {
      ret_val.reserve(ret_val.size() + out->second.size());
      ret_val.insert(ret_val.end(), out->second.begin(), out->second.end());
    }
  }
  return ret_val;
D
dongzhihong 已提交
387 388
}

389
void OperatorBase::CheckAllInputOutputSet() const {
S
sneaxiy 已提交
390
  if (info_ == nullptr || info_->proto_ == nullptr) return;
391

S
sneaxiy 已提交
392
  for (auto& in : info_->Proto().inputs()) {
393 394 395 396
    if (!in.dispensable()) {
      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
                     "Operator %s's input, %s, is not set", Type(), in.name());
    }
397 398
  }

S
sneaxiy 已提交
399
  for (auto& out : info_->Proto().outputs()) {
400 401 402 403 404
    if (!out.dispensable()) {
      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
                     "Operator %s's output, %s, is not set", Type(),
                     out.name());
    }
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420
  }
}

// Rewrites every placeholder output name (kTempVarName) into a unique name
// of the form "<temp><type>@<id>"; the counter is process-wide and atomic.
void OperatorBase::GenerateTemporaryNames() {
  static std::atomic<size_t> gUniqId(0UL);
  for (auto& output : outputs_) {
    for (auto& output_name : output.second) {
      if (output_name != kTempVarName) continue;
      output_name += type_;
      output_name += "@";
      output_name += std::to_string(gUniqId.fetch_add(1));
    }
  }
}

B
baojun-nervana 已提交
421
// A variable counts as a "tensor" here iff it holds a LoDTensor or a
// SelectedRows.
static bool VarIsTensor(const Variable& var) {
  return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
}

C
chengduo 已提交
425
// Returns the underlying const Tensor of a LoDTensor variable, or the value
// tensor of a SelectedRows variable; throws for any other variable type.
const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
  if (var.IsType<LoDTensor>()) {
    return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
  }
  if (var.IsType<SelectedRows>()) {
    return &(var.Get<SelectedRows>().value());
  }
  PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
               ToTypeName(var.Type()));
}

C
chengduo 已提交
436
// Mutable counterpart of GetLoDTensorOrSelectedRowsValueFromVar: returns the
// mutable LoDTensor, or the mutable value tensor of a SelectedRows; throws
// for any other variable type.
Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
  if (var->IsType<LoDTensor>()) {
    return var->GetMutable<LoDTensor>();
  }
  if (var->IsType<SelectedRows>()) {
    return var->GetMutable<SelectedRows>()->mutable_value();
  }
  PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
               ToTypeName(var->Type()));
}

447
bool ExecutionContext::HasInput(const std::string& name) const {
448
  auto* var = InputVar(name);
449 450 451 452
  return var != nullptr;
}

bool ExecutionContext::HasOutput(const std::string& name) const {
453
  auto* var = OutputVar(name);
454 455 456
  return var != nullptr;
}

X
Xin Pan 已提交
457
// Resolves input slot `name` to its single Variable*; nullptr when the slot
// is absent or empty. Records the access for unused-variable checking.
const Variable* ExecutionContext::InputVar(const std::string& name) const {
  LogVarUsageIfUnusedVarCheckEnabled(name);

  auto iter = ctx_.inputs.find(name);
  if (iter == ctx_.inputs.end()) return nullptr;

  PADDLE_ENFORCE_LE(
      iter->second.size(), 1UL,
      platform::errors::AlreadyExists(
          "Operator %s's input %s should contain only one variable.",
          op_.Type(), name));
  return iter->second.empty() ? nullptr : iter->second.front();
}

X
clean  
Xin Pan 已提交
471
// Resolves output slot `name` to its single Variable*; nullptr when the slot
// is absent or empty.
Variable* ExecutionContext::OutputVar(const std::string& name) const {
  auto it = ctx_.outputs.find(name);
  if (it == ctx_.outputs.end()) return nullptr;

  // Use the platform::errors wrapper for consistency with InputVar() above
  // (the old plain-format message predates the structured error types).
  PADDLE_ENFORCE_LE(
      it->second.size(), 1UL,
      platform::errors::AlreadyExists(
          "Operator %s's output %s should contain only one variable.",
          op_.Type(), name));
  return it->second.empty() ? nullptr : it->second[0];
}

481
// Specialization: a plain Tensor input is fetched via the LoDTensor path.
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  return Input<LoDTensor>(name);
}

template <>
487
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
488
    const std::string& name) const {
489 490
  LogVarUsageIfUnusedVarCheckEnabled(name);

H
hong 已提交
491 492
  auto vars = MultiInputVar(name);
  if (vars.size() == 0) {
X
Xin Pan 已提交
493 494 495 496 497
    return {};
  }
  std::vector<const Tensor*> res;
  res.reserve(vars.size());
  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
H
hong 已提交
498
                 [&](const Variable* var) -> const Tensor* {
X
Xin Pan 已提交
499 500 501 502
                   if (var == nullptr) return nullptr;
                   PADDLE_ENFORCE(
                       var->IsType<LoDTensor>(),
                       "should be LoDTensor, but the received type is %s",
S
sneaxiy 已提交
503
                       ToTypeName(var->Type()));
X
Xin Pan 已提交
504 505 506 507 508
                   return &(var->Get<LoDTensor>());
                 });
  return res;
}

509
// Specialization: a plain Tensor output is fetched via the LoDTensor path.
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
  return Output<LoDTensor>(name);
}

template <>
515
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
516
    const std::string& name) const {
H
hong 已提交
517 518 519
  auto vars = MultiOutputVar(name);

  if (vars.size() == 0) {
520 521
    return {};
  }
522
  std::vector<Tensor*> res;
523 524 525 526 527
  res.reserve(vars.size());
  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
                 [&](Variable* var) -> Tensor* {
                   return var == nullptr ? nullptr
                                         : var->GetMutable<LoDTensor>();
528
                 });
529 530 531
  return res;
}

Y
Yu Yang 已提交
532 533 534 535 536 537 538 539 540 541 542 543 544 545 546
// Whether any registered kernel of `op_type` targets a GPU place. Op types
// with no registered kernels (control ops) are reported as GPU-capable.
bool OpSupportGPU(const std::string& op_type) {
  auto& all_kernels = OperatorWithKernel::AllOpKernels();
  auto it = all_kernels.find(op_type);
  if (it == all_kernels.end()) {
    // All control operator must support GPU
    return true;
  }
  return std::any_of(it->second.begin(), it->second.end(),
                     [](const auto& kern_pair) {
                       return platform::is_gpu_place(kern_pair.first.place_);
                     });
}

547 548
class RuntimeInferShapeContext : public InferShapeContext {
 public:
549
  // Shape-inference context backed by the already-resolved runtime variables
  // in `ctx`; used when InferShape runs during execution (IsRuntime() true).
  RuntimeInferShapeContext(const OperatorBase& op, const RuntimeContext& ctx)
      : op_(op), ctx_(ctx) {}
551 552

  bool HasInput(const std::string& name) const override {
553
    // has only one input
X
Xin Pan 已提交
554
    const auto& ins = ctx_.inputs;
555 556
    auto it = ins.find(name);
    if (it == ins.end()) {
557 558
      return false;
    }
559
    const auto& in = it->second;
X
Xin Pan 已提交
560
    if (in.size() == 0) return false;
T
tensor-tang 已提交
561
    PADDLE_ENFORCE_EQ(in.size(), 1UL,
F
fengjiayi 已提交
562
                      "Input %s should not have more than one inputs", name);
X
Xin Pan 已提交
563
    return in[0] != nullptr;
564 565 566
  }

  bool HasOutput(const std::string& name) const override {
567
    // has only one output
X
Xin Pan 已提交
568
    const auto& outs = ctx_.outputs;
569 570
    auto it = outs.find(name);
    if (it == outs.end()) {
571 572
      return false;
    }
573
    const auto& out = it->second;
X
Xin Pan 已提交
574
    if (out.size() == 0) {
575 576
      return false;
    }
T
tensor-tang 已提交
577 578
    PADDLE_ENFORCE_EQ(out.size(), 1UL,
                      "Output %s should not have more than one outputs", name);
X
Xin Pan 已提交
579
    return out[0] != nullptr;
580 581 582
  }

  bool HasInputs(const std::string& name) const override {
X
Xin Pan 已提交
583 584
    const auto& ins = ctx_.inputs;
    auto it = ins.find(name);
X
fix  
Xin Pan 已提交
585
    if (it == ins.end() || it->second.empty()) {
586 587
      return false;
    }
X
Xin Pan 已提交
588 589
    for (auto& input : it->second) {
      if (input == nullptr) {
590 591 592 593 594 595 596
        return false;
      }
    }
    return true;
  }

  bool HasOutputs(const std::string& name) const override {
X
Xin Pan 已提交
597 598
    const auto& outs = ctx_.outputs;
    auto it = outs.find(name);
X
fix  
Xin Pan 已提交
599
    if (it == outs.end() || it->second.empty()) {
600 601
      return false;
    }
X
Xin Pan 已提交
602 603
    for (auto& output : it->second) {
      if (output == nullptr) {
604 605 606 607 608 609 610 611
        return false;
      }
    }
    return true;
  }

  // Read-only view over the operator's attribute map.
  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }

H
hong 已提交
612
  // Variable names bound to input slot `name` (delegates to the op).
  std::vector<std::string> Inputs(const std::string& name) const override {
    return op_.Inputs(name);
  }

H
hong 已提交
616
  // Variable names bound to output slot `name` (delegates to the op).
  std::vector<std::string> Outputs(const std::string& name) const override {
    return op_.Outputs(name);
  }

620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
  // Maps a positional input index to its slot name in the op proto;
  // out-of-range indices raise OutOfRange.
  std::string GetInputNameByIdx(size_t idx) const override {
    auto& op_proto =
        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
                      platform::errors::OutOfRange(
                          "The index should be less than the size of inputs of "
                          "operator %s, but got index is %d and size is %d",
                          op_.Type(), idx, op_proto->inputs().size()));
    return op_proto->inputs()[idx].name();
  }

  // Maps a positional output index to its slot name in the op proto;
  // out-of-range indices raise OutOfRange.
  std::string GetOutputNameByIdx(size_t idx) const override {
    auto& op_proto =
        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
    PADDLE_ENFORCE_LT(
        idx, op_proto->outputs().size(),
        platform::errors::OutOfRange(
            "The index should be less than the size of outputs of "
            "operator %s, but got index is %d and size is %d",
            op_.Type(), idx, op_proto->outputs().size()));
    return op_proto->outputs()[idx].name();
  }

643 644
  // Copies the dims (and, for SelectedRows, rows/height) of input `in`[i]
  // onto output `out`[j]. Both variables must already exist and have the
  // same type; only LoDTensor and SelectedRows are supported.
  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
                   "Inputs %s should have %llu argument", in, i);
    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
                   "Outputs %s should have %llu argument", out, j);

    Variable* in_var = in_it->second[i];
    Variable* out_var = out_it->second[j];

    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
                   "The type of %s and %s is not the same.", in, out);

    if (in_var->IsType<framework::SelectedRows>()) {
      // SelectedRows: share value dims plus the row index and height.
      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
      out_sele_rows->set_rows(in_sele_rows.rows());
      out_sele_rows->set_height(in_sele_rows.height());
    } else if (in_var->IsType<framework::LoDTensor>()) {
      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
      out_lod_tensor->Resize(in_lod_tensor.dims());
    } else {
      PADDLE_THROW(
          "Currently, the input type of ShareDim only can be LoDTensor "
          "or SelectedRows.");
    }
  }

H
hong 已提交
675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692
  // Shares LoD (and, except for MKLDNN-laid-out tensors, layout) from every
  // variable of input slot `in` to the corresponding variable of output slot
  // `out`. Slot sizes must match; empty-named outputs and non-LoDTensor
  // inputs are skipped.
  void ShareAllLoD(const std::string& in,
                   const std::string& out) const override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(),
                      platform::errors::NotFound(
                          "Input [%s] found error in Op [%s]", in, op_.Type()));
    PADDLE_ENFORCE_NE(
        out_it, ctx_.outputs.end(),
        platform::errors::NotFound("Output [%s] found error in Op [%s]", out,
                                   op_.Type()));

    auto& in_var_list = in_it->second;
    auto& out_var_list = out_it->second;

    PADDLE_ENFORCE_EQ(
        in_var_list.size(), out_var_list.size(),
        platform::errors::PreconditionNotMet(
            "Op [%s]: Input var size should be equal with output var size",
            op_.Type()));

    auto& out_var_names = op_.Outputs(out);

    for (size_t i = 0; i < in_var_list.size(); ++i) {
      // kEmptyVarName marks an intentionally-unset output slot entry.
      if (out_var_names[i] == framework::kEmptyVarName) {
        continue;
      }

      Variable* in_var = in_var_list[i];
      // Note: a non-LoDTensor input aborts the whole loop, not just this
      // element.
      if (!in_var->IsType<LoDTensor>()) return;
      Variable* out_var = out_var_list[i];
      PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(), true,
                        platform::errors::PreconditionNotMet(
                            "The %d-th output of Output(%s) must be LoDTensor.",
                            i, out_var_names[i]));
      auto& in_tensor = in_var->Get<LoDTensor>();
      auto* out_tensor = out_var->GetMutable<LoDTensor>();
      out_tensor->set_lod(in_tensor.lod());
#ifdef PADDLE_WITH_MKLDNN
      // Don't propagate the MKLDNN layout into possibly non-MKLDNN kernels
      // (see the detailed rationale in ShareLoD below).
      if (in_tensor.layout() != DataLayout::kMKLDNN)
#endif
        out_tensor->set_layout(in_tensor.layout());
    }
  }

Q
Qiao Longfei 已提交
720 721
  // Shares the LoD (and conditionally the layout) of input `in`[i] with
  // output `out`[j]. Non-LoDTensor inputs are a silent no-op; the output
  // must be a LoDTensor.
  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) const override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
                   "Inputs %s should have %llu argument", in, i);
    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
                   "Outputs %s should have %llu argument", out, j);

    Variable* in_var = in_it->second.at(i);
    if (!in_var->IsType<LoDTensor>()) return;
    Variable* out_var = out_it->second.at(j);
    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
    auto& in_tensor = in_var->Get<LoDTensor>();
    auto* out_tensor = out_var->GetMutable<LoDTensor>();
    out_tensor->set_lod(in_tensor.lod());

// TODO(dzhwinter) : reuse ShareLoD in most operators.
// Need to call ShareLayout explicitly in sequence related ops.
// Shall we have a better method to shared info between in/out Tensor?
#ifdef PADDLE_WITH_MKLDNN
    // Fix me: ugly workaround below
    // Correct solution:
    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
    //    layout of output tensor should be set "manually" in Compute()
    //    of each OPKernel. The reason layout should NOT be shared between
    //    input and output "automatically" (now by InferShape()->ShareLoD())
    //    is that layout transform may occur after InferShape().
    // Workaround:
    //    Skip set_layout() when input layout is kMKLDNN
    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
    //    in Compute()
    if (in_tensor.layout() != DataLayout::kMKLDNN)
#endif
      out_tensor->set_layout(in_tensor.layout());
  }

759
  int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override {
760
    PADDLE_THROW(
761
        "GetLoDLevel is only used in compile time. The calculation of "
762 763 764 765
        "output's actual lod is different among operators so that should be "
        "set in the runtime kernel.");
  }

766 767
  void SetLoDLevel(const std::string& out, int32_t lod_level,
                   size_t j = 0) const override {
768
    PADDLE_THROW(
769
        "SetLoDLevel is only used in compile time. The calculation of "
770 771
        "output's actual lod is different among operators so that should be "
        "set in the runtime kernel.");
C
chengduo 已提交
772 773
  }

774 775
  // This InferShape context reads shapes from real runtime Variables.
  bool IsRuntime() const override { return true; }

776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794
  // TODO(paddle-dev): Can this be template?
  std::vector<InferShapeVarPtr> GetInputVarPtrs(
      const std::string& name) override {
    const std::vector<Variable*>& vars = InputVars(name);
    std::vector<InferShapeVarPtr> res;
    res.reserve(vars.size());
    res.insert(res.begin(), vars.begin(), vars.end());
    return res;
  }

  std::vector<InferShapeVarPtr> GetOutputVarPtrs(
      const std::string& name) override {
    const std::vector<Variable*>& vars = OutputVars(name);
    std::vector<InferShapeVarPtr> res;
    res.reserve(vars.size());
    res.insert(res.begin(), vars.begin(), vars.end());
    return res;
  }

X
Xin Pan 已提交
795 796 797 798 799 800 801 802 803 804 805 806 807
  // Returns the runtime shape of the single Variable bound to input `name`.
  // Requires the slot to be non-duplicable (exactly one element).
  DDim GetInputDim(const std::string& name) const override {
    const std::vector<Variable*>& vars = InputVars(name);
    PADDLE_ENFORCE_EQ(
        vars.size(), 1UL,
        platform::errors::InvalidArgument(
            "Input(%s) should hold one element, but now it holds %d", name,
            vars.size()));
    return this->GetDim(vars[0]);
  }

  // Returns the runtime shapes of all Variables bound to input slot `name`.
  std::vector<DDim> GetInputsDim(const std::string& name) const override {
    return GetDims(InputVars(name));
  }

X
Xin Pan 已提交
808 809 810 811 812 813 814 815 816 817
  // Returns the proto variable type of every Variable in input slot `name`.
  std::vector<proto::VarType::Type> GetInputsVarType(
      const std::string& name) const override {
    return GetVarTypes(InputVars(name));
  }

  // Returns the proto variable type of every Variable in output slot `name`.
  std::vector<proto::VarType::Type> GetOutputsVarType(
      const std::string& name) const override {
    return GetVarTypes(OutputVars(name));
  }

X
Xin Pan 已提交
818 819 820 821 822 823 824 825 826 827 828 829 830 831
  void SetOutputDim(const std::string& name, const DDim& dim) override {
    auto& vars = OutputVars(name);
    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
                      "Output(%s) should hold one element, but now it holds %d",
                      name, vars.size());
    SetDim(vars[0], dim);
  }

  // Sets the runtime shapes of all Variables bound to output slot `name`.
  void SetOutputsDim(const std::string& name,
                     const std::vector<DDim>& dims) override {
    SetDims(OutputVars(name), dims);
  }

832
 protected:
X
Xin Pan 已提交
833
  // Returns the runtime shape of |var|: dims() for a LoDTensor, the complete
  // dims for SelectedRows. Any other variable type has no shape and throws.
  DDim GetDim(Variable* var) const {
    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
                                     "Input variable is nullptr."));
    if (var->IsType<LoDTensor>()) {
      return var->Get<LoDTensor>().dims();
    } else if (var->IsType<SelectedRows>()) {
      return var->Get<SelectedRows>().GetCompleteDims();
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
          "type_id is %s.",
          ToTypeName(var->Type())));
    }
  }

X
Xin Pan 已提交
847 848 849 850 851 852 853 854
  // Collects GetDim() for each variable, preserving order.
  std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
    std::vector<DDim> dims;
    dims.reserve(vars.size());
    for (Variable* var : vars) {
      dims.push_back(this->GetDim(var));
    }
    return dims;
  }

F
fengjiayi 已提交
855
  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
Y
Yu Yang 已提交
856
    PADDLE_THROW("Only compile time support this method");
857 858
  }

X
Xin Pan 已提交
859
  // Applies |dim| to |var|: resizes a LoDTensor, or sets the height (row
  // count, dim[0]) of a SelectedRows. Other variable types throw.
  void SetDim(Variable* var, const DDim& dim) {
    if (var->IsType<LoDTensor>()) {
      var->GetMutable<LoDTensor>()->Resize(dim);
    } else if (var->IsType<SelectedRows>()) {
      var->GetMutable<SelectedRows>()->set_height(dim[0]);
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Variable type_id %s, expect LoDTensor/SelectedRows.",
          ToTypeName(var->Type())));
    }
  }

  // Applies dims[i] to vars[i]; null variables (optional outputs) are skipped.
  void SetDims(const std::vector<Variable*>& vars,
               const std::vector<DDim>& dims) {
    size_t length = vars.size();
    // The original check had no message; give the mismatch a diagnosable one.
    PADDLE_ENFORCE_EQ(length, dims.size(),
                      platform::errors::InvalidArgument(
                          "The number of input variables do not match the "
                          "number of input dims, the number of variables "
                          "is %d, the number of dims is %d.",
                          length, dims.size()));
    for (size_t i = 0; i < length; ++i) {
      if (vars[i] == nullptr) {
        continue;
      }
      SetDim(vars[i], dims[i]);
    }
  }

F
fengjiayi 已提交
882 883
  void SetRepeatedDims(const std::string& name,
                       const std::vector<DDim>& dims) override {
Y
Yu Yang 已提交
884
    PADDLE_THROW("Only compile time support this method");
F
fengjiayi 已提交
885 886
  }

X
Xin Pan 已提交
887 888 889 890 891 892 893 894 895 896 897
  // Maps each Variable* to its proto var type, preserving order.
  std::vector<proto::VarType::Type> GetVarTypes(
      const std::vector<Variable*>& vars) const {
    std::vector<proto::VarType::Type> retv;
    retv.resize(vars.size());
    // A lambda is clearer (and cheaper to compile) than the previous
    // std::bind(std::mem_fn(...)) construction.
    std::transform(vars.begin(), vars.end(), retv.begin(),
                   [this](Variable* var) { return this->GetVarType(var); });
    return retv;
  }

  // Translates the variable's runtime C++ type id into the proto VarType enum.
  proto::VarType::Type GetVarType(Variable* var) const {
    return ToVarType(var->Type());
  }

901 902 903 904 905 906 907 908 909 910 911 912 913 914
 private:
  const std::vector<Variable*>& InputVars(const std::string& name) const {
    auto it = ctx_.inputs.find(name);
    PADDLE_ENFORCE(it != ctx_.inputs.end(),
                   "Operator %s does not have the input %s.", op_.Type(), name);
    return it->second;
  }

  const std::vector<Variable*>& OutputVars(const std::string& name) const {
    auto it = ctx_.outputs.find(name);
    PADDLE_ENFORCE(it != ctx_.outputs.end(),
                   "Operator %s does not have the outputs %s.", op_.Type(),
                   name);
    return it->second;
F
fengjiayi 已提交
915 916
  }

917
  const OperatorBase& op_;
X
Xin Pan 已提交
918
  const RuntimeContext& ctx_;
919 920
};

921 922
static void CheckTensorNANOrInf(const std::string& op_type,
                                const std::string& name,
C
chengduoZH 已提交
923 924 925 926
                                const framework::Tensor& tensor) {
  if (tensor.memory_size() == 0) {
    return;
  }
Y
Yu Yang 已提交
927 928
  if (tensor.type() != proto::VarType::FP32 &&
      tensor.type() != proto::VarType::FP64) {
C
chengduoZH 已提交
929 930 931
    return;
  }
  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
932
                 "Operator %s output Tensor %s contains Inf", op_type, name);
C
chengduoZH 已提交
933
  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
934
                 "Operator %s output Tensor %s contains NAN", op_type, name);
C
chengduoZH 已提交
935 936
}

B
baojun-nervana 已提交
937
// Runs this op's InferShape against real runtime Variables (via
// RuntimeInferShapeContext) without selecting or executing a kernel.
// NOTE: `scope` and `place` are part of the interface but unused here;
// all variable access goes through `ctx`.
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
                                           const platform::Place& place,
                                           const RuntimeContext& ctx) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, ctx);
  this->InferShape(&infer_shape_ctx);
}

L
luotao1 已提交
944 945
// Entry point of kernel execution: decides whether the RuntimeContext
// (input/output Variable bindings) can be cached across runs, then delegates
// to the three-argument RunImpl overload.
void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  // To reduce the elapsed time of HasAttr, we use bool variable to record the
  // result of HasAttr (the flags latch to true and are never re-queried).
  if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
    enable_cache_runtime_context_ = true;
  if (!all_kernels_must_compute_runtime_shape_ &&
      HasAttr(kAllKernelsMustComputeRuntimeShape))
    all_kernels_must_compute_runtime_shape_ = true;
  const Scope* cur_scope = &scope;
  if (!enable_cache_runtime_context_) {
    // Uncached path: build a fresh RuntimeContext for this single run.
    RuntimeContext ctx(Inputs(), Outputs(), scope);
    RunImpl(scope, place, &ctx);
    pre_scope_ = cur_scope;
  } else {
    // Cached path: rebuild only when no context exists yet or the scope
    // changed. Double-checked locking so concurrent runs rebuild at most once.
    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
      std::lock_guard<std::mutex> lock(cache_update_mutex_);
      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
        pre_scope_ = cur_scope;
      }
    }
    RunImpl(scope, place, runtime_ctx_.get());
  }
}

// Core execution pipeline: pick a kernel (once, cached), transfer input data
// to the kernel's expected place/layout, infer runtime shapes, invoke the
// kernel, then copy inplace-transferred outputs back and run optional
// debugging checks (unused vars, NaN/Inf).
void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place,
                                 RuntimeContext* runtime_ctx) const {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);

  // Kernel selection is cached; ChooseKernel fills kernel_type_/kernel_func_.
  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
    ChooseKernel(*runtime_ctx, scope, place);
  }

  // do data transformScope &transfer_scope;
  std::vector<std::string> transfered_inplace_vars;
  Scope* transfer_scope = nullptr;
  {
    platform::RecordEvent record_event("prepare_data",
                                       platform::EventRole::kInnerOp);
    // need_prepare_data_ is cleared by PrepareData itself when it proves no
    // input ever needs a transfer for this cached scope.
    if (need_prepare_data_) {
      transfer_scope = PrepareData(scope, *kernel_type_,
                                   &transfered_inplace_vars, runtime_ctx);
    }
  }
  // exec scope is the scope that kernel actually executed on.
  const Scope& exec_scope =
      (transfer_scope == nullptr ? scope : *transfer_scope);

  // The selected kernel may live on a different place than requested.
  if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
    dev_ctx = pool.Get(kernel_type_->place_);
  }

  // Ops that opt in via kAllKernelsMustComputeRuntimeShape do their own shape
  // computation inside Compute(); skip InferShape for them.
  if (!all_kernels_must_compute_runtime_shape_) {
    platform::RecordEvent record_event("infer_shape",
                                       platform::EventRole::kInnerOp);
    RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
    this->InferShape(&infer_shape_ctx);
  }

  // Reset the per-thread "used variable" set before the kernel runs so the
  // post-run check below only sees this op's accesses.
  if (FLAGS_enable_unused_var_check) {
    GetThreadLocalUsedVarNameSet()->clear();
  }

  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only pass inputs and get outputs.
  {
    platform::RecordEvent record_event("compute",
                                       platform::EventRole::kInnerOp);
    (*kernel_func_)(
        ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
  }

  if (!transfered_inplace_vars.empty()) {
    // there is inplace variable has been transferred.
    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
  }
  if (FLAGS_enable_unused_var_check) {
    // skip op that uses mkldnn because it has different memory reuse strategy.
    // use attr here because some GradMakers (like ActivationGradOpMaker) add
    // input when use_mkldnn=true;
    if (!(HasAttr("use_mkldnn") && Attr<bool>("use_mkldnn"))) {
      CheckUnusedVar(*this, scope);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    dev_ctx->Wait();
  }

  if (FLAGS_fast_check_nan_inf) {
    for (auto& vname : OutputVars(true)) {
      // only check inserted vars,
      // please see executor.py for details of fast_check_nan_inf
      if (vname.rfind("debug_var") == 0) {
        VLOG(3) << "debugging nan/inf in var " << vname;

        auto* var = exec_scope.FindVar(vname);
        if (var == nullptr) continue;
        if (var->IsType<framework::LoDTensor>()) {
          CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
        } else if (var->IsType<framework::SelectedRows>()) {
          CheckTensorNANOrInf(type_, vname,
                              var->Get<framework::SelectedRows>().value());
        }
      }
    }
  }

  if (FLAGS_check_nan_inf) {
    framework::details::CheckOpHasNanOrInf(*this, exec_scope, place);
  }

  // To solve issue #15032, have a discussion with @Luotao for cpu inference,
  // do not cache transfer scope, hence in this case delete transfer scope
  // after run to avoid memory leak
  if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
    scope.DeleteScope(transfer_scope);
  }
}
X
Xin Pan 已提交
1067

L
Liu Yiqun 已提交
1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
// Selects the kernel for this op: asks GetExpectedKernelType for the desired
// (data type, place, layout, library) key, honors an explicit "op_device"
// attribute, applies MKLDNN/XPU fallbacks when the exact kernel is missing,
// and caches the result in kernel_type_/kernel_func_.
void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
                                      const Scope& scope,
                                      const platform::Place& place) const {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);

  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
  auto kernels_iter = all_op_kernels.find(type_);
  if (kernels_iter == all_op_kernels.end()) {
    PADDLE_THROW(
        "There are no kernels which are registered in the %s operator.", type_);
  }

  OpKernelMap& kernels = kernels_iter->second;

  auto expected_kernel_key = this->GetExpectedKernelType(
      ExecutionContext(*this, scope, *dev_ctx, ctx));
  // An explicit "op_device" attribute (set e.g. by device guards) overrides
  // the place deduced above.
  if (HasAttr("op_device")) {
    if (Attr<std::string>("op_device") == "cpu") {
      expected_kernel_key.place_ = platform::CPUPlace();
    } else if (Attr<std::string>("op_device").find("gpu") !=
               std::string::npos) {
      auto device = Attr<std::string>("op_device");
      size_t pos = device.find(':');
      if (pos != std::string::npos) {
        device = device.substr(0, pos);
        LOG_FIRST_N(WARNING, 1)
            << "Device index is only supported under pipeline parallelism, "
            << "so it will be ignored.";
      }
      // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
      // will be executed and a warning will be given at the same time.
      if (SupportGPU()) {
        expected_kernel_key.place_ = dev_ctx->GetPlace();
      } else {
        expected_kernel_key.place_ = platform::CPUPlace();
        LOG_FIRST_N(WARNING, 1)
            << "Op(" << type_
            << ") has no CUDA implementation. It will be assigned to CPUPlace.";
      }
    }
  }
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;

  auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
  if (kernel_iter == kernels.end() &&
      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
    expected_kernel_key.library_type_ = LibraryType::kPlain;
    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
#ifdef PADDLE_WITH_XPU
  // Fall back to the CPU kernel when an XPU kernel is not registered.
  if (kernel_iter == kernels.end() &&
      is_xpu_place(expected_kernel_key.place_)) {
    VLOG(3) << "missing XPU kernel: " << type_
            << ", expected_kernel_key:" << expected_kernel_key
            << ", fallbacking to CPU one!";
    expected_kernel_key.place_ = platform::CPUPlace();
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
  if (kernel_iter == kernels.end()) {
    PADDLE_THROW("op %s does not have kernel for %s", type_,
                 KernelTypeToString(expected_kernel_key));
  }

  // Second half of the double-checked caching started in RunImpl: only the
  // first thread through here installs the kernel.
  std::lock_guard<std::mutex> lock(cache_update_mutex_);
  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
    kernel_type_.reset(new OpKernelType(expected_kernel_key));
    kernel_func_.reset(new OpKernelFunc(kernel_iter->second));
  }
}

Y
yuyang18 已提交
1146 1147 1148 1149
// After the kernel ran in a transfer scope, copies each inplace variable's
// data back into the original scope's tensor (sharing storage), restoring the
// original tensor's dims afterwards since ShareDataWith also copies the shape.
void OperatorWithKernel::TransferInplaceVarsBack(
    const Scope& scope, const std::vector<std::string>& inplace_vars,
    const Scope& transfer_scope) const {
  for (auto& var_name : inplace_vars) {
    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
    auto* origin_var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
                            var_name);
    auto* original_tensor =
        GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
    auto* var = transfer_scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
                            var_name);
    auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
    auto original_dims = original_tensor->dims();
    original_tensor->ShareDataWith(*transformed_tensor);
    // Restore the original shape clobbered by ShareDataWith above.
    original_tensor->Resize(original_dims);
  }
}

X
Xin Pan 已提交
1166
// Transfers each tensor input that does not match the expected kernel type
// (place/layout/dtype) into a transfer scope, rebinding ctx->inputs to the
// transferred variables. Returns the transfer scope, or nullptr if no input
// needed a transfer. Inputs that are also outputs are recorded in
// |transfered_inplace_vars| so RunImpl can copy them back afterwards.
Scope* OperatorWithKernel::PrepareData(
    const Scope& scope, const OpKernelType& expected_kernel_key,
    std::vector<std::string>* transfered_inplace_vars,
    RuntimeContext* ctx) const {
  Scope* new_scope = nullptr;

  // Inputs whose buffer content is declared unnecessary (shape-only) can skip
  // the transfer machinery entirely.
  const std::unordered_set<std::string>* no_buffer_ins = nullptr;
  if (info_) {
    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
    // Some op may not register NoNeedBufferVarsInferer
    if (no_buffer_inferer) {
      no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs()));
      if (no_buffer_ins->empty()) no_buffer_ins = nullptr;
    }
  }

  for (auto& var_name_item : Inputs()) {
    bool should_skip_input =
        no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0;

    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];

    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
      auto& var_name = var_name_item.second[i];
      auto* var = input_vars[i];

      // Only tensor can be tranfer to another device.
      if (var == nullptr || !VarIsTensor(*var)) {
        continue;
      }

      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);

      // When no_buffer_ins then checking of Tensor::holder_ is
      // not a thread safe. And for infershape scenario checks
      // to be omitted are not really needed
      if (should_skip_input == true) {
#ifdef PADDLE_WITH_MKLDNN
        // Var without buffer may be needed
        // for some situation like InferShape().
        // In this situation We cannot skip Var analysis, as
        // MKL-DNN shape of Var may differ from kNHWC Var
        // In such situation corressponding resized Var
        // has to be created and registered
        if ((tensor_in->layout() == DataLayout::kMKLDNN) &&
            (var->IsType<LoDTensor>() == true) &&
            (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) &&
            (paddle::platform::MKLDNNDeviceContext::tls()
                 .get_cur_paddle_data_layout() == DataLayout::kNHWC)) {
          // Mixed execution : MKL-DNN and GPU is not supported!
          if (!new_scope) {
            new_scope = &scope.NewScope();
          }
          auto* trans_var = new_scope->Var(var_name);
          input_vars[i] = trans_var;
          auto out = trans_var->GetMutable<LoDTensor>();
          out->Resize(tensor_in->dims());
          platform::MatchShapeToLayout(out, tensor_in->layout(),
                                       DataLayout::kNHWC);
          VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
                     "but kNHWC layout"
                  << var_name_item.first << " in Operator " << type_;
        } else {
          VLOG(7) << "Skip scanning input " << var_name_item.first
                  << " in Operator " << type_;
        }
#endif
        continue;
      }

      if (!tensor_in->IsInitialized()) {
        continue;
      }

      auto kernel_type_for_var = GetKernelTypeForVar(
          var_name_item.first, *tensor_in, expected_kernel_key);

      if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
        continue;
      }

      // Inputs that double as outputs must be copied back after the run.
      auto out_var_names = OutputVars(true);
      if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
          out_var_names.end()) {
        transfered_inplace_vars->emplace_back(var_name);
      }

      VLOG(3) << "Transform Variable " << var_name << " from "
              << kernel_type_for_var << " to " << expected_kernel_key;

      // In the inference scenerio, the scopes will be reused across the
      // batches, so the `new_scope` here will result in GPU memroy explosion
      // over the  running of operators.
      // We use a thread_local cache to fix that issue, the key in the cache is
      // the combination of the `scope` argument, from_kernel_type,
      // target_kernel_type.
      // Have a discussion with @Superjomn or the inference developers if some
      // changes on this logic for this macro might not tested on the other
      // scenerios.
      // If this op is not called by an Executor or ParallelExecutor, it should
      // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
      // variables, that behavior a lot different.
      //
      // To solve issue #15032, have a discussion with @Luotao for cpu
      // inference, for all cpu kernels cases without GPU participation, here
      // not do transfer scope caching, and cpu inference performance is not
      // impacted by test.
      enable_cache_transfer_scope_ = false;
      if (!run_by_executor_ &&
          (platform::is_gpu_place(kernel_type_for_var.place_) ||
           platform::is_gpu_place(expected_kernel_key.place_))) {
        new_scope = TryCreateTransferScope(kernel_type_for_var,
                                           expected_kernel_key, &scope);
        enable_cache_transfer_scope_ = true;
      }
      if (!new_scope) {
        new_scope = &scope.NewScope();
      }
      // For inference, if a gpu model has an op which could only run on CPU,
      // each result of different input will be the same with the first one.
      // The reason is that if a gpu tensor is the input of a cpu kernel,
      // we will create a new cpu tensor in new scope.
      // However, if enable_cache_runtime_context_, we get the cpu tensor each
      // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
      // to trigger `new RuntimeContext()` in RunImpl().
      if (enable_cache_runtime_context_) {
        pre_scope_ = nullptr;
      }
      auto* trans_var = new_scope->Var(var_name);
      input_vars[i] = trans_var;
      Tensor out;
      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
      SetTensorToVariable(*var, out, trans_var);
    }
  }
  // If pre_scope = &scope, it means that scope is cached and the op is not in
  // while block. If new_scope = nullptr, it means that for each input of this
  // Op, there is no need to do PrepareData. So PrepareData could be skipped at
  // the rest iterations to save the elapsed time.
  // We do not support skipping PrepareData in while block, because the Op's
  // input may be changed by subsequent Ops, which may cause an error.
  if (pre_scope_ == &scope && new_scope == nullptr) {
    need_prepare_data_ = false;
  }

  return new_scope;
}
Q
Qiao Longfei 已提交
1313

1314 1315 1316
void OperatorWithKernel::ParseInputDataType(
    const ExecutionContext& ctx, const std::string& name,
    proto::VarType::Type* data_type) const {
1317
  proto::VarType::Type default_data_type =
1318
      static_cast<proto::VarType::Type>(-1);
H
hong 已提交
1319
  const std::vector<Variable*> vars = ctx.MultiInputVar(name);
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329
  for (size_t i = 0; i < vars.size(); ++i) {
    const Variable* var = vars[i];
    if (var != nullptr) {
      const Tensor* t = nullptr;
      if (var->IsType<Tensor>()) {
        t = &var->Get<Tensor>();
      } else if (var->IsType<LoDTensor>()) {
        t = &var->Get<LoDTensor>();
      } else if (var->IsType<SelectedRows>()) {
        t = &(var->Get<SelectedRows>().value());
1330 1331 1332 1333 1334 1335 1336
      } else if (var->IsType<LoDTensorArray>()) {
        auto t_arr = var->Get<LoDTensorArray>();
        for (size_t j = 0; j < t_arr.size(); j++) {
          if (t_arr[j].IsInitialized()) {
            t = &(t_arr[j]);
          }
        }
1337 1338
      }
      if (t != nullptr) {
1339 1340 1341 1342 1343
        PADDLE_ENFORCE_EQ(
            t->IsInitialized(), true,
            platform::errors::InvalidArgument(
                "The Tensor in the %s Op's Input Variable %s(%s) is "
                "not initialized.",
H
hong 已提交
1344
                Type(), name, ctx.InputNames(name).at(i)));
1345
        proto::VarType::Type tmp = t->type();
1346
        PADDLE_ENFORCE(
1347
            tmp == *data_type || *data_type == default_data_type,
1348 1349 1350 1351 1352 1353
            platform::errors::InvalidArgument(
                "The DataType of %s Op's duplicable Variable %s must be "
                "consistent. The current variable type is (%s), but the "
                "previous variable type is (%s).",
                Type(), name, DataTypeToString(tmp),
                DataTypeToString(*data_type)));
1354 1355 1356 1357 1358 1359
        *data_type = tmp;
      }
    }
  }
}

1360
// Deduces the kernel data type by scanning every input slot; throws NotFound
// when no input provides an initialized tensor to deduce from.
proto::VarType::Type OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
  // Sentinel meaning "no input tensor has supplied a type yet".
  const proto::VarType::Type kUnsetType =
      static_cast<proto::VarType::Type>(-1);
  proto::VarType::Type data_type = kUnsetType;
  for (auto& input : ctx.InNameList()) {
    ParseInputDataType(ctx, input, &data_type);
  }
  PADDLE_ENFORCE_NE(
      data_type, kUnsetType,
      platform::errors::NotFound(
          "DataType should be indicated by input Variable at %s.", Type()));
  return data_type;
}

// Deduces the kernel data type from a single named input slot; throws when
// the slot is empty or holds no tensor-like variable. Uses
// platform::errors for consistency with IndicateDataType above.
proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
    const ExecutionContext& ctx, const std::string& name) const {
  proto::VarType::Type default_data_type =
      static_cast<proto::VarType::Type>(-1);
  proto::VarType::Type data_type = default_data_type;
  ParseInputDataType(ctx, name, &data_type);
  PADDLE_ENFORCE_NE(
      data_type, default_data_type,
      platform::errors::InvalidArgument(
          "The Input Variable(%s) of %s Op used to determine kernel data type "
          "is empty or not LoDTensor or SelectedRows or LoDTensorArray.",
          name, Type()));
  return data_type;
}
1388

1389 1390 1391 1392 1393 1394 1395 1396
// Default kernel-key policy: data type deduced from the inputs, place taken
// from the execution context. Ops override this for custom selection.
OpKernelType OperatorWithKernel::GetExpectedKernelType(
    const ExecutionContext& ctx) const {
  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}

// Default per-variable kernel key: keep the tensor's own place and layout,
// paired with the expected data type. Ops override this to force layout or
// place conversions for specific inputs.
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
    const std::string& var_name, const Tensor& tensor,
    const OpKernelType& expected_kernel_type) const {
  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
                      tensor.layout());
}

Q
Qiao Longfei 已提交
1401
}  // namespace framework
L
liaogang 已提交
1402
}  // namespace paddle