operator.cc 43.9 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Q
Qiao Longfei 已提交
2 3 4 5 6 7 8 9 10 11 12 13

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
D
dzhwinter 已提交
14

15 16
#include <gflags/gflags.h>
#include <glog/logging.h>
17

18
#include <algorithm>
P
peizhilin 已提交
19 20
#include <sstream>
#include <string>
S
sneaxiy 已提交
21
#include <unordered_set>
P
peizhilin 已提交
22
#include <vector>
Y
Yi Wang 已提交
23
#include "paddle/fluid/framework/data_transform.h"
W
WangXi 已提交
24
#include "paddle/fluid/framework/details/nan_inf_utils.h"
Y
Yi Wang 已提交
25
#include "paddle/fluid/framework/executor.h"
26
#include "paddle/fluid/framework/lod_tensor.h"
27
#include "paddle/fluid/framework/op_call_stack.h"
28
#include "paddle/fluid/framework/op_proto_maker.h"
29
#include "paddle/fluid/framework/operator.h"
Y
Yi Wang 已提交
30
#include "paddle/fluid/framework/shape_inference.h"
31
#include "paddle/fluid/framework/transfer_scope_cache.h"
32
#include "paddle/fluid/framework/unused_var_check.h"
Y
Yi Wang 已提交
33
#include "paddle/fluid/framework/var_type.h"
34
#include "paddle/fluid/platform/profiler.h"
Q
Qiao Longfei 已提交
35

D
dzhwinter 已提交
36
DECLARE_bool(benchmark);
37
DECLARE_bool(check_nan_inf);
38
DECLARE_bool(enable_unused_var_check);
Q
Qiao Longfei 已提交
39
DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
P
pkpk 已提交
40 41 42
// Flag toggling the fast NAN/INF check described in the help text below.
// The help text is assembled from adjacent string literals, so each fragment
// must carry its own separating space.
DEFINE_bool(fast_check_nan_inf, false,
            "Fast checking NAN/INF after each operation. It will be a little "
            "bit slow, much faster than check_nan_inf");
D
dzhwinter 已提交
43

Q
Qiao Longfei 已提交
44 45 46
namespace paddle {
namespace framework {

47 48 49 50 51 52
// Ordered list of (place, library) pairs: CUDA device 0 with cuDNN, CUDA
// device 0 with the plain library, CPU with MKLDNN, CPU with the plain
// library.
// NOTE(review): presumably consulted front-to-back as the kernel preference
// order -- confirm at the use sites, which are not in this part of the file.
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
    std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN),
    std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
D
dzhwinter 已提交
53

54 55
// Debug helper: looks up `name` in `scope` and reports its dimensions.
// Returns DDim({-1}) when the variable is not found or holds neither a
// LoDTensor nor a SelectedRows. For a SelectedRows, `get_actual_dim` selects
// between the dims of the underlying value tensor (true) and
// GetCompleteDims() (false).
static DDim GetDimsDebug(const Scope& scope, const std::string& name,
                         bool get_actual_dim = false) {
  const Variable* found = scope.FindVar(name);
  if (found == nullptr) {
    return DDim({-1});
  }

  if (found->IsType<LoDTensor>()) {
    return found->Get<LoDTensor>().dims();
  }

  if (found->IsType<SelectedRows>()) {
    const SelectedRows& selected_rows = found->Get<SelectedRows>();
    if (get_actual_dim) {
      return selected_rows.value().dims();
    }
    return selected_rows.GetCompleteDims();
  }

  return DDim({-1});
}

Q
Qiao Longfei 已提交
75 76 77 78 79 80
static bool VarInited(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) return false;
  return var->IsInitialized();
}

D
dzhwinter 已提交
81 82 83 84 85
static std::string GetDtype(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
    return "";
  }
86

M
minqiyang 已提交
87 88 89
  if (var->IsType<LoDTensor>()) {
    const LoDTensor& tensor = var->Get<LoDTensor>();
    if (UNLIKELY(!tensor.IsInitialized())) {
90 91
      return "";
    }
Y
Yu Yang 已提交
92
    return DataTypeToString(tensor.type());
M
minqiyang 已提交
93
  } else if (var->IsType<SelectedRows>()) {
Q
Qiao Longfei 已提交
94 95 96 97
    auto tensor = var->Get<SelectedRows>().value();
    if (UNLIKELY(!tensor.IsInitialized())) {
      return "uninited";
    } else {
Y
Yu Yang 已提交
98
      return DataTypeToString(tensor.type());
Q
Qiao Longfei 已提交
99
    }
D
dzhwinter 已提交
100 101 102 103 104
  } else {
    return "";
  }
}

105 106 107 108 109 110
static int GetRowSize(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
    return -1;
  }

M
minqiyang 已提交
111 112
  if (var->IsType<SelectedRows>()) {
    return var->Get<SelectedRows>().rows().size();
113 114 115 116 117
  }

  return -1;
}

118
// Debug helper: returns the LoD of the LoDTensor stored under `name`. When the
// variable is not found or is not a LoDTensor, returns LoD({{}}).
static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
  const Variable* found = scope.FindVar(name);

  if (found != nullptr && found->IsType<LoDTensor>()) {
    return found->Get<LoDTensor>().lod();
  }

  return LoD({{}});
}

X
Xin Pan 已提交
134 135 136 137 138
// Resolves every input and output variable name to its Variable* in `scope`
// and stores the results in `inputs` / `outputs`, keyed by slot name.
// Names that `scope.FindVar` does not resolve are stored as whatever FindVar
// returned for them (no filtering is done here).
RuntimeContext::RuntimeContext(const VariableNameMap& innames,
                               const VariableNameMap& outnames,
                               const Scope& scope) {
  for (auto& slot : innames) {
    const auto& arg_names = slot.second;
    std::vector<Variable*>& resolved = inputs[slot.first];
    resolved.reserve(arg_names.size());
    for (auto& arg_name : arg_names) {
      resolved.push_back(scope.FindVar(arg_name));
    }
  }

  for (auto& slot : outnames) {
    const auto& arg_names = slot.second;
    std::vector<Variable*>& resolved = outputs[slot.first];
    resolved.reserve(arg_names.size());
    for (auto& arg_name : arg_names) {
      resolved.push_back(scope.FindVar(arg_name));
    }
  }
}

153
// Runs this operator on `place` using variables from `scope`.
//
// For a GPU place, the current device id is set first (or an error is raised
// when built without PADDLE_WITH_CUDA). The actual work is delegated to
// RunImpl(). Exceptions are logged / annotated and then propagated:
//   - platform::EnforceNotMet: call-stack info is attached before rethrowing.
//   - platform::EOFException: rethrown untouched.
//   - std::exception / anything else: a warning is logged, then rethrown.
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  try {
    VLOG(4) << place << " " << DebugStringEx(&scope);
    if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
      PADDLE_THROW("Cannot run operator on place %s", place);
#else
      auto dev_id = boost::get<platform::CUDAPlace>(place).device;
      platform::SetDeviceId(dev_id);
#endif
    }

    // The profiler has a process-wide mutex, which results in a serious
    // performance issue in concurrent scenarios. The `if` below avoids
    // creating a RecordEvent when profiling is disabled.
    // Please do not remove the `if`; ask @Superjomn if there are any concerns.
    if (platform::IsProfileEnabled()) {
      platform::RecordEvent record_event(Type());
      RunImpl(scope, place);
    } else {
      RunImpl(scope, place);
    }
    VLOG(3) << place << " " << DebugStringEx(&scope);
  } catch (platform::EnforceNotMet& exception) {
    // Annotate the exception in place with this operator's type and
    // attributes before propagating it.
    framework::InsertCallStackInfo(Type(), Attrs(), &exception);
    throw std::move(exception);
  } catch (platform::EOFException&) {
    std::rethrow_exception(std::current_exception());
  } catch (std::exception& ex) {
    LOG(WARNING) << Type() << " raises an exception "
                 << platform::demangle(typeid(ex).name()) << ", " << ex.what();
    std::rethrow_exception(std::current_exception());
  } catch (...) {
    LOG(WARNING) << Type() << " raises an unknown exception";
    std::rethrow_exception(std::current_exception());
  }
}

191
bool OperatorBase::HasInputs(const std::string& name) const {
M
minqiyang 已提交
192
  return inputs_.find(name) != inputs_.end();
193 194
}

195
std::string OperatorBase::Input(const std::string& name) const {
Y
Yu Yang 已提交
196
  auto& ins = Inputs(name);
197 198 199 200 201
  PADDLE_ENFORCE_LE(
      ins.size(), 1UL,
      platform::errors::AlreadyExists(
          "Operator %s's input %s should contain only one variable.", type_,
          name));
Y
Yu Yang 已提交
202
  return ins.empty() ? kEmptyVarName : ins[0];
Y
Yan Chunwei 已提交
203 204
}

Y
Yu Yang 已提交
205 206
const std::vector<std::string>& OperatorBase::Inputs(
    const std::string& name) const {
Y
Yu Yang 已提交
207
  auto it = inputs_.find(name);
208 209
  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
                 type_, name);
Y
Yu Yang 已提交
210
  return it->second;
Y
Yan Chunwei 已提交
211 212
}

213
bool OperatorBase::HasOutputs(const std::string& name) const {
214
  if (outputs_.find(name) != outputs_.end()) {
215 216 217 218 219 220
    return true;
  } else {
    return false;
  }
}

221
std::string OperatorBase::Output(const std::string& name) const {
Y
Yu Yang 已提交
222
  auto& outs = Outputs(name);
Y
Yu Yang 已提交
223
  PADDLE_ENFORCE_LE(outs.size(), 1UL,
224 225
                    "Operator %s's output %s should contain only one variable.",
                    type_, name);
Y
Yu Yang 已提交
226
  return outs.empty() ? kEmptyVarName : outs[0];
Y
Yan Chunwei 已提交
227 228
}

Y
Yu Yang 已提交
229 230
const std::vector<std::string>& OperatorBase::Outputs(
    const std::string& name) const {
Y
Yu Yang 已提交
231
  auto it = outputs_.find(name);
232 233
  PADDLE_ENFORCE(it != outputs_.end(),
                 "Operator %s does not have an output called %s.", type_, name);
Y
Yu Yang 已提交
234
  return it->second;
Y
Yan Chunwei 已提交
235 236
}

237
std::string OperatorBase::DebugStringEx(const Scope* scope) const {
Q
Qiao Longfei 已提交
238
  std::stringstream ss;
Y
Yu Yang 已提交
239
  ss << "Op(" << type_ << "), inputs:{";
240

241
  const std::unordered_set<std::string>* no_need_buffer_vars = nullptr;
242 243
  if (info_ && info_->NoNeedBufferVarsInferer()) {
    no_need_buffer_vars =
244 245
        &(Info().NoNeedBufferVarsInferer()(Inputs(), Outputs(), Attrs()));
    if (no_need_buffer_vars->empty()) no_need_buffer_vars = nullptr;
246 247
  }

Y
Yu Yang 已提交
248 249
  for (auto it = inputs_.begin(); it != inputs_.end();) {
    auto& input = *it;
250 251
    bool is_no_need_buffer_var =
        (no_need_buffer_vars && no_need_buffer_vars->count(input.first) > 0);
Y
Yu Yang 已提交
252 253
    ss << input.first << "[";
    for (size_t i = 0; i < input.second.size(); ++i) {
Q
Qiao Longfei 已提交
254 255
      auto var_name = input.second[i];
      ss << var_name;
256
      if (scope) {
Q
Qiao Longfei 已提交
257 258 259 260 261 262 263
        if (!VarInited(*scope, var_name)) {
          ss << "[uninited]";
        } else {
          int row_size = GetRowSize(*scope, var_name);
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
264 265 266
          std::string dtype = is_no_need_buffer_var
                                  ? "unknown_dtype"
                                  : GetDtype(*scope, var_name);
Q
Qiao Longfei 已提交
267
          ss << ":" << dtype;
268 269
          ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
          ss << "(" << GetLoDDebug(*scope, var_name) << ")";
270
        }
271
      }
Y
Yu Yang 已提交
272 273 274
      if (i != input.second.size() - 1) {
        ss << ", ";
      }
275
    }
Y
Yu Yang 已提交
276
    ss << "]";
Y
Yu Yang 已提交
277 278
    ++it;
    if (it != inputs_.end()) {
279 280
      ss << ", ";
    }
Q
Qiao Longfei 已提交
281
  }
Y
Yu Yang 已提交
282
  ss << "}, outputs:{";
Y
Yu Yang 已提交
283 284
  for (auto it = outputs_.begin(); it != outputs_.end();) {
    auto& output = *it;
Y
Yu Yang 已提交
285 286
    ss << output.first << "[";
    for (size_t i = 0; i < output.second.size(); ++i) {
Q
Qiao Longfei 已提交
287 288
      auto var_name = output.second[i];
      ss << var_name;
289
      if (scope) {
Q
Qiao Longfei 已提交
290 291 292 293 294 295 296
        if (!VarInited(*scope, var_name)) {
          ss << "[uninited]";
        } else {
          int row_size = GetRowSize(*scope, output.second[i]);
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
C
chengduo 已提交
297 298
          std::string dtype = GetDtype(*scope, output.second[i]);
          ss << ":" << dtype;
299 300
          ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
          ss << "(" << GetLoDDebug(*scope, var_name) << ")";
301
        }
302
      }
Y
Yu Yang 已提交
303 304 305
      if (i != output.second.size() - 1) {
        ss << ", ";
      }
306
    }
Y
Yu Yang 已提交
307
    ss << "]";
Y
Yu Yang 已提交
308 309
    ++it;
    if (it != outputs_.end()) {
310 311
      ss << ", ";
    }
Q
Qiao Longfei 已提交
312
  }
Y
Yu Yang 已提交
313
  ss << "}.";
Q
Qiao Longfei 已提交
314 315 316
  return ss.str();
}

Y
Yu Yang 已提交
317
// Stores the operator type, its input/output name maps and its attributes,
// and looks up the (possibly null) OpInfo registered for `type`.
OperatorBase::OperatorBase(const std::string& type,
                           const VariableNameMap& inputs,
                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
    : type_(type),
      inputs_(inputs),
      outputs_(outputs),
      attrs_(attrs),
      // NOTE(zjl): why op_info may be nullptr?
      info_(OpInfoMap::Instance().GetNullable(type)) {
  // In dygraph mode, every OperatorBase is constructed through
  // framework::OpRegistry::CreateOp(type, {}, {}, {}, false), i.e. with empty
  // inputs, outputs and attrs, to improve the execution efficiency of dygraph.
  // Name generation and validation are therefore skipped when both maps are
  // empty.
  const bool has_any_slot = inputs_.size() > 0 || outputs_.size() > 0;
  if (has_any_slot) {
    GenerateTemporaryNames();
    CheckAllInputOutputSet();
  }
}
336

Q
qijun 已提交
337 338
std::vector<std::string> OperatorBase::InputVars() const {
  std::vector<std::string> ret_val;
Y
Yu Yang 已提交
339
  for (auto& o : inputs_) {
Q
qijun 已提交
340 341 342 343 344 345
    ret_val.reserve(ret_val.size() + o.second.size());
    ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
  }
  return ret_val;
}

Y
Yu Yang 已提交
346 347 348 349 350 351 352 353 354 355
std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
  std::vector<std::string> ret_val;
  if (has_intermediate) {
    // push all outputs into ret_val
    for (auto& o : outputs_) {
      ret_val.reserve(ret_val.size() + o.second.size());
      ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
    }
    return ret_val;
  }
S
sneaxiy 已提交
356
  auto& info = Info();
Y
Yu Yang 已提交
357 358

  // get all OpProto::Var for outputs
Y
Yu Yang 已提交
359
  for (auto& o : info.Proto().outputs()) {
Y
Yu Yang 已提交
360 361 362 363 364 365 366 367 368
    // ignore all intermediate output
    if (o.intermediate()) continue;
    auto out = outputs_.find(o.name());
    if (out != outputs_.end()) {
      ret_val.reserve(ret_val.size() + out->second.size());
      ret_val.insert(ret_val.end(), out->second.begin(), out->second.end());
    }
  }
  return ret_val;
D
dongzhihong 已提交
369 370
}

371
void OperatorBase::CheckAllInputOutputSet() const {
S
sneaxiy 已提交
372
  if (info_ == nullptr || info_->proto_ == nullptr) return;
373

S
sneaxiy 已提交
374
  for (auto& in : info_->Proto().inputs()) {
375 376 377 378
    if (!in.dispensable()) {
      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
                     "Operator %s's input, %s, is not set", Type(), in.name());
    }
379 380
  }

S
sneaxiy 已提交
381
  for (auto& out : info_->Proto().outputs()) {
382 383 384 385 386
    if (!out.dispensable()) {
      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
                     "Operator %s's output, %s, is not set", Type(),
                     out.name());
    }
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
  }
}

void OperatorBase::GenerateTemporaryNames() {
  static std::atomic<size_t> gUniqId(0UL);
  for (auto& output : outputs_) {
    for (auto& output_name : output.second) {
      if (output_name == kTempVarName) {
        output_name += type_;
        output_name += "@";
        output_name += std::to_string(gUniqId.fetch_add(1));
      }
    }
  }
}

B
baojun-nervana 已提交
403
static bool VarIsTensor(const Variable& var) {
C
chengduo 已提交
404
  return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
405 406
}

C
chengduo 已提交
407
// Returns a read-only pointer to the tensor held by `var`: the LoDTensor
// itself, or the value tensor of a SelectedRows. Raises for any other
// variable type.
const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
  if (var.IsType<LoDTensor>()) {
    const LoDTensor& lod_tensor = var.Get<LoDTensor>();
    return static_cast<const Tensor*>(&lod_tensor);
  }
  if (var.IsType<SelectedRows>()) {
    const SelectedRows& selected_rows = var.Get<SelectedRows>();
    return &(selected_rows.value());
  }
  PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
               ToTypeName(var.Type()));
}

C
chengduo 已提交
418
// Returns a mutable pointer to the tensor held by `var`: the LoDTensor itself,
// or the value tensor of a SelectedRows. Raises for any other variable type.
Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
  if (var->IsType<LoDTensor>()) {
    return var->GetMutable<LoDTensor>();
  }
  if (var->IsType<SelectedRows>()) {
    SelectedRows* selected_rows = var->GetMutable<SelectedRows>();
    return selected_rows->mutable_value();
  }
  PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
               ToTypeName(var->Type()));
}

429
bool ExecutionContext::HasInput(const std::string& name) const {
430
  auto* var = InputVar(name);
431 432 433 434
  return var != nullptr;
}

bool ExecutionContext::HasOutput(const std::string& name) const {
435
  auto* var = OutputVar(name);
436 437 438
  return var != nullptr;
}

X
Xin Pan 已提交
439
// Returns the single variable bound to input slot `name`, or nullptr when the
// slot is absent or empty. Raises when the slot holds more than one variable.
// The access is reported to the unused-variable check first.
const Variable* ExecutionContext::InputVar(const std::string& name) const {
  LogVarUsageIfUnusedVarCheckEnabled(name);

  const auto slot = ctx_.inputs.find(name);
  if (slot == ctx_.inputs.end()) {
    return nullptr;
  }

  const auto& vars = slot->second;
  PADDLE_ENFORCE_LE(
      vars.size(), 1UL,
      platform::errors::AlreadyExists(
          "Operator %s's input %s should contain only one variable.",
          op_.Type(), name));
  if (vars.empty()) {
    return nullptr;
  }
  return vars[0];
}

X
clean  
Xin Pan 已提交
453
// Returns the single variable bound to output slot `name`, or nullptr when the
// slot is absent or empty. Raises when the slot holds more than one variable.
Variable* ExecutionContext::OutputVar(const std::string& name) const {
  const auto slot = ctx_.outputs.find(name);
  if (slot == ctx_.outputs.end()) {
    return nullptr;
  }

  const auto& vars = slot->second;
  PADDLE_ENFORCE_LE(vars.size(), 1UL,
                    "Operator %s's output %s should contain only one variable.",
                    op_.Type(), name);
  if (vars.empty()) {
    return nullptr;
  }
  return vars[0];
}

463
// Specialization for Tensor: forwards to Input<LoDTensor>(name) and returns
// the result as a const Tensor*.
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  return Input<LoDTensor>(name);
}

template <>
469
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
470
    const std::string& name) const {
471 472
  LogVarUsageIfUnusedVarCheckEnabled(name);

H
hong 已提交
473 474
  auto vars = MultiInputVar(name);
  if (vars.size() == 0) {
X
Xin Pan 已提交
475 476 477 478 479
    return {};
  }
  std::vector<const Tensor*> res;
  res.reserve(vars.size());
  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
H
hong 已提交
480
                 [&](const Variable* var) -> const Tensor* {
X
Xin Pan 已提交
481 482 483 484
                   if (var == nullptr) return nullptr;
                   PADDLE_ENFORCE(
                       var->IsType<LoDTensor>(),
                       "should be LoDTensor, but the received type is %s",
S
sneaxiy 已提交
485
                       ToTypeName(var->Type()));
X
Xin Pan 已提交
486 487 488 489 490
                   return &(var->Get<LoDTensor>());
                 });
  return res;
}

491
// Specialization for Tensor: forwards to Output<LoDTensor>(name) and returns
// the result as a Tensor*.
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
  return Output<LoDTensor>(name);
}

template <>
497
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
498
    const std::string& name) const {
H
hong 已提交
499 500 501
  auto vars = MultiOutputVar(name);

  if (vars.size() == 0) {
502 503
    return {};
  }
504
  std::vector<Tensor*> res;
505 506 507 508 509
  res.reserve(vars.size());
  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
                 [&](Variable* var) -> Tensor* {
                   return var == nullptr ? nullptr
                                         : var->GetMutable<LoDTensor>();
510
                 });
511 512 513
  return res;
}

Y
Yu Yang 已提交
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528
// Reports whether `op_type` can run on GPU: true when at least one of its
// registered kernels targets a GPU place. Operators with no registered kernels
// are treated as supported.
bool OpSupportGPU(const std::string& op_type) {
  auto& kernel_table = OperatorWithKernel::AllOpKernels();
  auto entry = kernel_table.find(op_type);
  if (entry == kernel_table.end()) {
    // All control operator must support GPU
    return true;
  }

  bool has_gpu_kernel = false;
  for (auto& kernel : entry->second) {
    if (platform::is_gpu_place(kernel.first.place_)) {
      has_gpu_kernel = true;
      break;
    }
  }
  return has_gpu_kernel;
}

529 530
class RuntimeInferShapeContext : public InferShapeContext {
 public:
X
Xin Pan 已提交
531 532
  // Initializes `op_` from `op` and `ctx_` from `ctx`. The `scope` parameter
  // is accepted but not used by this constructor.
  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
                           const RuntimeContext& ctx)
      : op_(op), ctx_(ctx) {}
534 535

  bool HasInput(const std::string& name) const override {
536
    // has only one input
X
Xin Pan 已提交
537
    const auto& ins = ctx_.inputs;
538 539
    auto it = ins.find(name);
    if (it == ins.end()) {
540 541
      return false;
    }
542
    const auto& in = it->second;
X
Xin Pan 已提交
543
    if (in.size() == 0) return false;
T
tensor-tang 已提交
544
    PADDLE_ENFORCE_EQ(in.size(), 1UL,
F
fengjiayi 已提交
545
                      "Input %s should not have more than one inputs", name);
X
Xin Pan 已提交
546
    return in[0] != nullptr;
547 548 549
  }

  bool HasOutput(const std::string& name) const override {
550
    // has only one output
X
Xin Pan 已提交
551
    const auto& outs = ctx_.outputs;
552 553
    auto it = outs.find(name);
    if (it == outs.end()) {
554 555
      return false;
    }
556
    const auto& out = it->second;
X
Xin Pan 已提交
557
    if (out.size() == 0) {
558 559
      return false;
    }
T
tensor-tang 已提交
560 561
    PADDLE_ENFORCE_EQ(out.size(), 1UL,
                      "Output %s should not have more than one outputs", name);
X
Xin Pan 已提交
562
    return out[0] != nullptr;
563 564 565
  }

  bool HasInputs(const std::string& name) const override {
X
Xin Pan 已提交
566 567
    const auto& ins = ctx_.inputs;
    auto it = ins.find(name);
X
fix  
Xin Pan 已提交
568
    if (it == ins.end() || it->second.empty()) {
569 570
      return false;
    }
X
Xin Pan 已提交
571 572
    for (auto& input : it->second) {
      if (input == nullptr) {
573 574 575 576 577 578 579
        return false;
      }
    }
    return true;
  }

  bool HasOutputs(const std::string& name) const override {
X
Xin Pan 已提交
580 581
    const auto& outs = ctx_.outputs;
    auto it = outs.find(name);
X
fix  
Xin Pan 已提交
582
    if (it == outs.end() || it->second.empty()) {
583 584
      return false;
    }
X
Xin Pan 已提交
585 586
    for (auto& output : it->second) {
      if (output == nullptr) {
587 588 589 590 591 592 593 594
        return false;
      }
    }
    return true;
  }

  // Returns an AttrReader constructed from the operator's attributes.
  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }

H
hong 已提交
595
  // Returns a copy of the variable names bound to the operator's input slot
  // `name` (delegates to op_.Inputs()).
  std::vector<std::string> Inputs(const std::string& name) const override {
    return op_.Inputs(name);
  }

H
hong 已提交
599
  // Returns a copy of the variable names bound to the operator's output slot
  // `name` (delegates to op_.Outputs()).
  std::vector<std::string> Outputs(const std::string& name) const override {
    return op_.Outputs(name);
  }

603 604
  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) override {
X
Xin Pan 已提交
605 606 607 608 609 610 611 612 613
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
                   "Inputs %s should have %llu argument", in, i);
    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
                   "Outputs %s should have %llu argument", out, j);

    Variable* in_var = in_it->second[i];
    Variable* out_var = out_it->second[j];
614 615

    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
X
fix  
Xin Pan 已提交
616
                   "The type of %s and %s is not the same.", in, out);
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634

    if (in_var->IsType<framework::SelectedRows>()) {
      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
      out_sele_rows->set_rows(in_sele_rows.rows());
      out_sele_rows->set_height(in_sele_rows.height());
    } else if (in_var->IsType<framework::LoDTensor>()) {
      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
      out_lod_tensor->Resize(in_lod_tensor.dims());
    } else {
      PADDLE_THROW(
          "Currently, the input type of ShareDim only can be LoDTensor "
          "or SelectedRows.");
    }
  }

H
hong 已提交
635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679
  // Copies the LoD (and, subject to the MKLDNN exception below, the layout)
  // from each variable of input slot `in` to the variable at the same index in
  // output slot `out`. Both slots must exist and hold the same number of
  // variables. Outputs whose name is kEmptyVarName are skipped.
  void ShareAllLoD(const std::string& in,
                   const std::string& out) const override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(),
                      platform::errors::NotFound(
                          "Input [%s] found error in Op [%s]", in, op_.Type()));
    PADDLE_ENFORCE_NE(
        out_it, ctx_.outputs.end(),
        platform::errors::NotFound("Output [%s] found error in Op [%s]", out,
                                   op_.Type()));

    auto& in_var_list = in_it->second;
    auto& out_var_list = out_it->second;

    PADDLE_ENFORCE_EQ(
        in_var_list.size(), out_var_list.size(),
        platform::errors::PreconditionNotMet(
            "Op [%s]: Input var size should be equal with ouput var size",
            op_.Type()));

    auto& out_var_names = op_.Outputs(out);

    for (size_t i = 0; i < in_var_list.size(); ++i) {
      if (out_var_names[i] == framework::kEmptyVarName) {
        continue;
      }

      Variable* in_var = in_var_list[i];
      // NOTE(review): this `return` ends the whole function at the first
      // non-LoDTensor input, so later pairs are not processed -- confirm that
      // `continue` was not intended.
      if (!in_var->IsType<LoDTensor>()) return;
      Variable* out_var = out_var_list[i];
      PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(), true,
                        platform::errors::PreconditionNotMet(
                            "The %d-th output of Output(%s) must be LoDTensor.",
                            i, out_var_names[i]));
      auto& in_tensor = in_var->Get<LoDTensor>();
      auto* out_tensor = out_var->GetMutable<LoDTensor>();
      out_tensor->set_lod(in_tensor.lod());
      // With MKLDNN enabled, the layout is only propagated when the input
      // layout is not kMKLDNN; without MKLDNN it is always propagated.
#ifdef PADDLE_WITH_MKLDNN
      if (in_tensor.layout() != DataLayout::kMKLDNN)
#endif
        out_tensor->set_layout(in_tensor.layout());
    }
  }

Q
Qiao Longfei 已提交
680 681
  // Copies the LoD (and, subject to the MKLDNN exception below, the layout)
  // from the i-th variable of input slot `in` to the j-th variable of output
  // slot `out`. Does nothing when the input variable is not a LoDTensor;
  // raises when the slots/indices are invalid or the output is not a
  // LoDTensor.
  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
                size_t j = 0) const override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
                   "Inputs %s should have %llu argument", in, i);
    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
                   "Outputs %s should have %llu argument", out, j);

    Variable* in_var = in_it->second.at(i);
    if (!in_var->IsType<LoDTensor>()) return;
    Variable* out_var = out_it->second.at(j);
    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
    auto& in_tensor = in_var->Get<LoDTensor>();
    auto* out_tensor = out_var->GetMutable<LoDTensor>();
    out_tensor->set_lod(in_tensor.lod());

// TODO(dzhwinter) : reuse ShareLoD in most operators.
// Need to call ShareLayout explicitly in sequence related ops.
// Shall we have a better method to share info between in/out Tensor?
#ifdef PADDLE_WITH_MKLDNN
    // Fix me: ugly workaround below
    // Correct solution:
    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
    //    layout of output tensor should be set "manually" in Compute()
    //    of each OPKernel. The reason layout should NOT be shared between
    //    input and output "automatically" (now by InferShape()->ShareLoD())
    //    is that layout transform may occur after InferShape().
    // Workaround:
    //    Skip set_layout() when input layout is kMKLDNN.
    //    This is to avoid kMKLDNN being propagated wrongly into a non-MKLDNN
    //    OPKernel. In all MKLDNN OPKernels, set_layout(kMKLDNN) should be
    //    called in Compute().
    if (in_tensor.layout() != DataLayout::kMKLDNN)
#endif
      out_tensor->set_layout(in_tensor.layout());
  }

719
  int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override {
720
    PADDLE_THROW(
721
        "GetLoDLevel is only used in compile time. The calculation of "
722 723 724 725
        "output's actual lod is different among operators so that should be "
        "set in the runtime kernel.");
  }

726 727
  void SetLoDLevel(const std::string& out, int32_t lod_level,
                   size_t j = 0) const override {
728
    PADDLE_THROW(
729
        "SetLoDLevel is only used in compile time. The calculation of "
730 731
        "output's actual lod is different among operators so that should be "
        "set in the runtime kernel.");
C
chengduo 已提交
732 733
  }

734 735
  // This context always describes the run-time phase.
  bool IsRuntime() const override {
    return true;
  }

736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754
  // TODO(paddle-dev): Can this be template?
  std::vector<InferShapeVarPtr> GetInputVarPtrs(
      const std::string& name) override {
    const std::vector<Variable*>& vars = InputVars(name);
    std::vector<InferShapeVarPtr> res;
    res.reserve(vars.size());
    res.insert(res.begin(), vars.begin(), vars.end());
    return res;
  }

  std::vector<InferShapeVarPtr> GetOutputVarPtrs(
      const std::string& name) override {
    const std::vector<Variable*>& vars = OutputVars(name);
    std::vector<InferShapeVarPtr> res;
    res.reserve(vars.size());
    res.insert(res.begin(), vars.begin(), vars.end());
    return res;
  }

X
Xin Pan 已提交
755 756 757 758 759 760 761 762 763 764 765 766 767
  // Returns the dims of the single Variable bound to input slot `name`.
  // Raises when the slot does not hold exactly one Variable.
  DDim GetInputDim(const std::string& name) const override {
    const auto& vars = InputVars(name);
    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
                      "Input(%s) should hold one element, but now it holds %d",
                      name, vars.size());
    Variable* only_var = vars.front();
    return GetDim(only_var);
  }

  // Returns the dims of every Variable bound to input slot `name`, in order.
  std::vector<DDim> GetInputsDim(const std::string& name) const override {
    return GetDims(InputVars(name));
  }

X
Xin Pan 已提交
768 769 770 771 772 773 774 775 776 777
  std::vector<proto::VarType::Type> GetInputsVarType(
      const std::string& name) const override {
    return GetVarTypes(InputVars(name));
  }

  std::vector<proto::VarType::Type> GetOutputsVarType(
      const std::string& name) const override {
    return GetVarTypes(OutputVars(name));
  }

X
Xin Pan 已提交
778 779 780 781 782 783 784 785 786 787 788 789 790 791
  void SetOutputDim(const std::string& name, const DDim& dim) override {
    auto& vars = OutputVars(name);
    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
                      "Output(%s) should hold one element, but now it holds %d",
                      name, vars.size());
    SetDim(vars[0], dim);
  }

  // Applies dims[i] to the i-th Variable bound to output slot `name`.
  void SetOutputsDim(const std::string& name,
                     const std::vector<DDim>& dims) override {
    SetDims(OutputVars(name), dims);
  }

792
 protected:
X
Xin Pan 已提交
793
  // Returns the dims stored in `var`: the tensor dims for a LoDTensor, or the
  // complete dims for a SelectedRows. Any other held type, or a null `var`,
  // is reported as an error.
  DDim GetDim(Variable* var) const {
    PADDLE_ENFORCE_NOT_NULL(var);
    if (var->IsType<LoDTensor>()) {
      return var->Get<LoDTensor>().dims();
    }
    if (var->IsType<SelectedRows>()) {
      return var->Get<SelectedRows>().GetCompleteDims();
    }
    PADDLE_THROW(
        "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
        "type_id is %s.",
        ToTypeName(var->Type()));
  }

X
Xin Pan 已提交
807 808 809 810 811 812 813 814
  // Collects GetDim() of every Variable in `vars`, preserving order.
  std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
    std::vector<DDim> collected;
    collected.reserve(vars.size());
    for (Variable* var : vars) {
      collected.push_back(GetDim(var));
    }
    return collected;
  }

F
fengjiayi 已提交
815
  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
Y
Yu Yang 已提交
816
    PADDLE_THROW("Only compile time support this method");
817 818
  }

X
Xin Pan 已提交
819
  // Applies `dim` to `var`: a LoDTensor is resized to `dim`, a SelectedRows
  // gets its height set to dim[0]. Any other held type is an error.
  //
  // `var` must not be null. GetDim() enforces the same precondition and
  // SetDims() skips null entries, but SetOutputDim() forwards its Variable
  // unchecked, so the guard turns a null dereference into a reported error.
  void SetDim(Variable* var, const DDim& dim) {
    PADDLE_ENFORCE_NOT_NULL(var);
    if (var->IsType<LoDTensor>()) {
      var->GetMutable<LoDTensor>()->Resize(dim);
    } else if (var->IsType<SelectedRows>()) {
      var->GetMutable<SelectedRows>()->set_height(dim[0]);
    } else {
      PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
                   ToTypeName(var->Type()));
    }
  }

  // Applies dims[i] to vars[i] for every non-null entry of `vars`.
  // The two vectors must have the same length.
  void SetDims(const std::vector<Variable*>& vars,
               const std::vector<DDim>& dims) {
    size_t length = vars.size();
    PADDLE_ENFORCE_EQ(length, dims.size());
    for (size_t idx = 0; idx != length; ++idx) {
      Variable* target = vars[idx];
      // Null entries are skipped rather than treated as errors.
      if (target != nullptr) {
        SetDim(target, dims[idx]);
      }
    }
  }

F
fengjiayi 已提交
842 843
  void SetRepeatedDims(const std::string& name,
                       const std::vector<DDim>& dims) override {
Y
Yu Yang 已提交
844
    PADDLE_THROW("Only compile time support this method");
F
fengjiayi 已提交
845 846
  }

X
Xin Pan 已提交
847 848 849 850 851 852 853 854 855 856 857
  // Maps every Variable in `vars` to its proto var type, preserving order.
  std::vector<proto::VarType::Type> GetVarTypes(
      const std::vector<Variable*>& vars) const {
    std::vector<proto::VarType::Type> types;
    types.reserve(vars.size());
    for (Variable* var : vars) {
      types.push_back(GetVarType(var));
    }
    return types;
  }

  // Converts the type held by `var` to its proto::VarType::Type value.
  proto::VarType::Type GetVarType(Variable* var) const {
    const auto held_type = var->Type();
    return ToVarType(held_type);
  }

861 862 863 864 865 866 867 868 869 870 871 872 873 874
 private:
  // Looks up the Variables bound to input slot `name` in the runtime context.
  // Raises when the operator has no such input slot.
  const std::vector<Variable*>& InputVars(const std::string& name) const {
    const auto it = ctx_.inputs.find(name);
    PADDLE_ENFORCE(it != ctx_.inputs.end(),
                   "Operator %s does not have the input %s.", op_.Type(), name);
    const std::vector<Variable*>& bound = it->second;
    return bound;
  }

  // Looks up the Variables bound to output slot `name` in the runtime
  // context. Raises when the operator has no such output slot.
  const std::vector<Variable*>& OutputVars(const std::string& name) const {
    const auto it = ctx_.outputs.find(name);
    PADDLE_ENFORCE(it != ctx_.outputs.end(),
                   "Operator %s does not have the outputs %s.", op_.Type(),
                   name);
    const std::vector<Variable*>& bound = it->second;
    return bound;
  }

877
  const OperatorBase& op_;
X
Xin Pan 已提交
878
  const RuntimeContext& ctx_;
879 880
};

881 882
static void CheckTensorNANOrInf(const std::string& op_type,
                                const std::string& name,
C
chengduoZH 已提交
883 884 885 886
                                const framework::Tensor& tensor) {
  if (tensor.memory_size() == 0) {
    return;
  }
Y
Yu Yang 已提交
887 888
  if (tensor.type() != proto::VarType::FP32 &&
      tensor.type() != proto::VarType::FP64) {
C
chengduoZH 已提交
889 890 891
    return;
  }
  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
892
                 "Operator %s output Tensor %s contains Inf", op_type, name);
C
chengduoZH 已提交
893
  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
894
                 "Operator %s output Tensor %s contains NAN", op_type, name);
C
chengduoZH 已提交
895 896
}

B
baojun-nervana 已提交
897
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
X
Xin Pan 已提交
898 899 900
                                           const platform::Place& place,
                                           const RuntimeContext& ctx) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
B
baojun-nervana 已提交
901 902 903
  this->InferShape(&infer_shape_ctx);
}

X
polish  
Xin Pan 已提交
904 905 906 907 908 909 910 911 912 913
// Returns a pointer to the kernel configs stored for `key`, or nullptr when
// no entry exists for that kernel type.
std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
    const OpKernelType& key) const {
  auto found = kernel_configs_map_.find(key);
  if (found == kernel_configs_map_.end()) {
    return nullptr;
  }
  return &(found->second);
}

L
luotao1 已提交
914 915
// Entry point that builds (or reuses) a RuntimeContext for `scope` and then
// delegates to the three-argument RunImpl overload.
void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  // HasAttr is comparatively slow, so a positive answer is remembered in a
  // bool member and the lookup is skipped afterwards.
  if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext)) {
    enable_cache_runtime_context_ = true;
  }
  if (!all_kernels_must_compute_runtime_shape_ &&
      HasAttr(kAllKernelsMustComputeRuntimeShape)) {
    all_kernels_must_compute_runtime_shape_ = true;
  }

  // Without caching, a fresh context is built for this single run.
  if (!enable_cache_runtime_context_) {
    RuntimeContext ctx(Inputs(), Outputs(), scope);
    RunImpl(scope, place, &ctx);
    return;
  }

  // With caching, the stored context is rebuilt when it is missing or was
  // created for a different scope. The condition is re-tested after taking
  // cache_update_mutex_.
  const Scope* cur_scope = &scope;
  const auto cache_is_stale = [this, cur_scope]() {
    return runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope;
  };
  if (cache_is_stale()) {
    std::lock_guard<std::mutex> lock(cache_update_mutex_);
    if (cache_is_stale()) {
      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
      pre_scope_ = cur_scope;
    }
  }
  RunImpl(scope, place, runtime_ctx_.get());
}

// Runs the operator's kernel using an already-built RuntimeContext.
//
// Steps, in order: choose the kernel if none is cached, transform input data
// to what the kernel expects (possibly into a transfer scope), infer shapes,
// invoke the kernel, share transformed in-place variables back to `scope`,
// run the optional debugging checks selected by gflags, and finally drop the
// transfer scope when it is not meant to be cached.
//
// scope        the scope the operator was asked to run in.
// place        the place requested by the caller; the kernel's own place
//              takes precedence when it differs.
// runtime_ctx  input/output Variables for this run; PrepareData may rewrite
//              its input entries.
void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place,
                                 RuntimeContext* runtime_ctx) const {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);

  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
    ChooseKernel(*runtime_ctx, scope, place);
  }

  std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);

  // Do data transform; a non-null result is the scope holding the
  // transformed inputs.
  std::vector<std::string> transfered_inplace_vars;
  auto* transfer_scope =
      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);

  // exec scope is the scope that kernel actually executed on.
  const Scope& exec_scope =
      (transfer_scope == nullptr ? scope : *transfer_scope);

  // Switch to the device context of the kernel's place when it differs from
  // the requested one.
  if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
    dev_ctx = pool.Get(kernel_type_->place_);
  }

  if (!all_kernels_must_compute_runtime_shape_) {
    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
    this->InferShape(&infer_shape_ctx);
  }

  if (FLAGS_enable_unused_var_check) {
    GetThreadLocalUsedVarNameSet()->clear();
  }

  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only pass inputs and get outputs.
  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
                                   kernel_configs));

  if (!transfered_inplace_vars.empty()) {
    // Some in-place variables were transformed; share them back.
    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
  }
  if (FLAGS_enable_unused_var_check) {
    // skip op that uses mkldnn because it has different memory reuse strategy.
    // use attr here because some GradMakers (like ActivationGradOpMaker) add
    // input when use_mkldnn=true;
    if (!(HasAttr("use_mkldnn") && Attr<bool>("use_mkldnn"))) {
      CheckUnusedVar(*this, scope);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    dev_ctx->Wait();
  }

  if (FLAGS_fast_check_nan_inf) {
    for (auto& vname : OutputVars(true)) {
      // only check inserted vars,
      // please see executor.py for details of fast_check_nan_inf
      //
      // rfind(prefix, 0) can only match at index 0, which makes this a true
      // "starts with" test. Without the position argument rfind reports the
      // LAST occurrence, so a name that repeats the marker later on would be
      // skipped, and every non-matching name would be scanned in full.
      if (vname.rfind("debug_var", 0) != 0) {
        continue;
      }
      VLOG(3) << "debugging nan/inf in var " << vname;

      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      if (var->IsType<framework::LoDTensor>()) {
        CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
      } else if (var->IsType<framework::SelectedRows>()) {
        CheckTensorNANOrInf(type_, vname,
                            var->Get<framework::SelectedRows>().value());
      }
    }
  }

  if (FLAGS_check_nan_inf) {
    framework::details::CheckOpHasNanOrInf(*this, exec_scope, place);
  }

  // To solve issue #15032, have a discussion with @Luotao for cpu inference,
  // do not cache transfer scope, hence in this case delete transfer scope
  // after run to avoid memory leak
  if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
    scope.DeleteScope(transfer_scope);
  }
}
X
Xin Pan 已提交
1026

L
Liu Yiqun 已提交
1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062
// Selects the kernel to run for this operator and stores its type and
// function in kernel_type_ / kernel_func_ (under cache_update_mutex_, and
// only if one of them is still unset). Raises when the op type has no
// registered kernels or none matches the expected kernel key.
void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
                                      const Scope& scope,
                                      const platform::Place& place) const {
  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);

  // check if op[type] has kernel registered.
  auto& registry = AllOpKernels();
  auto op_entry = registry.find(type_);
  if (op_entry == registry.end()) {
    PADDLE_THROW(
        "There are no kernels which are registered in the %s operator.", type_);
  }
  OpKernelMap& candidates = op_entry->second;

  auto expected_kernel_key = this->GetExpectedKernelType(
      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;

  auto chosen = candidates.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
  const bool mkldnn_kernel_missing =
      chosen == candidates.end() &&
      expected_kernel_key.library_type_ == LibraryType::kMKLDNN;
  if (mkldnn_kernel_missing) {
    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
    expected_kernel_key.library_type_ = LibraryType::kPlain;
    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
    chosen = candidates.find(expected_kernel_key);
  }
#endif
  if (chosen == candidates.end()) {
    PADDLE_THROW("op %s does not have kernel for %s", type_,
                 KernelTypeToString(expected_kernel_key));
  }

  // Publish the selection unless both cached members are already set.
  std::lock_guard<std::mutex> lock(cache_update_mutex_);
  const bool already_cached =
      kernel_type_.get() != nullptr && kernel_func_.get() != nullptr;
  if (!already_cached) {
    kernel_type_.reset(new OpKernelType(expected_kernel_key));
    kernel_func_.reset(new OpKernelFunc(chosen->second));
  }
}

Y
yuyang18 已提交
1070 1071 1072 1073
void OperatorWithKernel::TransferInplaceVarsBack(
    const Scope& scope, const std::vector<std::string>& inplace_vars,
    const Scope& transfer_scope) const {
  for (auto& var_name : inplace_vars) {
M
minqiyang 已提交
1074
    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
C
chengduo 已提交
1075 1076 1077
    auto* origin_var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
                            var_name);
C
chengduo 已提交
1078
    auto* original_tensor =
C
chengduo 已提交
1079
        GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
C
chengduo 已提交
1080
    auto* var = transfer_scope.FindVar(var_name);
C
chengduo 已提交
1081 1082
    PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
                            var_name);
C
chengduo 已提交
1083
    auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
Y
yuyang18 已提交
1084 1085 1086 1087
    original_tensor->ShareDataWith(*transformed_tensor);
  }
}

X
Xin Pan 已提交
1088
// Transforms every initialized input tensor whose kernel type differs from
// `expected_kernel_key` and stores the result in a transfer scope.
//
// For each transformed input the matching entry of ctx->inputs is replaced by
// the new Variable. Names of transformed inputs that are also outputs of this
// operator are appended to `transfered_inplace_vars`.
//
// Returns the transfer scope that was used, or nullptr when no input needed
// a transform.
Scope* OperatorWithKernel::PrepareData(
    const Scope& scope, const OpKernelType& expected_kernel_key,
    std::vector<std::string>* transfered_inplace_vars,
    RuntimeContext* ctx) const {
  Scope* new_scope = nullptr;

  // Input slots reported by the op's NoNeedBufferVarsInferer are skipped.
  const std::unordered_set<std::string>* no_buffer_ins = nullptr;
  if (info_) {
    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
    // Some op may not register NoNeedBufferVarsInferer
    if (no_buffer_inferer) {
      no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs()));
      if (no_buffer_ins->empty()) no_buffer_ins = nullptr;
    }
  }

  for (auto& input_slot : Inputs()) {
    const auto& slot_name = input_slot.first;
    const auto& arg_names = input_slot.second;

    if (no_buffer_ins && no_buffer_ins->count(slot_name) != 0) {
      VLOG(7) << "Skip scanning input " << slot_name << " in Operator "
              << type_;
      continue;
    }

    std::vector<Variable*>& input_vars = ctx->inputs[slot_name];

    for (size_t i = 0; i < arg_names.size(); ++i) {
      auto& var_name = arg_names[i];
      auto* var = input_vars[i];

      // Only tensor can be transferred to another device.
      if (var == nullptr || !VarIsTensor(*var)) {
        continue;
      }

      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
      if (!tensor_in->IsInitialized()) {
        continue;
      }

      auto kernel_type_for_var =
          GetKernelTypeForVar(slot_name, *tensor_in, expected_kernel_key);
      if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
        continue;
      }

      // Remember inputs that are also outputs so the caller can share the
      // transformed data back afterwards.
      auto out_var_names = OutputVars(true);
      const bool is_inplace =
          std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
          out_var_names.end();
      if (is_inplace) {
        transfered_inplace_vars->emplace_back(var_name);
      }

      VLOG(3) << "Transform Variable " << var_name << " from "
              << kernel_type_for_var << " to " << expected_kernel_key;

      // In the inference scenario, the scopes will be reused across the
      // batches, so the `new_scope` here will result in GPU memory explosion
      // over the running of operators.
      // We use a thread_local cache to fix that issue, the key in the cache is
      // the combination of the `scope` argument, from_kernel_type,
      // target_kernel_type.
      // Have a discussion with @Superjomn or the inference developers if some
      // changes on this logic for this macro might not be tested on the other
      // scenarios.
      // If this op is not called by an Executor or ParallelExecutor, it should
      // be called by a NaiveExecutor; the NaiveExecutor will cache the scopes
      // and variables, which behaves quite differently.
      //
      // To solve issue #15032, have a discussion with @Luotao for cpu
      // inference, for all cpu kernels cases without GPU participation, here
      // not do transfer scope caching, and cpu inference performance is not
      // impacted by test.
      enable_cache_transfer_scope_ = false;
      const bool gpu_transfer_outside_executor =
          !run_by_executor_ &&
          (platform::is_gpu_place(kernel_type_for_var.place_) ||
           platform::is_gpu_place(expected_kernel_key.place_));
      if (gpu_transfer_outside_executor) {
        new_scope = TryCreateTransferScope(kernel_type_for_var,
                                           expected_kernel_key, &scope);
        enable_cache_transfer_scope_ = true;
      }
      if (!new_scope) {
        new_scope = &scope.NewScope();
      }

      // For inference, if a gpu model has an op which could only run on CPU,
      // each result of different input will be the same with the first one.
      // The reason is that if a gpu tensor is the input of a cpu kernel,
      // we will create a new cpu tensor in new scope.
      // However, if enable_cache_runtime_context_, we get the cpu tensor each
      // time, not the gpu tensor.
      // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()` in
      // RunImpl().
      if (enable_cache_runtime_context_) {
        pre_scope_ = nullptr;
      }

      // Redirect this input to a Variable in the transfer scope and fill it
      // with the transformed tensor.
      auto* trans_var = new_scope->Var(var_name);
      input_vars[i] = trans_var;

      Tensor transformed;
      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in,
                    &transformed);
      SetTensorToVariable(*var, transformed, trans_var);
    }
  }

  return new_scope;
}
Q
Qiao Longfei 已提交
1194

1195 1196 1197 1198 1199
void OperatorWithKernel::ParseInputDataType(
    const ExecutionContext& ctx, const std::string& name,
    proto::VarType::Type* data_type) const {
  proto::VarType::Type dafault_data_type =
      static_cast<proto::VarType::Type>(-1);
H
hong 已提交
1200
  const std::vector<Variable*> vars = ctx.MultiInputVar(name);
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212
  for (size_t i = 0; i < vars.size(); ++i) {
    const Variable* var = vars[i];
    if (var != nullptr) {
      const Tensor* t = nullptr;
      if (var->IsType<Tensor>()) {
        t = &var->Get<Tensor>();
      } else if (var->IsType<LoDTensor>()) {
        t = &var->Get<LoDTensor>();
      } else if (var->IsType<SelectedRows>()) {
        t = &(var->Get<SelectedRows>().value());
      }
      if (t != nullptr) {
1213 1214 1215 1216 1217
        PADDLE_ENFORCE_EQ(
            t->IsInitialized(), true,
            platform::errors::InvalidArgument(
                "The Tensor in the %s Op's Input Variable %s(%s) is "
                "not initialized.",
H
hong 已提交
1218
                Type(), name, ctx.InputNames(name).at(i)));
1219
        proto::VarType::Type tmp = t->type();
1220 1221 1222 1223 1224 1225 1226 1227
        PADDLE_ENFORCE(
            tmp == *data_type || *data_type == dafault_data_type,
            platform::errors::InvalidArgument(
                "The DataType of %s Op's duplicable Variable %s must be "
                "consistent. The current variable type is (%s), but the "
                "previous variable type is (%s).",
                Type(), name, DataTypeToString(tmp),
                DataTypeToString(*data_type)));
1228 1229 1230 1231 1232 1233
        *data_type = tmp;
      }
    }
  }
}

1234
// Derives the kernel data type by parsing every input slot listed by
// ctx.InNameList(). Raises when no input determined a type.
proto::VarType::Type OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
  // -1 acts as the "not determined yet" sentinel.
  proto::VarType::Type dafault_data_type =
      static_cast<proto::VarType::Type>(-1);
  proto::VarType::Type data_type = dafault_data_type;
  for (const auto& input_name : ctx.InNameList()) {
    ParseInputDataType(ctx, input_name, &data_type);
  }
  PADDLE_ENFORCE_NE(data_type, dafault_data_type,
                    "DataType should be indicated by input Variable.");
  return data_type;
}

// Derives the kernel data type from the single input slot `name`.
// Raises when that slot did not determine a type.
proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
    const ExecutionContext& ctx, const std::string& name) const {
  // -1 acts as the "not determined yet" sentinel.
  proto::VarType::Type dafault_data_type =
      static_cast<proto::VarType::Type>(-1);
  proto::VarType::Type data_type = dafault_data_type;

  ParseInputDataType(ctx, name, &data_type);

  PADDLE_ENFORCE_NE(
      data_type, dafault_data_type,
      "The Input Variable(%s) of %s Op used to determine kernel data type "
      "is empty or not LoDTensor or SelectedRows.",
      name, Type());
  return data_type;
}
1260

1261 1262 1263 1264 1265 1266 1267 1268
// Default kernel key: the data type indicated by the inputs, paired with the
// place reported by the execution context.
OpKernelType OperatorWithKernel::GetExpectedKernelType(
    const ExecutionContext& ctx) const {
  const proto::VarType::Type input_data_type = IndicateDataType(ctx);
  return OpKernelType(input_data_type, ctx.GetPlace());
}

// Default kernel key of an input tensor: the expected kernel's data type,
// combined with the tensor's own place and layout. `var_name` is not used by
// this implementation.
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
    const std::string& var_name, const Tensor& tensor,
    const OpKernelType& expected_kernel_type) const {
  const auto expected_dtype = expected_kernel_type.data_type_;
  return OpKernelType(expected_dtype, tensor.place(), tensor.layout());
}

Q
Qiao Longfei 已提交
1273
}  // namespace framework
L
liaogang 已提交
1274
}  // namespace paddle