// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/tracer.h"

#include <map>
#include <set>
#include <unordered_set>
#include <utility>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/operators/ops_extra_info.h"
#include "paddle/fluid/platform/denormal.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/flags.h"

PHI_DECLARE_bool(use_mkldnn);
PHI_DECLARE_string(tracer_mkldnn_ops_on);
PHI_DECLARE_string(tracer_mkldnn_ops_off);
namespace paddle {
namespace imperative {
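// The tracer switches below are thread_local, so each thread traces with its
// own Python stack, grad mode, AMP settings, and layout-autotune flag.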
thread_local std::string Tracer::python_stack_ = "";

thread_local bool Tracer::enable_program_desc_tracing_ = false;

thread_local bool Tracer::has_grad_ = true;

thread_local bool Tracer::use_layout_autotune_ = false;

thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0;

thread_local phi::DataType Tracer::amp_dtype_ = phi::DataType::FLOAT32;

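// Unlike the per-thread switches above, the current tracer itself is a single
// process-wide shared_ptr, swapped via SetCurrentTracer().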
static std::shared_ptr<Tracer> g_current_tracer(nullptr);

const std::shared_ptr<Tracer>& GetCurrentTracer() { return g_current_tracer; }

void SetCurrentTracer(const std::shared_ptr<Tracer>& tracer) {
  g_current_tracer = tracer;
  VLOG(6) << "Set current tracer: " << g_current_tracer;
}

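// Overwrite the stop-gradient flag of every non-null output with
// `generate_grad`, once a traced op is known to require gradients.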
void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) {
  for (const auto& pair : outs) {
    for (const auto& var : pair.second) {
      // NOTE(zhiqiu): this happens when a None output is passed from the
      // Python side. For example, fake_quantize_dequantize_moving_average_abs_max
      // may pass a None OutAccum in eval mode.
      // It could be refined by generating several different pybind interfaces
      // for one operator, each with a different function signature.
      if (var == nullptr) {
        VLOG(4) << pair.first << " is NULL";
        continue;
      }
      VLOG(6) << "Set output: " << var->Name() << "'s OverridedStopGradient as "
              << generate_grad;
      var->InnerSetOverridedStopGradient(generate_grad);
    }
  }
}

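// Keep `var` alive until the asynchronous copy involving `place` completes:
// register an empty callback with the garbage collector of the copy device,
// so the captured shared_ptr holds a reference until the stream finishes.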
void IncreaseVarbaseReferenceCountUntilCopyComplete(
    const std::shared_ptr<imperative::VarBase>& var,
    const platform::Place& place) {
  // Note(zhiqiu): Follow the logic of TensorCopy to determine the place where
  // the callback needs to be added; see tensor_utils.cc:245.
  auto place_ = platform::is_gpu_place(place) ? place : var->Place();

  auto tracer = imperative::GetCurrentTracer();
  auto gc = tracer->MutableGarbageCollectorIfNotExists(place_);

  // Note(zhiqiu): This is an empty callback whose only purpose is to hold a
  // reference to var, so var will not be destructed until the kernels already
  // launched on the current stream of the given place have finished.
  auto callback = [var, place_]() {
    VLOG(4) << "Run callback of var: " << var->Name() << " at place " << place_;
  };

  gc->DirectClearCallback(callback);
}

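// Lazily create and cache one GarbageCollector per place; throws if the
// requested device type was not compiled into this Paddle build.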
paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
    const platform::Place& place) {
  // if not exists, create a new GarbageCollector at given place
  if (gcs_.count(place) == 0) {
    std::unique_ptr<framework::GarbageCollector> gc;
    if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      gc.reset(new framework::DefaultStreamGarbageCollector(place, 0));

      VLOG(10) << "Created GarbageCollector at " << place;
#else
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use CUDA device since it's not compiled with CUDA. "
          "Please recompile or reinstall Paddle with GPU support."));
#endif
    } else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0));

      VLOG(10) << "Created GarbageCollector at " << place;
#else
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use CUDAPinned device since it's not compiled with "
          "CUDA. "
          "Please recompile or reinstall Paddle with GPU support."));
#endif
    } else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
      gc.reset(new framework::XPUGarbageCollector(place, 0));
      VLOG(10) << "Created GarbageCollector at " << place;
#else
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use XPU device since it's not compiled with XPU. "
          "Please recompile or reinstall Paddle with XPU support."));
#endif
    } else if (platform::is_cpu_place(place)) {
      gc.reset(new framework::CPUGarbageCollector(place, 0));
      VLOG(10) << "Created GarbageCollector at " << place;
    } else if (platform::is_ipu_place(place)) {
#if defined(PADDLE_WITH_IPU)
      gc.reset(new framework::IPUGarbageCollector(place, 0));
      VLOG(10) << "Created GarbageCollector at " << place;
#else
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use IPU device since it's not compiled with IPU. "
          "Please recompile or reinstall Paddle with IPU support."));
#endif
    } else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
      if (framework::IsFastEagerDeletionModeEnabled()) {
        gc.reset(
            new framework::CustomDeviceUnsafeFastGarbageCollector(place, 0));
        VLOG(10) << "Created UnsafeFastGarbageCollector at " << place;
      } else {
        gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0));
        VLOG(10) << "Created GarbageCollector at " << place;
      }
#else
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use CustomDevice since it's not compiled with "
          "CustomDevice. "
          "Please recompile or reinstall Paddle with CustomDevice support."));
#endif
    } else {
      PADDLE_THROW(platform::errors::PreconditionNotMet(
          "Unsupported place for garbage collection"));
    }
    gcs_.emplace(place, std::move(gc));
  }

  return gcs_.at(place).get();
}

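// Trace one op in dygraph mode: create the operator, check and complete its
// attributes, apply AMP casting and layout autotune to the inputs, run it on
// `place`, and optionally record a grad node for autograd. The actual work
// happens in TraceOpImpl below.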
template <typename VarType>
void Tracer::TraceOp(const std::string& type,
                     const NameVarMap<VarType>& ins,
                     const NameVarMap<VarType>& outs,
                     framework::AttributeMap attrs,
                     const platform::Place& place,
                     bool trace_backward,
                     const std::map<std::string, std::string>& inplace_map,
                     paddle::framework::AttributeMap* passed_default_attrs_,
                     bool use_default_attr_map) {
  TraceOpImpl<VarType>(type,
                       ins,
                       outs,
                       attrs,
                       place,
                       trace_backward,
                       inplace_map,
                       passed_default_attrs_,
                       use_default_attr_map);
}

template <typename VarType>
void Tracer::TraceOpImpl(const std::string& type,
                         const NameVarMap<VarType>& ins,
                         const NameVarMap<VarType>& outs,
                         framework::AttributeMap& attrs,
                         const platform::Place& place,
                         bool trace_backward,
                         const std::map<std::string, std::string>& inplace_map,
                         paddle::framework::AttributeMap* passed_default_attrs_,
                         bool use_default_attr_map) {
  platform::RecordEvent op_type_record_event(
      type, platform::TracerEventType::Operator, 1);
  platform::ScopedFlushDenormal flush;
  VLOG(4) << "Trace Op: " << type;
  if (FLAGS_use_mkldnn) {
    // If both lists are empty, all ops are enabled (the default for
    // FLAGS_use_mkldnn=1).
    // If the ops_on list is not empty, only the ops in that list are enabled.
    if (!FLAGS_tracer_mkldnn_ops_on.empty()) {
      auto is_on = FLAGS_tracer_mkldnn_ops_on.find(type) != std::string::npos;
      attrs["use_mkldnn"] = is_on;
    } else {
      // If the ops_on list is empty, all ops are enabled except the types in
      // the ops_off list.
      auto is_off = FLAGS_tracer_mkldnn_ops_off.find(type) != std::string::npos;
      attrs["use_mkldnn"] = !is_off;
    }
  }
  auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
  const auto& op_info = op->Info();
  auto* attr_checker = op_info.Checker();
  if (attr_checker) {
    attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true);
  }
  const auto& extra_attr_checkers =
      operators::ExtraInfoUtils::Instance().GetExtraAttrsChecker(type);
  for (const auto& checker : extra_attr_checkers) {
    checker(&attrs, true);
  }

  static paddle::framework::AttributeMap empty_attrs_map = {};
  const paddle::framework::AttributeMap& default_attrs =
      attr_checker == nullptr ? empty_attrs_map
                              : attr_checker->GetDefaultAttrMap();

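  // AMP: depending on the level, inputs may be cast before the op runs.
  // O1 casts per op according to allow/block lists, while O2 casts inputs to
  // pure fp16/bf16. `ins_amp` stays null when no cast is needed.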
  std::unique_ptr<NameVarMap<VarType>> ins_amp = nullptr;
  if (amp_level_ == AmpLevel::O1) {
    if (amp_dtype_ == phi::DataType::FLOAT16) {
      VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type;
      ins_amp = std::make_unique<NameVarMap<VarType>>(
          AutoCastInputs<VarType>(type, ins));
    } else if (amp_dtype_ == phi::DataType::BFLOAT16) {
      VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type;
      ins_amp = std::make_unique<NameVarMap<VarType>>(
          AutoCastBF16Inputs<VarType>(type, ins));
    }
  } else if (amp_level_ == AmpLevel::O2) {
    if (amp_dtype_ == phi::DataType::FLOAT16) {
      VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type;
      ins_amp = std::make_unique<NameVarMap<VarType>>(
          CastPureFp16Inputs<VarType>(type, ins));
    } else if (amp_dtype_ == phi::DataType::BFLOAT16) {
      VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type;
      ins_amp = std::make_unique<NameVarMap<VarType>>(
          CastPureBf16Inputs<VarType>(type, ins));
    }
  }

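  // On GPU, layout autotune may additionally rewrite the inputs and
  // attributes (e.g. choosing a better data layout) before the run.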
  if (platform::is_gpu_place(place)) {
    const auto& new_tmp = ins_amp == nullptr ? ins : *ins_amp;
    const auto& tracer = imperative::GetCurrentTracer();
    ins_amp = std::make_unique<NameVarMap<VarType>>(
        imperative::AutoTuneLayout<VarType>(
            type, new_tmp, outs, &attrs, tracer));
  }

  const auto& new_ins = ins_amp == nullptr ? ins : *ins_amp;

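  // Bind the requested device before running the op; each branch throws if
  // Paddle was built without support for that device type.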
  try {
    if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      platform::SetDeviceId(place.device);
#else
      PADDLE_THROW(platform::errors::PreconditionNotMet(
          "PaddlePaddle should be compiled with GPU support to use "
          "CUDAPlace."));
#endif
    } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
      platform::SetXPUDeviceId(place.device);
#else
      PADDLE_THROW(platform::errors::PreconditionNotMet(
          "PaddlePaddle should be compiled with XPU support to use "
          "XPUPlace."));
#endif
    } else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
      phi::DeviceManager::SetDevice(place);
#else
      PADDLE_THROW(platform::errors::PreconditionNotMet(
          "PaddlePaddle should be compiled with CustomDevice support to use "
          "CustomPlace."));
#endif
    }
    if (!use_default_attr_map) {
      PADDLE_ENFORCE_NOT_NULL(passed_default_attrs_,
                              paddle::platform::errors::PermissionDenied(
                                  "Detected default_attrs = nullptr."));
      VLOG(6) << "Use passed in default attrs";
      OpBase::Run(*op, new_ins, outs, attrs, (*passed_default_attrs_), place);
    } else {
      VLOG(6) << "Use Checker's default attrs";
      if (passed_default_attrs_) {
        // TODO(jiabin): Update this without copy
        *passed_default_attrs_ = default_attrs;
      }
      OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place);
    }
  } catch (platform::EnforceNotMet& exception) {
    framework::AppendErrorOpHint(type, &exception);
    throw std::move(exception);
  } catch (std::exception& ex) {
    PADDLE_THROW(
        platform::errors::Fatal("Operator %s raises an %s exception.\n"
                                "The exception content is:\n%s.",
                                type,
                                platform::demangle(typeid(ex).name()),
                                ex.what()));
  } catch (...) {
    // NOTE: this branch represents a very serious bug with
    // low probability of occurrence, and we can't get its
    // exception content here.
    PADDLE_THROW(platform::errors::Fatal(
        "Operator %s raises an unknown exception.", type));
  }

  if (enable_program_desc_tracing_) {
    VLOG(5) << "Trace op " << type << " into ProgramDesc";
    program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
  }

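  // After the forward run, record a grad node for autograd if any input
  // requires gradients; otherwise nothing is tracked for backward.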
  {
    platform::RecordEvent node_creation_record_event(
        "grad_node_creation", platform::TracerEventType::OperatorInner, 1);

    if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
      PADDLE_ENFORCE_EQ(
          passed_default_attrs_,
          nullptr,
          paddle::platform::errors::PermissionDenied(
              "passed_default_attrs_ is expected to be nullptr when "
              "use_default_attr_map is true, but a non-null "
              "passed_default_attrs_ was given. Please check your usage of "
              "trace_op."));
      CreateGradOpNode(
          *op, new_ins, outs, attrs, default_attrs, place, inplace_map);
    } else {
      VLOG(3) << "No Grad to track for Op: " << type;
    }
    VLOG(6) << "Finish Trace Op: " << type;
  }
}

template void Tracer::TraceOp<VarBase>(
    const std::string& type,
    const NameVarMap<VarBase>& ins,
    const NameVarMap<VarBase>& outs,
    framework::AttributeMap attrs,
    const platform::Place& place,
    bool trace_backward,
    const std::map<std::string, std::string>& inplace_map,
    paddle::framework::AttributeMap* default_attrs,
    bool use_default_attr_map);

template void Tracer::TraceOp<egr::EagerVariable>(
    const std::string& type,
    const NameVarMap<egr::EagerVariable>& ins,
    const NameVarMap<egr::EagerVariable>& outs,
    framework::AttributeMap attrs,
    const platform::Place& place,
    bool trace_backward,
    const std::map<std::string, std::string>& inplace_map_,
    paddle::framework::AttributeMap* default_attrs,
    bool use_default_attr_map);

void Tracer::TraceOp(const std::string& type,
                     const NameVarBaseMap& ins,
                     const NameVarBaseMap& outs,
                     framework::AttributeMap attrs,
                     const std::map<std::string, std::string>& inplace_map) {
  TraceOp<VarBase>(type,
                   ins,
                   outs,
                   std::move(attrs),
                   expected_place_,
                   has_grad_,
                   inplace_map);
}

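// An illustrative (hypothetical) call site for the VarBase overload above.
// Here `x`, `y`, and `out` are assumed to be std::shared_ptr<VarBase>
// created by the caller, and the attribute shown is only an example:
//
//   imperative::NameVarBaseMap ins = {{"X", {x}}, {"Y", {y}}};
//   imperative::NameVarBaseMap outs = {{"Out", {out}}};
//   tracer->TraceOp("elementwise_add", ins, outs,
//                   framework::AttributeMap{{"axis", -1}});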
void Tracer::TraceOp(const std::string& type,
                     const NameTensorMap& ins,
                     const NameTensorMap& outs,
                     paddle::framework::AttributeMap& attrs,
                     const paddle::platform::Place& place,
                     paddle::framework::AttributeMap* default_attrs,
                     bool use_default_attr_map,
                     const std::map<std::string, std::string>& inplace_map) {
  VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: "
          << use_default_attr_map;
  TraceOpImpl<egr::EagerVariable>(type,
                                  ins,
                                  outs,
                                  attrs,
                                  place,
                                  false,
                                  inplace_map,
                                  default_attrs,
                                  use_default_attr_map);
}

void Tracer::TraceOp(const std::string& type,
                     const NameTensorMap& ins,
                     const NameTensorMap& outs,
                     paddle::framework::AttributeMap attrs) {
  VLOG(6) << "Running On Eager TraceOp(4 args): ";
  TraceOpImpl<egr::EagerVariable>(
      type, ins, outs, attrs, expected_place_, false, {}, nullptr, true);
}

void Tracer::TraceOp(const std::string& type,
                     const NameTensorMap& ins,
                     const NameTensorMap& outs,
                     paddle::framework::AttributeMap& attrs,
                     const std::map<std::string, std::string>& inplace_map) {
  VLOG(6) << "Running On Eager TraceOp(less): ";
  TraceOpImpl<egr::EagerVariable>(type,
                                  ins,
                                  outs,
                                  attrs,
                                  expected_place_,
                                  false,
                                  inplace_map,
                                  nullptr,
                                  true);
}

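// Record the place on which subsequently traced ops are expected to run.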
void Tracer::SetExpectedPlace(platform::Place place) {
  expected_place_ = place;
}

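// Gradients are required iff trace_backward is set and at least one input
// does not stop gradient; in that case all outputs are marked as requiring
// gradients too.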
bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins,
                                 const NameVarBaseMap& outs,
                                 bool trace_backward) {
  if (!trace_backward) return false;

  for (const auto& name_pair : ins) {
    for (const auto& var_base : name_pair.second) {
      if (!var_base->OverridedStopGradient()) {
        VLOG(6) << "Found input " << var_base->Name()
                << " whose GeneratedGrad is True";
        PassStopGradient(outs, var_base->OverridedStopGradient());
        return true;
      }
    }
  }
  return false;
}

bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins,
                                 const NameTensorMap& outs,
                                 bool trace_backward) {
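  // In the eager mode path, autograd bookkeeping is handled outside the
  // tracer, so no grad node needs to be created by the legacy tracer here.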
  return false;
}

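// Dry-run helper: build the op and a CPU execution context only to query
// which phi kernel signature would be selected, without running the kernel.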
phi::KernelSignature Tracer::GetExpectedKernelSignature(
    const std::string& type,
    const NameTensorMap& ins,
    const NameTensorMap& outs,
    framework::AttributeMap attrs) const {
  auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
  framework::RuntimeContext ctx({}, {});
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(phi::CPUPlace());
  const auto& op_info = op->Info();
  auto* attr_checker = op_info.Checker();
  if (attr_checker) {
    attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true);
  }
  static paddle::framework::AttributeMap empty_attrs_map = {};
  const paddle::framework::AttributeMap& default_attrs =
      attr_checker == nullptr ? empty_attrs_map
                              : attr_checker->GetDefaultAttrMap();
  auto dygraph_exe_ctx =
      imperative::DygraphExecutionContext<egr::EagerVariable>(
          *op,
          framework::Scope(),
          *dev_ctx,
          ctx,
          ins,
          outs,
          attrs,
          default_attrs);
  auto* opbase_with_kernel =
      dynamic_cast<framework::OperatorWithKernel*>(op.get());
  PADDLE_ENFORCE_NE(opbase_with_kernel,
                    nullptr,
                    platform::errors::InvalidArgument(
                        "This op type `%s` is not an OperatorWithKernel; only "
                        "an OperatorWithKernel can get a KernelSignature.",
                        type));
  if (phi::KernelFactory::Instance().HasStructuredKernel(type)) {
    return phi::KernelSignature(op->Type().c_str());
  } else {
    return phi::KernelSignature(std::move(
        opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx)));
  }
}

}  // namespace imperative
}  // namespace paddle