// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <functional>
#include <utility>
#include "hiai_ir_build.h"  // NOLINT
#include "lite/backends/npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
#include "lite/kernels/npu/bridges/utility.h"
#include "lite/utils/io.h"
#include "lite/utils/md5.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace npu {

// Generate the model name by hashing (MD5) the concatenation of:
// 1. the sorted variable input names
// 2. the shapes of the origin input tensors
// 3. the sorted variable output names
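// For example (illustrative values only), inputs {"x", "y"} with dims
// {1, 3, 224, 224} and {1, 64} plus output {"out"} produce the concatenated
// string "x13224224y164out", whose MD5 digest becomes the model name used for
// the cached .om and .cfg files.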
std::string DeviceProgram::GenerateModelName(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims) {
  std::ostringstream os;
  CHECK_EQ(input_names.size(), origin_idims.size());
  for (int i = 0; i < input_names.size(); i++) {
    os << input_names[i];
    for (auto dim : origin_idims[i]) {
      os << dim;
    }
  }
  for (auto output_name : output_names) {
    os << output_name;
  }
  return MD5(os.str());
}

// Deserialize the generated model and the precisions and dimensions of the
// origin output tensors of the subgraph op from the cached files
bool DeviceProgram::LoadFromCacheFile(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::string& model_cache_dir) {
  // Generate the model name if not initialized
  if (model_name_.empty()) {
    model_name_ = GenerateModelName(input_names, output_names, origin_idims);
  }
  // Load from the cached model file, return a HiAI model manager client for
  // inference
  auto model_path = model_cache_dir + "/" + model_name_ + ".om";
  VLOG(3) << "[NPU] Load model from " << model_path;
  std::vector<char> model_buffer;
  if (!ReadFile(model_path, &model_buffer)) {
    LOG(WARNING) << "[NPU] read from " << model_path << " failed!";
    return false;
  }
  bool model_comp = false;
  model_client_ =
      lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp);
  if (!model_client_) {
    LOG(WARNING) << "[NPU] Load model failed!";
    return false;
  }
  // Rewrite with the compatible model data if the cached
  // model file is incompatible with the current device
  if (!model_comp) {
    VLOG(3) << "[NPU] Export the compatible model to " << model_path;
    if (!WriteFile(model_path, model_buffer)) {
      LOG(WARNING) << "[NPU] Open " << model_path << " for writing failed!";
    }
  }
  // Deserialize the precisions and shapes of the origin output tensors from the
  // cached configuration file
  auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
  VLOG(3) << "[NPU] Load configuration from " << config_path;
  std::vector<char> config_buffer;
  if (!ReadFile(config_path, &config_buffer)) {
    LOG(WARNING) << "[NPU] read from " << config_path << " failed!";
    return false;
  }
  std::string config_str(config_buffer.begin(), config_buffer.end());
  // Parse the precisions and shapes of the output tensors
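  // The configuration is a ';'-terminated list with one entry per output in
  // the form "<precision>:<d0>,<d1>,...,", e.g. "1:1,1000,;" (illustrative),
  // mirroring the serialization in BuildGraphAndCacheToFile() below.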
  auto output_options = Split<std::string>(config_str, ";");
  CHECK_EQ(output_options.size(), output_names.size());
  origin_otypes_.resize(output_names.size());
  origin_odims_.resize(output_names.size());
  for (int i = 0; i < output_names.size(); i++) {
    auto items = Split<std::string>(output_options[i], ":");
    CHECK_EQ(items.size(), 2);  // precision and shapes
    origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(items[0]));
    origin_odims_[i] = Split<int64_t>(items[1], ",");
  }
  return true;
}

bool DeviceProgram::BuildGraphAndCacheToFile(
    const std::vector<Instruction>& origin_program,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::vector<Tensor*>& origin_otensors,
    const std::string& model_cache_dir) {
  // Generate the model name if not initialized
  if (model_name_.empty()) {
    model_name_ = GenerateModelName(input_names, output_names, origin_idims);
  }
  // Convert all of the ops and their input vars and weights to HiAI IR nodes,
  // then add them to the HiAI IR graph
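  // Each registered bridge translates one Paddle op into HiAI IR node(s); if
  // any op in the block has no NPU bridge, give up here so the engine can fall
  // back to running the origin program.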
  int status = 0;
  CHECK(!origin_program.empty()) << "no instructions";
  subgraph::npu::Graph graph;
  const auto& bridges = subgraph::Registry::Instance();
  for (auto& inst : origin_program) {
    auto op = const_cast<OpLite*>(inst.op());
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
    if (!bridges.Exists(op_type, TARGET(kNPU))) {
      return false;
    }
    auto kernel = inst.kernel();
    status |= bridges.Select(op_type, TARGET(kNPU))(
        reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
    if (subgraph::CHECK_FAILED(status)) {
      return false;
    }
  }
  // Collect the input and output nodes of the HiAI IR graph
  std::vector<ge::Operator> device_inodes;
  for (size_t i = 0; i < input_names.size(); i++) {
    CHECK(graph.Has(input_names[i]) && graph.Get(input_names[i])->is_data());
    device_inodes.push_back(*graph.Get(input_names[i])->data());
  }
  std::vector<ge::Operator> device_onodes;
  for (size_t i = 0; i < output_names.size(); i++) {
    CHECK(graph.Has(output_names[i]));
    device_onodes.push_back(*graph.Get(output_names[i])->data());
  }
  // Build the HiAI IR graph to the HiAI om model
  std::vector<char> model_buffer;
  if (!lite::npu::Device::Global().Build(
          device_inodes, device_onodes, &model_buffer)) {
    LOG(WARNING) << "[NPU] Build model failed!";
    return false;
  }
  // Load the HiAI om model and create a HiAI model manager client (from the
  // HiAI Service) to run inference.
  bool model_comp = true;
  model_client_ =
      lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp);
  if (!model_client_) {
    LOG(WARNING) << "[NPU] Load model failed!";
    return false;
  }
  // Update the precisions and dimensions of the origin output tensors
  CHECK_EQ(origin_otensors.size(), output_names.size());
  origin_otypes_.resize(output_names.size());
  origin_odims_.resize(output_names.size());
  for (size_t i = 0; i < output_names.size(); i++) {
    origin_otypes_[i] = graph.Get(output_names[i])->precision();
    origin_odims_[i] = origin_otensors[i]->dims().Vectorize();
  }
  if (!model_cache_dir.empty()) {
    // Save the generated model to a file for model caching or offline model
    // generation
    auto model_path = model_cache_dir + "/" + model_name_ + ".om";
    VLOG(3) << "[NPU] Save model to " << model_path;
    if (!WriteFile(model_path, model_buffer)) {
      LOG(WARNING) << "[NPU] Open " << model_path << " for writing failed!";
    }
    // Serialize the precisions and shapes of the origin output tensors into the
    // configuration file
    std::ostringstream os;
    for (int i = 0; i < output_names.size(); i++) {
      os << static_cast<int32_t>(origin_otypes_[i]) << ":";
      for (auto dim : origin_odims_[i]) {
        os << dim << ",";
      }
      os << ";";
    }
    auto str = os.str();
    std::vector<char> config_buffer(str.begin(), str.end());
    auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
    VLOG(3) << "[NPU] Save configuration to " << config_path;
    if (!WriteFile(config_path, config_buffer)) {
      LOG(WARNING) << "[NPU] Open " << config_path << " for writing failed!";
    }
  }
  return true;
}

bool DeviceProgram::ShareBufferWithOriginTensors(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    std::vector<Tensor*>* origin_itensors,
    std::vector<Tensor*>* origin_otensors,
    std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
    std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors) {
  CHECK(!model_name_.empty() && model_client_);
  // Query the dimensions of the device input and output tensors if not
  // initialized
  if (device_idims_.empty() || device_odims_.empty()) {
    if (model_client_->GetModelIOTensorDim(
            model_name_, device_idims_, device_odims_) != hiai::AI_SUCCESS) {
      LOG(WARNING)
          << "[NPU] Get the dimensions of input and output tensors failed!";
      return false;
    }
  }
  // Check the dimensions of the device tensors and the origin tensors
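  // HiAI reports fixed NCHW dimensions, while the origin tensors may have a
  // different rank, so only the total element counts are required to match.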
  CHECK_EQ(device_itensors->size(), input_names.size());
  CHECK_EQ(device_otensors->size(), output_names.size());
  CHECK_EQ(origin_otypes_.size(), output_names.size());
  CHECK_EQ(origin_odims_.size(), output_names.size());
  CHECK_EQ(device_idims_.size(), input_names.size());
  CHECK_EQ(device_odims_.size(), output_names.size());
  for (int i = 0; i < input_names.size(); i++) {
    VLOG(3) << "[NPU] Inputs[" << i << "] name: " << input_names[i]
            << " origin dims:" << (*origin_itensors)[i]->dims().repr()
            << " device dims: {" << device_idims_[i].GetNumber() << ","
            << device_idims_[i].GetChannel() << ","
            << device_idims_[i].GetHeight() << ","
            << device_idims_[i].GetWidth() << "}";
    CHECK_EQ((*origin_itensors)[i]->dims().production(),
             device_idims_[i].GetNumber() * device_idims_[i].GetChannel() *
                 device_idims_[i].GetHeight() * device_idims_[i].GetWidth());
    VLOG(3) << "[NPU] Init the input tensors for the device program and share "
               "their buffers with the origin input tensors";
    // Re-initializing the device tensor frees the shared buffer, so copy its
    // data to a temporary tensor first
    Tensor tmp;
    tmp.CopyDataFrom(*(*origin_itensors)[i]);
    (*device_itensors)[i]->Init(&(device_idims_[i]));

    std::memcpy(
        (*device_itensors)[i]->GetBuffer(), tmp.raw_data(), tmp.memory_size());

    // Share the data buffer between device_itensor and origin_itensor
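    // After ResetBuffer() the origin input tensor points directly at the HiAI
    // AiTensor's buffer, so data written to it later lands in the device input
    // memory without an extra copy (hence the zero-copy run below).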
    std::shared_ptr<Buffer> buffer =
        std::make_shared<Buffer>((*device_itensors)[i]->GetBuffer(),
                                 lite_api::TargetType::kHost,
                                 (*device_itensors)[i]->GetSize());
    (*origin_itensors)[i]->ResetBuffer(buffer,
                                       (*device_itensors)[i]->GetSize());
  }
  for (int i = 0; i < output_names.size(); i++) {
    (*origin_otensors)[i]->set_precision(origin_otypes_[i]);
    (*origin_otensors)[i]->Resize(origin_odims_[i]);
    VLOG(3) << "[NPU] Outputs[" << i << "] name: " << output_names[i]
            << " origin dims:" << (*origin_otensors)[i]->dims().repr()
            << " device dims: {" << device_odims_[i].GetNumber() << ","
            << device_odims_[i].GetChannel() << ","
            << device_odims_[i].GetHeight() << ","
            << device_odims_[i].GetWidth() << "}";
    CHECK_EQ((*origin_otensors)[i]->dims().production(),
             device_odims_[i].GetNumber() * device_odims_[i].GetChannel() *
                 device_odims_[i].GetHeight() * device_odims_[i].GetWidth());
    (*device_otensors)[i]->Init(&(device_odims_[i]));
    VLOG(3) << "[NPU] Init the output tensors for the device program and share "
               "their buffers with the origin output tensors";
    // Share the data buffer between device_otensor and origin_otensor
    std::shared_ptr<Buffer> buffer =
        std::make_shared<Buffer>((*device_otensors)[i]->GetBuffer(),
                                 lite_api::TargetType::kHost,
                                 (*device_otensors)[i]->GetSize());
    (*origin_otensors)[i]->ResetBuffer(buffer,
                                       (*device_otensors)[i]->GetSize());
  }
  return true;
}

bool DeviceProgram::ZeroCopyRun(
    std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
    std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors) {
  CHECK(!model_name_.empty() && model_client_);
  // Run the HiAI model by name; the context key apparently must be
  // "model_name"
  std::string key = "model_name";
  hiai::AiContext model_context;
  model_context.AddPara(key, model_name_);
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };
  int istamp;
  auto start_time = GetCurrentUS();
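  // Process() is assumed to run inference synchronously, returning within the
  // 1000 ms timeout; istamp receives a task stamp from the HiAI service and is
  // not used further here.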
  CHECK_EQ(model_client_->Process(
               model_context, *device_itensors, *device_otensors, 1000, istamp),
           hiai::AI_SUCCESS);
  VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
  return true;
}

bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
  // Obtain the origin input tensors, and create the origin output tensors
  // (don't try to access them before launching the device program or the
  // origin program)
  PrepareWorkspaceForOriginProgram();
  // Create the device input and output tensors, but don't initialize their
  // dimensions yet
  device_itensors_.resize(input_names_.size());
  for (int i = 0; i < input_names_.size(); i++) {
    device_itensors_[i].reset(new hiai::AiTensor);
    CHECK(device_itensors_[i]);
  }
  device_otensors_.resize(output_names_.size());
  for (int i = 0; i < output_names_.size(); i++) {
    device_otensors_[i].reset(new hiai::AiTensor);
    CHECK(device_otensors_[i]);
  }
  return true;
}

bool SubgraphEngine::BuildDeviceProgram() {
  // Check whether a cached device program already exists
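  // device_programs_ is keyed by the input dimensions, so a separate om model
  // is built (and cached) for every distinct set of input shapes.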
  if (!device_programs_.count(origin_idims_)) {
    auto device_program = std::make_shared<DeviceProgram>();
    // Obtain the model cache dir from the NPU Context of the subgraph op
    auto model_cache_dir = ctx_->As<NPUContext>().SubgraphModelCacheDir();
    VLOG(3) << "[NPU] Subgraph model_cache_dir: " << model_cache_dir;
    // Load the cached model and configuration files if they exist
    if (model_cache_dir.empty() ||
        !device_program->LoadFromCacheFile(
            input_names_, output_names_, origin_idims_, model_cache_dir)) {
      // Build the model online: convert the Paddle ops to HiAI IR nodes, build
      // the HiAI IR graph into the om model, then load it as a new HiAI model
      // manager client for inference.
      if (origin_program_.empty()) {
        BuildOriginProgram();
      }
      CHECK(!origin_program_.empty()) << "no instructions";
      if (!device_program->BuildGraphAndCacheToFile(origin_program_,
                                                    input_names_,
                                                    output_names_,
                                                    origin_idims_,
                                                    origin_otensors_,
                                                    model_cache_dir)) {
        return false;
      }
    }
    if (device_program->model_client_ == nullptr) {
      return false;
    }
    device_programs_[origin_idims_] = device_program;
  }
  auto device_program = device_programs_[origin_idims_];
  CHECK(device_program && device_program->model_client_);
  return device_program->ShareBufferWithOriginTensors(input_names_,
                                                      output_names_,
                                                      &origin_itensors_,
                                                      &origin_otensors_,
                                                      &device_itensors_,
                                                      &device_otensors_);
}

bool SubgraphEngine::LaunchDeviceProgram() {
  // Fall back to launching the origin program if the device program can't be
  // found or the model client isn't initialized.
  if (device_programs_.count(origin_idims_) == 0 ||
      device_programs_[origin_idims_]->model_client_ == nullptr) {
    return LaunchOriginProgram();
  }
  auto device_program = device_programs_[origin_idims_];
  if (!device_program->model_client_) {
    return LaunchOriginProgram();
  }
  return device_program->ZeroCopyRun(&device_itensors_, &device_otensors_);
}

void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  engine_.reset(new SubgraphEngine(ctx_.get(),
                                   param.sub_block_idx,
                                   param.sub_block_desc,
                                   param.input_data_names,
                                   param.output_data_names,
                                   param.scope));
  CHECK(engine_);
}

void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Run();
}

}  // namespace npu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

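// Register the subgraph kernel for the NPU target; inputs and outputs are kept
// as host tensors with any precision, since they are handed to the HiAI client
// inside the kernel itself.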
REGISTER_LITE_KERNEL(subgraph,
                     kNPU,
                     kAny,
                     kNCHW,
                     paddle::lite::kernels::npu::SubgraphCompute,
                     def)
    .BindInput("Inputs",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .BindOutput("Outputs",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .Finalize();