// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/apu/subgraph_compute.h"
#include <dlfcn.h>
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/apu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/utils/io.h"
#include "lite/utils/md5.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace apu {

B
barry-ai 已提交
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
// Generate the model name by using md5 hashes based on:
// 1. the sorted variable input names
// 2. the shapes of the origin input tensors
// 3. the sorted variable output names
std::string DeviceProgram::GenerateModelName(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims) {
  std::ostringstream os;
  CHECK_EQ(input_names.size(), origin_idims.size());
  for (int i = 0; i < input_names.size(); i++) {
    os << input_names[i];
    for (auto dim : origin_idims[i]) {
      os << dim;
    }
  }
  for (auto output_name : output_names) {
    os << output_name;
  }
  return MD5(os.str());
}

// Deserialize the generated model
bool DeviceProgram::LoadFromCacheFile(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::string& model_cache_dir) {
  int status;

  // Generate the model name if not initialized
  if (model_name_.empty()) {
    model_name_ = GenerateModelName(input_names, output_names, origin_idims);
66
  }
B
barry-ai 已提交
67 68 69
  // Load from the cached model file
  auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
  VLOG(3) << "[APU] Load model from " << model_path;
70

B
barry-ai 已提交
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
  std::vector<char> compilationBuffer;
  if (!ReadFile(model_path, &compilationBuffer)) {
    LOG(WARNING) << "[NPU] Open " << model_path << " for reading failed!";
    return false;
  }
  model_ = nullptr;
  compilation_ = nullptr;
  status = NeuronModel_restoreFromCompiledNetwork(
      &model_, &compilation_, &compilationBuffer[0], compilationBuffer.size());
  if (status != NEURON_NO_ERROR) {
    LOG(WARNING) << "[APU] Load model failed!" << compilationBuffer.size();
    return false;
  }

  VLOG(3) << "[APU] Complete Load model!";

  // Deserialize the preicisions and shapes of the origin output tensors from
88
  // the cached configuration file
B
barry-ai 已提交
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
  auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
  VLOG(3) << "[APU] Load configuration from " << config_path;
  std::vector<char> config_buffer;
  if (!ReadFile(config_path, &config_buffer)) {
    LOG(WARNING) << "[APU] read from " << config_path << " failed!";
    return false;
  }

  std::string str(config_buffer.begin(), config_buffer.end());
  // Parse the precision and shapes of the output tensors
  auto output_options = Split<std::string>(str, ";");
  CHECK_EQ(output_options.size(), output_names.size());
  origin_otypes_.resize(output_names.size());
  origin_odims_.resize(output_names.size());
  for (int i = 0; i < output_names.size(); i++) {
    auto items = Split<std::string>(output_options[i], ":");
    CHECK_EQ(items.size(), 2);  // precision and shapes
    origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(items[0]));
    origin_odims_[i] = Split<int64_t>(items[1], ",");
  }
  return true;
}

// Build the Neuron (NIR) model online from the origin Paddle program,
// compile it to a DLA model, and optionally serialize the model plus the
// output precisions/shapes into the model cache directory.
bool DeviceProgram::BuildGraphAndCacheToFile(
    RuntimeProgram* origin_program,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::vector<Tensor*>& origin_itensors,
    const std::vector<Tensor*>& origin_otensors,
    const std::string& model_cache_dir) {
  // Wall-clock helper in microseconds, used only for timing logs.
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };

  auto start_time = GetCurrentUS();

  unsigned int version;
  Neuron_getVersion(&version);
  VLOG(3) << "Neuron Adapter version: " << version;

  int status = 0;
  subgraph::apu::Graph graph;
  int neuron_errCode = NeuronModel_create(&model_);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Failed to create the neuron model!";
    return false;
  }
  graph.set_model(model_);
  graph.set_input_names(input_names);
  graph.set_output_names(output_names);

  // Convert all of ops and their input vars and weights and added into the APU
  // NIR graph
  const auto& bridges = subgraph::SubgraphBridgeRegistry::Instance();
  const auto& insts = origin_program->instructions(kRootBlockIdx);

  for (auto& inst : insts) {
    auto op = const_cast<OpLite*>(inst.op());
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
    // Bail out if any op has no APU bridge so the caller can fall back.
    if (!bridges.Exists(op_type, TARGET(kAPU))) {
      return false;
    }

    auto kernel = inst.kernel();
    status |=
        bridges.Select(op_type, TARGET(kAPU))(reinterpret_cast<void*>(&graph),
                                              const_cast<OpLite*>(op),
                                              const_cast<KernelBase*>(kernel));
    if (subgraph::CHECK_FAILED(status)) {
      return false;
    }
  }

  // Get the index of input tensors
  std::vector<uint32_t> input_indices;
  for (size_t i = 0; i < input_names.size(); i++) {
    CHECK(graph.Has(input_names[i])) << "[APU] Failed to find input node "
                                     << input_names[i];
    auto index = graph.Get(input_names[i])->index();
    input_indices.push_back(index);
    VLOG(3) << "[APU] Input[" << i << "] name " << input_names[i] << " dims "
            << origin_itensors[i]->dims() << " index " << index;
  }

  // Get the index of output tensors
  std::vector<uint32_t> output_indices;
  for (size_t i = 0; i < output_names.size(); i++) {
    CHECK(graph.Has(output_names[i])) << "[APU] Failed to find output node "
                                      << output_names[i];
    // Allocate the int8 output buffer before execution.
    origin_otensors[i]->mutable_data<int8_t>();
    auto index = graph.Get(output_names[i])->index();
    output_indices.push_back(index);
    VLOG(3) << "[APU] Output[" << i << "] name " << output_names[i] << " dims "
            << origin_otensors[i]->dims() << " index " << index;
  }

  // Indentify the input and output tensors of the neuron model
  NeuronModel_identifyInputsAndOutputs(model_,
                                       input_indices.size(),
                                       &input_indices[0],
                                       output_indices.size(),
                                       &output_indices[0]);
  neuron_errCode = NeuronModel_finish(model_);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode;
    return false;
  }

  VLOG(1) << "[APU] APU NIR model created, Create cost "
          << GetCurrentUS() - start_time << " us";

  start_time = GetCurrentUS();
  compilation_ = lite::apu::Device::Global().Build(model_);
  if (compilation_ == nullptr) {
    LOG(WARNING) << "[APU] Build APU DLA model failed!";
    return false;
  }
  VLOG(1) << "[APU] APU DLA model created, Build cost "
          << GetCurrentUS() - start_time << " us";

  start_time = GetCurrentUS();
  // Record the precisions and shapes of the origin output tensors so they can
  // be restored when the model is later loaded from cache.
  CHECK_EQ(origin_otensors.size(), output_names.size());
  origin_otypes_.resize(output_names.size());
  origin_odims_.resize(output_names.size());
  for (size_t i = 0; i < output_names.size(); i++) {
    origin_otypes_[i] = origin_otensors[i]->precision();
    origin_odims_[i] = origin_otensors[i]->dims().Vectorize();
  }
  if (!model_cache_dir.empty()) {
    // Save the generated model to file
    auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
    VLOG(3) << "[APU] Save model to " << model_path;

    // Initialize to 0 so a failed size query cannot leave it indeterminate.
    size_t compilationSize = 0;
    status = NeuronCompilation_getCompiledNetworkSize(compilation_,
                                                      &compilationSize);
    std::vector<char> model_buffer;
    if (status == NEURON_NO_ERROR) {
      // Serialization DLA
      model_buffer.resize(compilationSize);
      status = NeuronCompilation_storeCompiledNetwork(
          compilation_, &model_buffer[0], compilationSize);
      if (status != NEURON_NO_ERROR) {
        LOG(WARNING) << "[APU] Serialization DLA failed!";
      }

      VLOG(3) << "[APU] Export the model to " << model_path;
      if (!WriteFile(model_path, model_buffer)) {
        LOG(WARNING) << "[APU] Open " << model_path << " for writting failed!";
      }
    }

    // Serialize the precisions and shapes of the origin output tensors into the
    // configuration file
    std::ostringstream os;
    for (size_t i = 0; i < output_names.size(); i++) {
      os << static_cast<int32_t>(origin_otypes_[i]) << ":";
      for (auto dim : origin_odims_[i]) {
        os << dim << ",";
      }
      os << ";";
    }
    auto str = os.str();
    std::vector<char> config_buffer(str.begin(), str.end());
    auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
    VLOG(3) << "[APU] Save configuration to " << config_path;
    if (!WriteFile(config_path, config_buffer)) {
      LOG(WARNING) << "[APU] Open " << config_path << " for writting failed!";
    }

    // Workaround: after calling storeCompiledNetwork, model will be modificated
    // that will cause a low performace, so we need restore it. after we fix
    // this bug, below code will be deleted.
    // Only run the restore when serialization actually succeeded; otherwise
    // model_buffer is empty/garbage and the restore would read invalid memory.
    // On failure we keep the in-memory model that was just built.
    if (status == NEURON_NO_ERROR && !model_buffer.empty()) {
      NeuronCompilation_free(compilation_);
      NeuronModel_free(model_);
      model_ = nullptr;
      compilation_ = nullptr;
      status = NeuronModel_restoreFromCompiledNetwork(
          &model_, &compilation_, &model_buffer[0], compilationSize);
      if (status != NEURON_NO_ERROR) {
        LOG(WARNING) << "[APU] Load model failed!" << compilationSize;
        return false;
      }
      VLOG(3) << "[APU] Complete Load model!";
    }
    VLOG(1) << "[APU] APU DLA model cached, cache cost "
            << GetCurrentUS() - start_time << " us";
  }

  return true;
}

B
barry-ai 已提交
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
bool SubgraphEngine::BuildDeviceProgram() {
  // Check if the cache device program exists
  if (!device_programs_.count(origin_idims_)) {
    auto device_program = std::make_shared<DeviceProgram>();
    // Obtain the model cache dir from the NPU Context of the subgraph op
    auto model_cache_dir =
        ctx_->As<APUContext>().SubgraphModelCacheDir(exec_scope_);
    VLOG(3) << "[APU] Getting subgraph_model_cache_dir: " << model_cache_dir;
    // Check and load if the cached model and configuration file exists
    if (model_cache_dir.empty() ||
        !device_program->LoadFromCacheFile(
            input_names_, output_names_, origin_idims_, model_cache_dir)) {
      // Build the model online, including converting the paddle ops to the NIR
      // nodes, building the MTK NIR graph, and compile MTK NIR graph to dla
      if (!origin_program_) {
        BuildOriginProgram();
      }
      CHECK(origin_program_) << "[APU] The origin program is not initialized!";
      CHECK_GT(origin_program_->instructions().size(), 0)
          << "[APU] No instructions found in the origin program!";
      if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(),
                                                    input_names_,
                                                    output_names_,
                                                    origin_idims_,
                                                    origin_itensors_,
                                                    origin_otensors_,
                                                    model_cache_dir)) {
        return false;
      }
    }
    if (device_program->model_ == nullptr) {
      LOG(WARNING) << "dla create fail!";
      return false;
    }
    device_programs_[origin_idims_] = device_program;
  }

  // Get the index of output tensors
  auto device_program = device_programs_[origin_idims_];
  CHECK(device_program && device_program->model_);
  for (int i = 0; i < output_names_.size(); i++) {
    origin_otensors_[i]->Resize(device_program->origin_odims_[i]);
    origin_otensors_[i]->mutable_data<int8_t>();
    VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
            << origin_otensors_[i]->dims() << " memory_size "
            << origin_otensors_[i]->memory_size();
  }
}

336
bool SubgraphEngine::LaunchDeviceProgram() {
H
hong19860320 已提交
337 338 339 340 341 342
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };

B
barry-ai 已提交
343 344 345 346 347 348 349
  if (device_programs_.count(origin_idims_) == 0 ||
      device_programs_[origin_idims_]->model_ == nullptr) {
    return LaunchOriginProgram();
  }

  auto device_program = device_programs_[origin_idims_];

H
hong19860320 已提交
350
  auto start_time = GetCurrentUS();
351
  NeuronExecution* run = NULL;
B
barry-ai 已提交
352 353
  int neuron_errCode =
      NeuronExecution_create(device_program->compilation_, &run);
H
hong19860320 已提交
354 355
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] Build APU runtime failed!";
356
    return false;
H
hong19860320 已提交
357 358 359 360
  }

  // Set input buffer
  for (size_t i = 0; i < origin_itensors_.size(); i++) {
361 362
    auto origin_data = origin_itensors_[i]->mutable_data<int8_t>();
    auto converted_data = reinterpret_cast<uint8_t*>(origin_data);
H
hong19860320 已提交
363
    for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
364 365
      converted_data[j] =
          static_cast<uint8_t>(static_cast<int16_t>(origin_data[j]) + 128);
H
hong19860320 已提交
366
    }
367
    NeuronExecution_setInput(
368
        run, i, NULL, converted_data, origin_itensors_[i]->memory_size());
H
hong19860320 已提交
369 370 371 372
  }

  // Set output buffer
  for (size_t i = 0; i < origin_otensors_.size(); i++) {
373 374
    NeuronExecution_setOutput(
        run,
H
hong19860320 已提交
375 376 377 378 379 380
        i,
        NULL,
        reinterpret_cast<void*>(origin_otensors_[i]->raw_data()),
        origin_otensors_[i]->memory_size());
  }

381
  neuron_errCode = NeuronExecution_compute(run);
H
hong19860320 已提交
382 383
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "Fail to run execution!" << neuron_errCode;
384
    return false;
H
hong19860320 已提交
385 386 387
  }

  for (size_t i = 0; i < origin_otensors_.size(); i++) {
388 389
    auto converted_data = origin_otensors_[i]->mutable_data<int8_t>();
    auto origin_data = reinterpret_cast<uint8_t*>(converted_data);
H
hong19860320 已提交
390
    for (int j = 0; j < origin_otensors_[i]->data_size(); j++) {
391 392
      converted_data[j] =
          static_cast<int8_t>(static_cast<int16_t>(origin_data[j]) - 128);
H
hong19860320 已提交
393 394
    }
  }
395
  NeuronExecution_free(run);
B
barry-ai 已提交
396
  VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
397
  return true;
H
hong19860320 已提交
398 399
}

400
SubgraphEngine::~SubgraphEngine() {
B
barry-ai 已提交
401 402 403 404 405 406 407
  for (auto& device_program : device_programs_) {
    if (device_program.second->compilation_) {
      NeuronCompilation_free(device_program.second->compilation_);
    }
    if (device_program.second->model_) {
      NeuronModel_free(device_program.second->model_);
    }
408 409 410
  }
}

H
hong19860320 已提交
411 412 413
void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  engine_.reset(new SubgraphEngine(ctx_.get(),
414 415 416
                                   param.block_idx,
                                   param.program_desc,
                                   param.exec_scope,
H
hong19860320 已提交
417
                                   param.input_data_names,
418
                                   param.output_data_names));
H
hong19860320 已提交
419 420 421 422 423
  CHECK(engine_);
}

// Delegate each kernel invocation to the engine built in PrepareForRun().
void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Run();
}

}  // namespace apu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Register the APU subgraph kernel: it consumes and produces int8 NCHW
// tensors resident on the host, dispatching the subgraph to the APU device.
REGISTER_LITE_KERNEL(subgraph,
                     kAPU,
                     kInt8,
                     kNCHW,
                     paddle::lite::kernels::apu::SubgraphCompute,
                     def)
    .BindInput("Inputs",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kInt8),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Outputs",
                {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kInt8),
                                       DATALAYOUT(kNCHW))})
    .Finalize();