// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <cstring>
#include <string>
#include <utility>
#include <vector>
#include "hiai_ir_build.h"  // NOLINT
#include "lite/backends/npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace npu {

int SubgraphEngine::BuildDeviceProgram() {
  int status = 0;
33 34
  // Convert all of ops and their input vars and weights and added into the NPU
  // HiAI IR graph
35 36 37 38 39 40 41 42
  subgraph::npu::Graph graph;
  const auto& bridges = subgraph::Registry::Instance();
  for (auto& inst : origin_program_) {
    auto op = inst.op();
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
43
    if (!bridges.Exists(op_type, TARGET(kNPU))) {
44 45
      return subgraph::FAILED;
    }
46
    auto kernel = inst.kernel();
47 48
    status |=
        bridges.Select(op_type, TARGET(kNPU))(reinterpret_cast<void*>(&graph),
49 50
                                              const_cast<OpLite*>(op),
                                              const_cast<KernelBase*>(kernel));
51 52 53 54
    if (subgraph::CHECK_FAILED(status)) {
      return subgraph::FAILED;
    }
  }
55 56 57 58 59 60
  // Collect the valid input and output nodes in the HiAI IR graph and update
  // the input and output names
  device_inames_.clear();
  device_onames_.clear();
  std::vector<ge::Operator> device_inodes;
  std::vector<ge::Operator> device_onodes;
61
  for (auto& input_name : input_names_) {
62 63 64
    if (graph.Has(input_name)) {
      if (graph.Get(input_name)->is_data()) {
        device_inodes.push_back(*graph.Get(input_name)->data());
65 66 67
        device_inames_.push_back(input_name);
      } else {
        LOG(WARNING) << "[NPU] Input node " << input_name
68
                     << " is ignored because it is not a data node.";
69 70 71
      }
    } else {
      LOG(WARNING) << "[NPU] Input node " << input_name
72
                   << " is ignored because it does not exist.";
73
    }
74 75
  }
  for (auto& output_name : output_names_) {
76 77
    if (graph.Has(output_name)) {
      device_onodes.push_back(*graph.Get(output_name)->data());
78 79 80
      device_onames_.push_back(output_name);
    } else {
      LOG(WARNING) << "[NPU] Output node " << output_name
81
                   << " is ignored because it does not exist.";
82
    }
83
  }
84 85 86 87 88 89 90
  CHECK(!device_inames_.empty())
      << "[NPU] No input nodes found for building NPU model";
  CHECK(!device_onames_.empty())
      << "[NPU] No output nodes found for building NPU model";
  // Build the HiAI IR graph to HiAI om model as the device program
  device_program_ = lite::npu::Device::Global().Build(
      model_name_, device_inodes, device_onodes);
91 92 93 94 95
  if (device_program_ == nullptr) {
    LOG(WARNING) << "[NPU] Build model failed!";
    return subgraph::FAILED;
  }

96
  // Query and check the dimensions of valid input and output tensors
97 98 99 100 101 102 103
  std::vector<hiai::TensorDimension> device_idims, device_odims;
  if (device_program_->GetModelIOTensorDim(
          model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
    LOG(WARNING)
        << "[NPU] Get the dimensions of input and output tensors failed!";
    return subgraph::FAILED;
  }
104 105 106 107 108 109 110 111 112
  CHECK_EQ(device_idims.size(), device_inames_.size());
  CHECK_EQ(device_odims.size(), device_onames_.size());
  origin_idims_.resize(device_inames_.size());
  origin_itensors_.resize(device_inames_.size());
  device_itensors_.resize(device_inames_.size());
  origin_odims_.resize(device_onames_.size());
  origin_otensors_.resize(device_onames_.size());
  device_otensors_.resize(device_onames_.size());
  for (int i = 0; i < device_inames_.size(); i++) {
113 114 115
    auto node = graph.Get(device_inames_[i]);
    auto precision = node->precision();
    auto layout = node->layout();
116
    origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
117 118
    CHECK(origin_itensors_[i]);
    origin_idims_[i] = origin_itensors_[i]->dims();
119 120
    VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i]
            << " precision: " << PrecisionToStr(precision)
121 122 123
            << " layout: " << DataLayoutToStr(layout) << " dims: {"
            << device_idims[i].GetNumber() << ","
            << device_idims[i].GetChannel() << ","
124 125
            << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
            << "}";
126
    // Prepare the device input tensors
127 128 129 130 131 132 133 134 135 136 137 138 139
    if (!subgraph::npu::CheckShape(origin_idims_[i], device_idims[i])) {
      LOG(WARNING) << "origin and device input's dims are mismatched.";
      for (int j = 0; j < origin_idims_[i].size(); j++) {
        LOG(WARNING) << "origin_idims_[" << i << "][" << j
                     << "]: " << origin_idims_[i][j];
      }
      LOG(WARNING) << "device_idims[" << i << "]: {"
                   << device_idims[i].GetNumber() << ", "
                   << device_idims[i].GetChannel() << ", "
                   << device_idims[i].GetHeight() << ", "
                   << device_idims[i].GetWidth() << "}";
      return subgraph::FAILED;
    }
140 141 142
    device_itensors_[i].reset(new hiai::AiTensor);
    device_itensors_[i]->Init(&(device_idims[i]));
  }
143
  for (int i = 0; i < device_onames_.size(); i++) {
144 145 146
    auto node = graph.Get(device_onames_[i]);
    auto precision = node->precision();
    auto layout = node->layout();
147
    origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
148 149
    CHECK(origin_otensors_[i]);
    origin_odims_[i] = origin_otensors_[i]->dims();
150 151
    VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i]
            << " precision: " << PrecisionToStr(precision)
152
            << " layout: " << DataLayoutToStr(layout) << " dims: {"
153 154 155 156
            << device_odims[i].GetNumber() << ","
            << device_odims[i].GetChannel() << ","
            << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth()
            << "}";
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
    // Prepare the device output tensors
    switch (precision) {
      case PRECISION(kFloat):
        origin_otensors_[i]->mutable_data<float>();
        break;
      case PRECISION(kInt8):
        origin_otensors_[i]->mutable_data<int8_t>();
        break;
      case PRECISION(kInt16):
        origin_otensors_[i]->mutable_data<int16_t>();
        break;
      case PRECISION(kInt32):
        origin_otensors_[i]->mutable_data<int32_t>();
        break;
      case PRECISION(kInt64):
        origin_otensors_[i]->mutable_data<int64_t>();
        break;
      default:
        LOG(FATAL) << "[NPU] " << device_onames_[i]
                   << " can't mutable data with precision type "
                   << PrecisionToStr(precision);
        break;
    }
180
    /*
181 182 183 184 185 186 187 188 189 190 191 192 193
    if (!subgraph::npu::CheckShape(origin_odims_[i], device_odims[i])) {
      LOG(WARNING) << "origin and device output's dims are mismatched.";
      for (int j = 0; j < origin_odims_[i].size(); j++) {
        LOG(WARNING) << "origin_odims_[" << i << "][" << j
                     << "]: " << origin_odims_[i][j];
      }
      LOG(WARNING) << "device_odims[" << i << "]: {"
                   << device_odims[i].GetNumber() << ", "
                   << device_odims[i].GetChannel() << ", "
                   << device_odims[i].GetHeight() << ", "
                   << device_odims[i].GetWidth() << "}";
      return subgraph::FAILED;
    }
194
    */
195 196 197 198 199 200 201 202
    device_otensors_[i].reset(new hiai::AiTensor);
    device_otensors_[i]->Init(&(device_odims[i]));
  }
  return status;
}

int SubgraphEngine::LaunchDeviceProgram() {
  // Copy the data of origin input tensors to the buffer of input HiAI tensors
203 204 205 206
  for (size_t i = 0; i < device_itensors_.size(); i++) {
    std::memcpy(device_itensors_[i]->GetBuffer(),
                origin_itensors_[i]->raw_data(),
                origin_itensors_[i]->memory_size());
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
  }
  // Run the HiAI model by name
  std::string key = "model_name";  // Note: key seems must be model_name
  model_context_.AddPara(key, model_name_);
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };
  int istamp;
  auto start_time = GetCurrentUS();
  CHECK_EQ(
      device_program_->Process(
          model_context_, device_itensors_, device_otensors_, 1000, istamp),
      hiai::AI_SUCCESS);
  VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
  // Copy the data of output HiAI tensor to the buffer of origin output tensors
224 225 226 227
  for (size_t i = 0; i < device_otensors_.size(); i++) {
    std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
                device_otensors_[i]->GetBuffer(),
                device_otensors_[i]->GetSize());
228 229 230 231 232 233
  }
  return 0;
}

void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
234 235
  engine_.reset(new SubgraphEngine(ctx_.get(),
                                   param.sub_block_idx,
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
                                   param.sub_block_desc,
                                   param.input_data_names,
                                   param.output_data_names,
                                   param.scope));
  CHECK(engine_);
  engine_->Build();
}

// Launches the engine; the engine must already exist when this is called.
void SubgraphCompute::Run() {
  CHECK(engine_);  // engine_ is set in PrepareForRun()
  engine_->Launch();
}

}  // namespace npu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Registers paddle::lite::kernels::npu::SubgraphCompute as the kernel of the
// "subgraph" op for the kNPU target with kFloat precision and kNCHW layout,
// under the alias "def". Both "Inputs" and "Outputs" are bound to tensors on
// the kHost target.
REGISTER_LITE_KERNEL(subgraph,
                     kNPU,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::npu::SubgraphCompute,
                     def)
    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
    .Finalize();