// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
18
#include <algorithm>
19
#include <utility>
20
#include "hiai_ir_build.h"  // NOLINT
21 22 23 24
#include "lite/backends/npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
25
#include "lite/kernels/npu/bridges/utility.h"
26
#include "lite/utils/io.h"
27 28 29 30 31 32

namespace paddle {
namespace lite {
namespace kernels {
namespace npu {

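// Generate the file name of the cached om model from the subgraph block index
// and a simple checksum of the dimensions of the (sorted) input tensors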
std::string SubgraphEngine::GenerateModelCacheName() const {
  auto inames = device_inames_;
  auto onames = device_onames_;
  std::stable_sort(inames.begin(), inames.end());

  std::string model_cache_name = "subgraph_" + std::to_string(block_idx_);
  for (auto iname : inames) {
    model_cache_name += "_";
    auto itensor = scope_->FindTensor(iname);
    int tmp = 0;
    for (auto i : itensor->dims().Vectorize()) {
      tmp += i * i;
    }
    model_cache_name += std::to_string(tmp % 1999);
  }
  model_cache_name += "_.om";

  return model_cache_name;
}

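// Bridge the ops of the subgraph into a HiAI IR graph, build it into an om
// model, and prepare the matching HiAI input/output tensors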
int SubgraphEngine::BuildDeviceProgram() {
  int status = 0;
  // Convert all of the ops and their input vars and weights, and add them into
  // the HiAI IR graph
  subgraph::npu::Graph graph;
  const auto& bridges = subgraph::Registry::Instance();
  for (auto& inst : origin_program_) {
    auto op = const_cast<OpLite*>(inst.op());
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
    if (!bridges.Exists(op_type, TARGET(kNPU))) {
      return subgraph::FAILED;
    }
    auto kernel = inst.kernel();
    status |= bridges.Select(op_type, TARGET(kNPU))(
        reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
    if (subgraph::CHECK_FAILED(status)) {
      return subgraph::FAILED;
    }
  }
  // Collect the valid input and output nodes in the HiAI IR graph and update
  // the input and output names
  device_inames_.clear();
  device_onames_.clear();
  std::vector<ge::Operator> device_inodes;
  std::vector<ge::Operator> device_onodes;
  for (auto& input_name : input_names_) {
    if (graph.Has(input_name)) {
      if (graph.Get(input_name)->is_data()) {
        device_inodes.push_back(*graph.Get(input_name)->data());
        device_inames_.push_back(input_name);
      } else {
        LOG(WARNING) << "[NPU] Input node " << input_name
                     << " is ignored because it is not a data node.";
      }
    } else {
      LOG(WARNING) << "[NPU] Input node " << input_name
                   << " is ignored because it does not exist.";
    }
  }
  for (auto& output_name : output_names_) {
    if (graph.Has(output_name)) {
      device_onodes.push_back(*graph.Get(output_name)->data());
      device_onames_.push_back(output_name);
    } else {
      LOG(WARNING) << "[NPU] Output node " << output_name
                   << " is ignored because it does not exist.";
    }
  }
  CHECK(!device_inames_.empty())
      << "[NPU] No input nodes found for building NPU model";
  CHECK(!device_onames_.empty())
      << "[NPU] No output nodes found for building NPU model";

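  // Device programs are cached per input shape (device_program_map_ is keyed
  // by inputs_shape_), so the build is skipped if one already exists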
  // Build the HiAI IR graph into the HiAI om model as the device program
  if (device_program_map_.count(inputs_shape_) > 0) {
    return status;
  }
  std::string model_cache_full_dir =
      model_cache_dir_.empty() ? "" : model_cache_dir_ + "/" +
                                          GenerateModelCacheName();
  auto device_client = lite::npu::Device::Global().Build(
      model_name_, device_inodes, device_onodes, model_cache_full_dir);
  if (device_client == nullptr) {
    LOG(WARNING) << "[NPU] Build model failed!";
    return subgraph::FAILED;
  }
  auto device_program = std::make_shared<device_program_t>(device_client);
  if (!inputs_shape_.empty()) {
    device_program_map_[inputs_shape_] = device_program;
  }

  // Query and check the dimensions of valid input and output tensors
  std::vector<hiai::TensorDimension> device_idims, device_odims;
  if (device_program->client->GetModelIOTensorDim(
          model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
    LOG(WARNING)
        << "[NPU] Get the dimensions of input and output tensors failed!";
    return subgraph::FAILED;
  }
  device_program->device_idims = device_idims;
  device_program->device_odims = device_odims;

  CHECK_EQ(device_idims.size(), device_inames_.size());
  CHECK_EQ(device_odims.size(), device_onames_.size());
  origin_idims_.resize(device_inames_.size());
  origin_itensors_.resize(device_inames_.size());
  device_itensors_.resize(device_inames_.size());
  origin_odims_.resize(device_onames_.size());
  origin_otensors_.resize(device_onames_.size());
  device_otensors_.resize(device_onames_.size());

  for (int i = 0; i < device_inames_.size(); i++) {
    auto node = graph.Get(device_inames_[i]);
    auto precision = node->precision();
    auto layout = node->layout();
    origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
    CHECK(origin_itensors_[i]);
    origin_idims_[i] = origin_itensors_[i]->dims();
    VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i]
            << " precision: " << PrecisionToStr(precision)
            << " layout: " << DataLayoutToStr(layout) << " dims: {"
            << device_idims[i].GetNumber() << ","
            << device_idims[i].GetChannel() << ","
            << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
            << "}";
    // Prepare the device input tensors
    CHECK_EQ(origin_idims_[i].production(),
             device_idims[i].GetNumber() * device_idims[i].GetChannel() *
                 device_idims[i].GetHeight() * device_idims[i].GetWidth());
    device_itensors_[i].reset(new hiai::AiTensor);
    device_itensors_[i]->Init(&(device_idims[i]));
  }
  device_program->origin_idims = origin_idims_;

  for (int i = 0; i < device_onames_.size(); i++) {
    auto node = graph.Get(device_onames_[i]);
    auto precision = node->precision();
    auto layout = node->layout();
    origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
    CHECK(origin_otensors_[i]);
    origin_odims_[i] = origin_otensors_[i]->dims();
    VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i]
            << " precision: " << PrecisionToStr(precision)
            << " layout: " << DataLayoutToStr(layout) << " dims: {"
            << device_odims[i].GetNumber() << ","
            << device_odims[i].GetChannel() << ","
            << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth()
            << "}";
    // Prepare the device output tensors
    switch (precision) {
      case PRECISION(kFloat):
        origin_otensors_[i]->mutable_data<float>();
        break;
      case PRECISION(kBool):
        origin_otensors_[i]->mutable_data<bool>();
        break;
      case PRECISION(kInt8):
        origin_otensors_[i]->mutable_data<int8_t>();
        break;
      case PRECISION(kInt16):
        origin_otensors_[i]->mutable_data<int16_t>();
        break;
      case PRECISION(kInt32):
        origin_otensors_[i]->mutable_data<int32_t>();
        break;
      case PRECISION(kInt64):
        origin_otensors_[i]->mutable_data<int64_t>();
        break;
      default:
        LOG(FATAL) << "[NPU] " << device_onames_[i]
                   << " can't mutable data with precision type "
                   << PrecisionToStr(precision);
        break;
    }
    device_program->origin_odims = origin_odims_;

    CHECK_EQ(origin_odims_[i].production(),
             device_odims[i].GetNumber() * device_odims[i].GetChannel() *
                 device_odims[i].GetHeight() * device_odims[i].GetWidth());
    device_otensors_[i].reset(new hiai::AiTensor);
    device_otensors_[i]->Init(&(device_odims[i]));
  }
  return status;
}

int SubgraphEngine::LaunchDeviceProgram() {
  // The data buffers of the origin input/output tensors have already been
  // bound to the HiAI input/output tensors (device_itensors_, device_otensors_
  // and origin_otensors_ are set up in InitDeviceTensor())
  auto device_program = device_program_map_[inputs_shape_];

  // Run the HiAI model by name
  std::string key = "model_name";  // Note: the key must be "model_name"
  hiai::AiContext model_context;
  model_context.AddPara(key, model_name_);
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };
  int istamp;
  auto start_time = GetCurrentUS();
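  // Run inference on the om model; in the HiAI DDK the fourth argument of
  // Process() appears to be a timeout in milliseconds (1000 here) and istamp
  // receives the task stamp of the request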
  CHECK_EQ(device_program->client->Process(
               model_context, device_itensors_, device_otensors_, 1000, istamp),
           hiai::AI_SUCCESS);
  VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
  return 0;
}

int SubgraphEngine::Build() {
  if (device_program_map_.count(inputs_shape_) > 0) {
    return subgraph::SUCCESS;
  }
  // In order to attach all of the ops of the block desc, we need to build the
  // original program first
  BuildOriginProgram();
  // Run InferShape() on all of the ops, and convert the Paddle ops into the
  // HiAI IR graph
  build_device_program_status_ = BuildDeviceProgram();
  return build_device_program_status_;
}

void SubgraphEngine::InitDeviceTensor() {
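  // Whenever a HiAI tensor's buffer no longer matches the corresponding origin
  // tensor, re-initialize the HiAI tensor and let the origin tensor share its
  // buffer, so later runs can skip host-side copies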
  auto device_program = device_program_map_[inputs_shape_];
  for (size_t i = 0; i < device_itensors_.size(); i++) {
    if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) {
      VLOG(3) << "init device_itensors and share input tensor buf between "
                 "device and host";
      device_itensors_[i]->Init(&(device_program->device_idims[i]));
      std::memcpy(device_itensors_[i]->GetBuffer(),
                  origin_itensors_[i]->raw_data(),
                  origin_itensors_[i]->memory_size());
      // Share the data buffer between device_itensor and origin_itensor
      std::shared_ptr<Buffer> buffer =
          std::make_shared<Buffer>(device_itensors_[i]->GetBuffer(),
                                   lite_api::TargetType::kHost,
                                   device_itensors_[i]->GetSize());
      origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize());
    }
  }
  for (size_t i = 0; i < device_otensors_.size(); i++) {
    if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) {
      VLOG(3) << "init device_otensors and share output tensor buf between "
                 "device and host";
      device_otensors_[i]->Init(&(device_program->device_odims[i]));
      // Share the data buffer between device_otensor and origin_otensor
      origin_otensors_[i]->Resize(device_program->origin_odims[i]);
      std::shared_ptr<Buffer> buffer =
          std::make_shared<Buffer>(device_otensors_[i]->GetBuffer(),
                                   lite_api::TargetType::kHost,
                                   device_otensors_[i]->GetSize());
      origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize());
    }
  }
}

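// Check whether the shapes of the origin input tensors have changed since the
// last run; if so, record the new shapes and return true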
bool SubgraphEngine::InputShapeChanged() {
  std::vector<std::vector<int64_t>> new_shape;
  for (auto origin_itensor : origin_itensors_) {
    new_shape.push_back(origin_itensor->dims().Vectorize());
  }
  if (inputs_shape_ == new_shape) {
    return false;
  }
  inputs_shape_ = new_shape;
  return true;
}

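// Create the subgraph engine from the kernel parameters and build the device
// program before the first run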
void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  engine_.reset(new SubgraphEngine(ctx_.get(),
                                   param.sub_block_idx,
                                   param.sub_block_desc,
                                   param.input_data_names,
                                   param.output_data_names,
                                   param.scope,
                                   NPUContext::SubgraphModelCacheDir()));
  CHECK(engine_);
  engine_->Build();
}

void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Launch();
}

}  // namespace npu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(subgraph,
                     kNPU,
                     kAny,
                     kNCHW,
                     paddle::lite::kernels::npu::SubgraphCompute,
                     def)
    .BindInput("Inputs",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .BindOutput("Outputs",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .Finalize();