// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/xpu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
  // Obtain the origin input tensors, and create the origin output
  // tensors(Don't try to access them before launch the device program or the
  // origin program)
  PrepareWorkspaceForOriginProgram();
  // Create the device input and output tensors, but don't initialize them
  // with the dimensions
  device_itensors_.resize(input_names_.size());
  for (int i = 0; i < input_names_.size(); i++) {
    device_itensors_[i].reset(new hiai::AiTensor);
    CHECK(device_itensors_[i]);
  }
  device_otensors_.resize(output_names_.size());
  for (int i = 0; i < output_names_.size(); i++) {
    device_otensors_[i].reset(new hiai::AiTensor);
    CHECK(device_otensors_[i]);
  }
  return true;
}

bool SubgraphEngine::BuildDeviceProgram() {
51
  int status = 0;
52 53
  // Convert all of ops and their input vars and weights and added into the XPU
  // IR graph
54 55
  subgraph::xpu::Graph graph;
  const auto& bridges = subgraph::Registry::Instance();
56
  if (!origin_program_) {
57 58
    BuildOriginProgram();
  }
59 60
  const auto& insts = origin_program_->instructions(kRootBlockIdx);
  for (auto& inst : insts) {
Z
zhupengyang 已提交
61
    auto op = const_cast<OpLite*>(inst.op());
62 63 64 65
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
66
    if (!bridges.Exists(op_type, TARGET(kXPU))) {
67
      return false;
68
    }
69
    auto kernel = inst.kernel();
Z
zhupengyang 已提交
70 71
    status |= bridges.Select(op_type, TARGET(kXPU))(
        reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
72
    if (subgraph::CHECK_FAILED(status)) {
73
      return false;
74 75
    }
  }
76
  // Obtain the output nodes of the XPU IR graph and build the graph to the XPU
77
  // runtime
78 79 80 81 82
  device_inames_.clear();
  device_onames_.clear();
  std::vector<xtcl::xExpr*> device_inodes;
  std::vector<xtcl::xExpr*> device_onodes;
  for (auto& input_name : input_names_) {
83 84 85
    if (graph.Has(input_name)) {
      if (graph.Get(input_name)->is_data()) {
        device_inodes.push_back(graph.Get(input_name)->data().get());
86 87 88
        device_inames_.push_back(input_name);
      } else {
        LOG(WARNING) << "[XPU] Input node " << input_name
89
                     << " is ignored because it is not a data node.";
90 91 92
      }
    } else {
      LOG(WARNING) << "[XPU] Input node " << input_name
93
                   << " is ignored because it does not exist.";
94 95
    }
  }
96
  for (auto& output_name : output_names_) {
97 98
    if (graph.Has(output_name)) {
      device_onodes.push_back(graph.Get(output_name)->data().get());
99 100 101
      device_onames_.push_back(output_name);
    } else {
      LOG(WARNING) << "[XPU] Output node " << output_name
102
                   << " is ignored because it does not exist.";
103
    }
104
  }
105 106 107 108
  CHECK(!device_inames_.empty())
      << "[XPU] No input nodes found for building XPU model";
  CHECK(!device_onames_.empty())
      << "[XPU] No output nodes found for building XPU model";
109
  device_program_ = lite::xpu::Device::Global().Build(
110
      &graph.builder_, &graph.params_, &device_onodes);
111 112
  if (device_program_ == nullptr) {
    LOG(WARNING) << "[XPU] Build model failed!";
113
    return false;
114 115 116
  }

  // Query and check the dimensions of input and output tensors
117 118 119 120 121 122 123
  origin_idims_.resize(device_inames_.size());
  origin_itensors_.resize(device_inames_.size());
  device_itensors_.resize(device_inames_.size());
  origin_odims_.resize(device_onames_.size());
  origin_otensors_.resize(device_onames_.size());
  device_otensors_.resize(device_onames_.size());
  for (int i = 0; i < device_inames_.size(); i++) {
124 125 126
    auto node = graph.Get(device_inames_[i]);
    auto precision = node->precision();
    auto layout = node->layout();
127
    origin_itensors_[i] = exec_scope_->FindMutableTensor(device_inames_[i]);
128 129
    CHECK(origin_itensors_[i]);
    origin_idims_[i] = origin_itensors_[i]->dims();
130 131
    VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i]
            << " precision: " << PrecisionToStr(precision)
132 133 134 135 136 137 138 139 140 141 142 143 144 145
            << " layout: " << DataLayoutToStr(layout)
            << " dims: " << origin_idims_[i];
    // Prepare the device input tensors which share data with the origin input
    // tensors
    device_itensors_[i].data = nullptr;
    device_itensors_[i].ctx.device_type =
        subgraph::xpu::CvtDLDeviceType(TARGET(kHost));
    device_itensors_[i].ctx.device_id = 0;
    device_itensors_[i].ndim = origin_idims_[i].size();
    device_itensors_[i].dtype = subgraph::xpu::CvtDLDataType(precision);
    device_itensors_[i].shape = const_cast<int64_t*>(
        static_cast<const int64_t*>(origin_idims_[i].data().data()));
    device_itensors_[i].strides = nullptr;
    device_itensors_[i].byte_offset = 0;
146
  }
147
  for (int i = 0; i < device_onames_.size(); i++) {
148 149 150
    auto node = graph.Get(device_onames_[i]);
    auto precision = node->precision();
    auto layout = node->layout();
151
    origin_otensors_[i] = exec_scope_->FindMutableTensor(device_onames_[i]);
152 153
    CHECK(origin_otensors_[i]);
    origin_odims_[i] = origin_otensors_[i]->dims();
154 155
    VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i]
            << " precision: " << PrecisionToStr(precision)
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
            << " layout: " << DataLayoutToStr(layout)
            << " dims: " << origin_odims_[i];
    // Prepare the device output tensors which share data with the origin output
    // tensors
    switch (precision) {
      case PRECISION(kFloat):
        origin_otensors_[i]->mutable_data<float>();
        break;
      case PRECISION(kInt8):
        origin_otensors_[i]->mutable_data<int8_t>();
        break;
      case PRECISION(kInt16):
        origin_otensors_[i]->mutable_data<int16_t>();
        break;
      case PRECISION(kInt32):
        origin_otensors_[i]->mutable_data<int32_t>();
        break;
      case PRECISION(kInt64):
        origin_otensors_[i]->mutable_data<int64_t>();
        break;
      default:
        LOG(FATAL) << "[XPU] " << device_onames_[i]
                   << " can't mutable data with precision type "
                   << PrecisionToStr(precision);
        break;
    }
    device_otensors_[i].data = nullptr;
    device_otensors_[i].ctx.device_type =
        subgraph::xpu::CvtDLDeviceType(TARGET(kHost));
    device_otensors_[i].ctx.device_id = 0;
    device_otensors_[i].ndim = origin_odims_[i].size();
    device_otensors_[i].dtype = subgraph::xpu::CvtDLDataType(precision);
    device_otensors_[i].shape = const_cast<int64_t*>(
        static_cast<const int64_t*>(origin_odims_[i].data().data()));
    device_otensors_[i].strides = nullptr;
    device_otensors_[i].byte_offset = 0;
192
  }
193
  return true;
194 195
}

196
bool SubgraphEngine::LaunchDeviceProgram() {
197 198 199 200
  for (size_t i = 0; i < device_itensors_.size(); i++) {
    // Update the data pointer of DLTensor to track the origin input tensors
    device_itensors_[i].data =
        const_cast<void*>(origin_itensors_[i]->raw_data());
201
    device_program_->SetInput(device_inames_[i], &device_itensors_[i]);
202 203 204 205 206 207 208 209 210 211
  }
  // Run the XPU model
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };
  auto start_time = GetCurrentUS();
  device_program_->Run();
  VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
212 213 214 215 216
  for (size_t i = 0; i < device_otensors_.size(); i++) {
    // Update the data pointer of DLTensor to track the origin output tensors
    device_otensors_[i].data =
        const_cast<void*>(origin_otensors_[i]->raw_data());
    device_program_->CopyOutputTo(i, &device_otensors_[i]);
217
  }
218
  return true;
219 220 221 222
}

void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
223
  engine_.reset(new SubgraphEngine(ctx_.get(),
224 225 226
                                   param.block_idx,
                                   param.program_desc,
                                   param.exec_scope,
227
                                   param.input_data_names,
228
                                   param.output_data_names));
229 230 231 232 233
  CHECK(engine_);
}

// Executes the subgraph by delegating to the engine. Aborts (CHECK) if the
// engine has not been created.
void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Run();
}

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Register SubgraphCompute as the "subgraph" kernel for the kXPU target with
// kAny precision and kNCHW layout. Both "Inputs" and "Outputs" are bound as
// host tensors of any precision.
REGISTER_LITE_KERNEL(subgraph,
                     kXPU,
                     kAny,
                     kNCHW,
                     paddle::lite::kernels::xpu::SubgraphCompute,
                     def)
    .BindInput("Inputs",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .BindOutput("Outputs",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .Finalize();