// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <chrono>
#include <string>
#include <utility>
#include <vector>
#include "lite/backends/xpu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
#include "lite/kernels/xpu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

bool SubgraphEngine::BuildDeviceProgram() {
31
  int status = 0;
32 33 34 35
  if (!origin_program_) {
    BuildOriginProgram();
  }

36 37
  // Convert all of ops and their input vars and weights and added into the XPU
  // IR graph
38
  subgraph::xpu::Graph graph;
39
  const auto& bridges = subgraph::SubgraphBridgeRegistry::Instance();
40 41
  const auto& insts = origin_program_->instructions(kRootBlockIdx);
  for (auto& inst : insts) {
Z
zhupengyang 已提交
42
    auto op = const_cast<OpLite*>(inst.op());
43 44 45 46
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
47
    if (!bridges.Exists(op_type, TARGET(kXPU))) {
48
      return false;
49
    }
50
    auto kernel = inst.kernel();
Z
zhupengyang 已提交
51 52
    status |= bridges.Select(op_type, TARGET(kXPU))(
        reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
53
    if (subgraph::CHECK_FAILED(status)) {
54
      return false;
55 56
    }
  }
57

58
  // Collect the input and output nodes of the XPU IR graph
59 60
  std::vector<xtcl::xExpr*> device_inodes;
  std::vector<xtcl::xExpr*> device_onodes;
61 62 63 64
  for (size_t i = 0; i < input_names_.size(); i++) {
    CHECK(graph.Has(input_names_[i]));
    CHECK(graph.Get(input_names_[i])->is_data());
    device_inodes.push_back(graph.Get(input_names_[i])->data().get());
65
  }
66
  for (size_t i = 0; i < output_names_.size(); i++) {
67 68
    if (graph.Has(output_names_[i])) {
      device_onodes.push_back(graph.Get(output_names_[i])->data().get());
69 70 71 72 73 74
    } else {
      // update output_names_ and origin_otensors because some outputs may be
      // useless
      output_names_.erase(output_names_.begin() + i);
      origin_otensors_.erase(origin_otensors_.begin() + i);
      i--;
75
    }
76
  }
77
  CHECK_GT(output_names_.size(), 0);
78 79
  CHECK_EQ(output_names_.size(), origin_otensors_.size());

80
  // Build the XPU IR graph to the XPU runtime for inference
81
  device_program_ = lite::xpu::Device::Global().Build(
82
      &graph.builder_, &graph.params_, &device_onodes);
83 84
  if (device_program_ == nullptr) {
    LOG(WARNING) << "[XPU] Build model failed!";
85
    return false;
86
  }
87 88 89 90 91 92
  origin_otypes_.resize(output_names_.size());
  origin_odims_.resize(output_names_.size());
  for (size_t i = 0; i < output_names_.size(); i++) {
    origin_otypes_[i] = graph.Get(output_names_[i])->precision();
    origin_odims_[i] = origin_otensors_[i]->dims().Vectorize();
  }
93 94

  // Query and check the dimensions of input and output tensors
95 96
  device_itensors_.resize(input_names_.size());
  device_otensors_.resize(output_names_.size());
97 98 99
  for (size_t i = 0; i < input_names_.size(); i++) {
    VLOG(3) << "[XPU] Inputs[" << i << "] name: " << input_names_[i]
            << " dims: " << DDim(origin_idims_[i]).repr();
100 101 102 103 104 105 106
    // Prepare the device input tensors which share data with the origin input
    // tensors
    device_itensors_[i].data = nullptr;
    device_itensors_[i].ctx.device_type =
        subgraph::xpu::CvtDLDeviceType(TARGET(kHost));
    device_itensors_[i].ctx.device_id = 0;
    device_itensors_[i].ndim = origin_idims_[i].size();
107 108
    device_itensors_[i].dtype =
        subgraph::xpu::CvtDLDataType(origin_itensors_[i]->precision());
109
    device_itensors_[i].shape = const_cast<int64_t*>(
110
        static_cast<const int64_t*>(origin_idims_[i].data()));
111 112
    device_itensors_[i].strides = nullptr;
    device_itensors_[i].byte_offset = 0;
113
  }
114 115 116
  for (size_t i = 0; i < output_names_.size(); i++) {
    VLOG(3) << "[XPU] Outputs[" << i << "] name: " << output_names_[i]
            << " dims: " << DDim(origin_odims_[i]).repr();
117 118
    // Prepare the device output tensors which share data with the origin output
    // tensors
119 120
    origin_otensors_[i]->Resize(origin_odims_[i]);
    auto& precision = origin_otypes_[i];
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
    switch (precision) {
      case PRECISION(kFloat):
        origin_otensors_[i]->mutable_data<float>();
        break;
      case PRECISION(kInt8):
        origin_otensors_[i]->mutable_data<int8_t>();
        break;
      case PRECISION(kInt16):
        origin_otensors_[i]->mutable_data<int16_t>();
        break;
      case PRECISION(kInt32):
        origin_otensors_[i]->mutable_data<int32_t>();
        break;
      case PRECISION(kInt64):
        origin_otensors_[i]->mutable_data<int64_t>();
        break;
      default:
138
        LOG(FATAL) << "[XPU] " << output_names_[i]
139 140 141 142 143 144 145 146 147 148 149
                   << " can't mutable data with precision type "
                   << PrecisionToStr(precision);
        break;
    }
    device_otensors_[i].data = nullptr;
    device_otensors_[i].ctx.device_type =
        subgraph::xpu::CvtDLDeviceType(TARGET(kHost));
    device_otensors_[i].ctx.device_id = 0;
    device_otensors_[i].ndim = origin_odims_[i].size();
    device_otensors_[i].dtype = subgraph::xpu::CvtDLDataType(precision);
    device_otensors_[i].shape = const_cast<int64_t*>(
150
        static_cast<const int64_t*>(origin_odims_[i].data()));
151 152
    device_otensors_[i].strides = nullptr;
    device_otensors_[i].byte_offset = 0;
153
  }
154
  return true;
155 156
}
bool SubgraphEngine::LaunchDeviceProgram() {
158 159 160 161
  for (size_t i = 0; i < device_itensors_.size(); i++) {
    // Update the data pointer of DLTensor to track the origin input tensors
    device_itensors_[i].data =
        const_cast<void*>(origin_itensors_[i]->raw_data());
162
    device_program_->SetInput(input_names_[i], &device_itensors_[i]);
163 164 165 166 167 168 169 170 171 172
  }
  // Run the XPU model
  auto GetCurrentUS = []() -> double {
    struct timeval time;
    gettimeofday(&time, NULL);
    return 1e+6 * time.tv_sec + time.tv_usec;
  };
  auto start_time = GetCurrentUS();
  device_program_->Run();
  VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
173 174 175 176 177
  for (size_t i = 0; i < device_otensors_.size(); i++) {
    // Update the data pointer of DLTensor to track the origin output tensors
    device_otensors_[i].data =
        const_cast<void*>(origin_otensors_[i]->raw_data());
    device_program_->CopyOutputTo(i, &device_otensors_[i]);
178
  }
179
  return true;
180 181 182 183
}

void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
184
  engine_.reset(new SubgraphEngine(ctx_.get(),
185 186 187
                                   param.block_idx,
                                   param.program_desc,
                                   param.exec_scope,
188
                                   param.input_data_names,
189
                                   param.output_data_names));
190 191 192 193 194
  CHECK(engine_);
}

// Runs the subgraph engine; the engine must already have been created
// (see PrepareForRun()), which is enforced with a CHECK.
void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Run();
}

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Registers SubgraphCompute as the "subgraph" kernel for the kXPU target with
// kAny precision and kNCHW layout. Both "Inputs" and "Outputs" are bound to
// host tensors of any precision.
REGISTER_LITE_KERNEL(subgraph,
                     kXPU,
                     kAny,
                     kNCHW,
                     paddle::lite::kernels::xpu::SubgraphCompute,
                     def)
    .BindInput("Inputs",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .BindOutput("Outputs",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .Finalize();