// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/bm/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
#include "lite/kernels/bm/bridges/utility.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace bm {

bool SubgraphEngine::BuildDeviceProgram() {
  int status = 0;
  subgraph::bm::Graph graph;
  const auto& bridges = subgraph::SubgraphBridgeRegistry::Instance();
  graph.CreateCompilerHandle();
  auto& ctx = this->ctx_->template As<BMContext>();
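  // Register every subgraph input as a node in the BM compiler graph.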
  for (size_t i = 0; i < input_names_.size(); i++) {
    graph.AddNode(input_names_[i]);
  }
  if (!origin_program_) {
    BuildOriginProgram();
  }
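  // Convert each op of the origin program to BM IR through its registered
  // bridge; bail out if any op has no bridge or a bridge reports failure.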
  const auto& insts = origin_program_->instructions(kRootBlockIdx);
  for (auto& inst : insts) {
    auto op = const_cast<OpLite*>(inst.op());
    CHECK(op);
    op->CheckShape();
    op->InferShape();
    std::string op_type = op->op_info()->Type();
    LOG(INFO) << op_type;
    if (!bridges.Exists(op_type, TARGET(kBM))) {
      return false;
    }
    auto kernel = inst.kernel();
    status |=
        bridges.Select(op_type, TARGET(kBM))(reinterpret_cast<void*>(&graph),
                                             const_cast<OpLite*>(op),
                                             const_cast<KernelBase*>(kernel));
    if (subgraph::CHECK_FAILED(status)) {
      return false;
    }
  }
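  // All ops bridged successfully; compile the graph into an in-memory bmodel.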
  std::string net_name = "bmnet_f32bmodel";
  auto unique_net_name = lite::subgraph::bm::UniqueName(net_name);
  __bmcompile_opt(
      graph.GetCompilerHandle(), const_cast<char*>(unique_net_name.c_str()), 1);
  void* bmodel_data = nullptr;
  unsigned int data_size = 0;
  finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size);
  graph.UnlockCompilerMutex();
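  // Create a BM runtime context, load the compiled bmodel and query the
  // resulting network info.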
  bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
  bmrt_hd_ = bmrt_create(bm_hd_);
  if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) {
    return false;
  }
  bmrt_get_network_names(bmrt_hd_, &net_names_);
  net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]);
  auto& stage = net_info_->stages[0];
  // Inputs: allocate a device buffer for each network input and wrap it in a
  // bmrt tensor.
  device_inputs_.resize(input_names_.size());
  for (size_t i = 0; i < input_names_.size(); i++) {
    auto origin_itensor =
        exec_scope_->FindMutableTensor(net_info_->input_names[i]);
    CHECK(origin_itensor);
    bm_device_mem_t* p_mem =
        static_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
    CHECK(p_mem != nullptr);
    CHECK_EQ(
        bm_malloc_device_byte(bm_hd_, p_mem, origin_itensor->memory_size()),
        BM_SUCCESS);
    bmrt_tensor_with_device(&device_inputs_[i],
                            *p_mem,
                            net_info_->input_dtypes[i],
                            stage.input_shapes[i]);
  }
  // Outputs: allocate device buffers for every network output; only the ones
  // listed in output_names_ are copied back to origin tensors later.
  device_outputs_.resize(net_info_->output_num);
  int out_index = 0;
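  // Record which network outputs correspond to the subgraph's output tensors.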
  for (size_t i = 0; i < output_names_.size(); i++) {
    outname_map_.insert(std::pair<std::string, int>(output_names_[i], i));
  }

  for (int i = 0; i < net_info_->output_num; i++) {
    Tensor* t_cur = exec_scope_->FindMutableTensor(net_info_->output_names[i]);
    CHECK(t_cur != nullptr);
    bm_device_mem_t* p_mem =
        static_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
    CHECK(p_mem != nullptr);
    if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) {
      origin_otensors_[out_index] = t_cur;
      origin_otensors_[out_index]->mutable_data<float>();
      out_index += 1;
    }
    CHECK_EQ(
        bm_malloc_device_byte(bm_hd_, p_mem, net_info_->max_output_bytes[i]),
        BM_SUCCESS);
    bmrt_tensor_with_device(&device_outputs_[i],
                            *p_mem,
                            net_info_->output_dtypes[i],
                            stage.output_shapes[i]);
  }
  return true;
}

bool SubgraphEngine::LaunchDeviceProgram() {
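  // Copy each host input tensor into its device buffer.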
  for (size_t i = 0; i < device_inputs_.size(); i++) {
    auto origin_itensor =
        exec_scope_->FindMutableTensor(net_info_->input_names[i]);
    bm_memcpy_s2d(bm_hd_,
                  device_inputs_[i].device_mem,
                  const_cast<void*>(origin_itensor->raw_data()));
  }
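  // Launch the network on the device and block until it finishes.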
  bmrt_launch_tensor_ex(bmrt_hd_,
                        net_names_[0],
                        static_cast<const bm_tensor_t*>(&device_inputs_[0]),
                        net_info_->input_num,
                        static_cast<bm_tensor_t*>(&device_outputs_[0]),
                        net_info_->output_num,
                        true,
                        false);
  bm_thread_sync(bm_hd_);
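  // Copy the requested outputs back from device memory to the origin tensors.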
  int out_index = 0;
  for (size_t i = 0; i < device_outputs_.size(); i++) {
    if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) {
      bm_memcpy_d2s(bm_hd_,
                    const_cast<void*>(origin_otensors_[out_index]->raw_data()),
                    device_outputs_[i].device_mem);
      out_index++;
    }
  }
  return true;
}

void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  engine_.reset(new SubgraphEngine(ctx_.get(),
                                   param.block_idx,
                                   param.program_desc,
                                   param.exec_scope,
                                   param.input_data_names,
                                   param.output_data_names));
  CHECK(engine_);
}

void SubgraphCompute::Run() {
  CHECK(engine_);
  engine_->Run();
}

}  // namespace bm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(subgraph,
                     kBM,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::bm::SubgraphCompute,
                     def)
    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
    .Finalize();