/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

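// Builds an executor over block 0 of the program: creates each op from the
// program desc, infers shapes ahead of time when not in lod mode, loads the
// parameters (combined buffer or one file per variable), then runs every
// op's Init().
//
// Typical usage, as a minimal sketch (the Loader API is assumed from the
// surrounding framework and is not defined in this file):
//
//   auto program = Loader<CPU>().Load("/path/to/model");
//   Executor<CPU, float> executor(program, config, /*batch_size=*/1,
//                                 /*use_optimize=*/true, /*lod_mode=*/false);
//   // executor.Predict(...) may then be called with the model's inputs.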
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  // resize feed and fetch list
  // should init feed and fetch variables before infer shape
  InitFeedFetchList();

  const auto &blocks = program_desc_->Blocks();
  std::shared_ptr<BlockDesc> block_desc = blocks[0];
  std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
  for (int j = 0; j < ops.size(); ++j) {
    std::shared_ptr<OpDesc> op_desc = ops[j];
    DLOG << "create op: " << op_desc->Type();

    auto op_handler = OpRegistry<Device>::CreateOp(
        op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
        op_desc->GetAttrMap(), program_.scope.get());
    // infer shape to reshape inputs and outputs before predict,
    // but in lod mode shape inference still has to happen at runtime
    if (!lod_mode) {
      op_handler->InferShape();
    }
    ops_of_block0_.push_back(op_handler);
  }
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

#ifdef PADDLE_MOBILE_FPGA
  program_.scope->EraseVars({"feed", "fetch"});
  program_.scope->print_vars();
#endif

  int count = 0;
  for (auto &op_handler : ops_of_block0_) {
    DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
    op_handler->Init();
  }
}

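// Scans all blocks for feed/fetch ops, maps each input/output name to its
// "col" attribute, and pre-sizes the "feed" and "fetch" LoDTensorArray
// variables to match.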
template <typename Device, typename T>
void Executor<Device, T>::InitFeedFetchList() {
  std::unordered_map<std::string, int> feed_indices, fetch_indices;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &op_desc : block->Ops()) {
      if (op_desc->Type() == "feed") {
        std::string name = op_desc->Output("Out")[0];
        feed_indices[name] = op_desc->GetAttr("col").Get<int>();
      } else if (op_desc->Type() == "fetch") {
        std::string name = op_desc->Input("X")[0];
        fetch_indices[name] = op_desc->GetAttr("col").Get<int>();
      }
    }
  }
  feed_indices_.swap(feed_indices);
  fetch_indices_.swap(fetch_indices);

  auto *feed_var = program_.scope->Var("feed");
  auto *feed_list = feed_var->template GetMutable<framework::LoDTensorArray>();
  feed_list->resize(feed_indices_.size());

  auto *fetch_var = program_.scope->Var("fetch");
  auto *fetch_list =
      fetch_var->template GetMutable<framework::LoDTensorArray>();
  fetch_list->resize(fetch_indices_.size());
}

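// Copies one tensor's payload out of the parameter stream and advances the
// read pointer. When quant_uint8 is set, a float [min, max] range precedes
// uint8 weights, which are dequantized on the fly.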
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

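// Deserializes a single variable from the parameter stream: format version,
// LoD data, tensor version, a skipped tensor-desc blob (dims come from the
// VarDesc instead), then the payload dispatched on element type.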
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

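// Separate-files model format: each persistable variable is loaded from
// "<model_path>/<var name>"; non-persistable variables only get their
// memory allocated.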
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        auto tensor = var->template GetMutable<LoDTensor>();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        DLOG << "init no persistable var: " << var_desc->Name();
        varInputMemory(var_desc, var);
      }
    }
  }
}

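// Combined model format: all parameters live in one buffer, either supplied
// by the caller (combined_params_buf) or read from para_path, and consumed
// sequentially by LoadMemory.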
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();
        auto tensor = var->template GetMutable<LoDTensor>();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        DLOG << " init combine memory no persistable: " << var_desc->Name();
        varInputMemory(var_desc, var);
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

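// Re-allocates intermediate (non-persistable) tensors after an input shape
// change: batch and channel dims are kept, the spatial dims come from the
// new input. Used together with config_.load_when_predict.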
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        } else {
          PADDLE_MOBILE_THROW_EXCEPTION("Unsupported var type `%d`",
                                        var_desc->Type());
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

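// Allocates backing storage for a non-persistable variable according to its
// declared type; FPGA builds initialize every input tensor as float instead.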
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
#ifdef PADDLE_MOBILE_FPGA
  framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
  tensor->init(typeid(float));
  return true;
#endif
  auto TypeId = [](const VarType_Type &type) -> std::type_index {
    switch (type) {
      case VARTYPE_TYPE_BOOL:
        return typeid(bool);
      case VARTYPE_TYPE_FP32:
        return typeid(float);
      case VARTYPE_TYPE_INT8:
        return typeid(int8_t);
      case VARTYPE_TYPE_INT32:
        return typeid(int);
      case VARTYPE_TYPE_INT64:
        return typeid(int64_t);
      default:
        PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
    }
  };

  auto type = var_desc->Type();
  if (type == VARTYPE_TYPE_LOD_TENSOR) {
    auto data_type = var_desc->Tensor_desc().DataType();
    framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
    tensor->mutable_data(TypeId(data_type));
  } else if (type == VARTYPE_TYPE_STEP_SCOPES) {
    var->template GetMutable<std::vector<framework::Scope *>>();
  } else if (type == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
    var->template GetMutable<framework::LoDTensorArray>();
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
  }
  return true;
}

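// Named-input overloads: feed each (name, tensor) pair, then run the graph.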
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

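// Single-feed/single-fetch convenience path: wraps the raw vector in a feed
// tensor, runs the graph, and copies the first fetch target back out.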
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  PADDLE_MOBILE_ENFORCE(feed_indices_.size() != 0,
                        "We don't know which tensor should be assigned, since "
                        "no feed op was found in this model");
  PADDLE_MOBILE_ENFORCE(fetch_indices_.size() != 0,
                        "We don't know which tensor should be fetched, since "
                        "no fetch op was found in this model");
  std::string input_name = feed_indices_.begin()->first;
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, input_name);
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    std::string output_name = fetch_indices_.begin()->first;
    const auto output_tensor = GetOutput(output_name);
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

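// Binds the caller's tensor to the feed slot registered for var_name
// (defaulting to slot 0). In load_when_predict mode, a shape change first
// triggers re-allocation of the intermediate tensors.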
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  int index = 0;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    index = feed_indices_.find(var_name)->second;
  }
  auto *feed_var = program_.scope->Var("feed");
  framework::LoDTensor &target =
      feed_var->template GetMutable<framework::LoDTensorArray>()->at(index);

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target.Resize(input.dims());
  target.ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  int index = 0;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    index = feed_indices_.find(var_name)->second;
  }
  auto *feed_var = program_.scope->Var("feed");
  framework::LoDTensor &target =
      feed_var->template GetMutable<framework::LoDTensorArray>()->at(index);

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target.Resize(input.dims());
  target.ShareDataWith(input);
  target.set_lod(input.lod());
}

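// Fetches a result: by fetch slot when var_name is "fetch" or names a fetch
// entry, otherwise by direct scope lookup.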
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  const auto &iter = fetch_indices_.find(var_name);
  if (var_name == "fetch" || iter != fetch_indices_.end()) {
    int index = 0;
    if (iter != fetch_indices_.end()) {
      index = iter->second;
    }
    auto *fetch_var = program_.scope->Var("fetch");
    framework::LoDTensor &target =
        fetch_var->template GetMutable<framework::LoDTensorArray>()->at(index);

    return std::make_shared<LoDTensor>(target);
  } else {
    auto *fetch_var = program_.scope->Var(var_name);
    framework::LoDTensor *target =
        fetch_var->template GetMutable<framework::LoDTensor>();
    return std::make_shared<LoDTensor>(*target);
  }
}

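// Runs block 0 front to back, re-inferring shapes per op in lod mode. With
// PADDLE_MOBILE_PROFILE defined, per-op wall time is aggregated by op type
// (conv ops split further by kernel size) and printed sorted by cost.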
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_of_block0_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &op_handler : ops_of_block0_) {
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (lod_mode_) {
      op_handler->InferShape();
    }
    op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
    ++op_index;
#endif
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_of_block0_[i]->Type() == "conv2d" ||
        ops_of_block0_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_of_block0_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_of_block0_[i]->Type() + "_" + std::to_string(kernel_size)] +=
          timeCost;
    } else {
      _tp[ops_of_block0_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

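// FPGA-specific helpers: data is injected into and fetched from scope
// variables directly, and the graph can be run over a [start, end) op range.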
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed0");
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
  auto input_size = v.size();
  int index = 0;
  auto vars = program_.scope->VarContain("feed", &index);
  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
                        "input data number not correct");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i + index);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->external_data = v[i];
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  int index = 0;
  auto vars = program_.scope->VarContain("fetch", &index);
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output data number not correct");

  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i + index);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor->template data<float>();
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetTensorResults(
    std::vector<framework::Tensor *> *v) {
  int index = 0;
  auto vars = program_.scope->VarContain("fetch", &index);
  auto output_size = vars.size();
  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i + index);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    v->push_back(fetch_tensor);
  }
}

template <typename Device, typename T>
framework::Tensor *Executor<Device, T>::GetTensorByName(
    const std::string &name) {
  auto var = program_.scope->Var(name);
  return var->template GetMutable<LoDTensor>();
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block0_;

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block0_;
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
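// OpenCL specializations: parameters are staged through host float buffers
// and uploaded into CLImage objects. The CL LoadMemory below also flushes
// tiny values (|v| < 1e-30) to zero while converting.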
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}

template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  auto &dim = input.dims();
  input_dim_last_ = static_cast<DDim>(dim);
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2. lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  char *data = origin_data;

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework