/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

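// Builds the executor for `program`: registers the batch size in the scope,
// selects the optimized or the original program description, instantiates
// the operators of block 0 (running InferShape up front unless lod_mode is
// set), loads the weights, sizes the feed/fetch lists, and calls Init() on
// every op.
//
// Minimal usage sketch (illustrative only; obtaining `program` and `config`
// from the model loader is outside this file):
//
//   Executor<CPU, float> executor(program, config, /*batch_size=*/1,
//                                 /*use_optimize=*/true, /*lod_mode=*/false);
//   std::vector<float> input(1 * 3 * 224 * 224);
//   std::vector<float> output = executor.Predict(input, {1, 3, 224, 224});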
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();

  std::shared_ptr<BlockDesc> block_desc = blocks[0];
  std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
  for (int j = 0; j < ops.size(); ++j) {
    std::shared_ptr<OpDesc> op_desc = ops[j];
    DLOG << "create op: " << op_desc->Type();

    auto op_handler = OpRegistry<Device>::CreateOp(
        op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
        op_desc->GetAttrMap(), program_.scope.get());
    // Infer shapes to resize inputs and outputs before prediction;
    // in lod mode, shapes must still be inferred at runtime.
    if (!lod_mode) {
      op_handler->InferShape();
    }
    ops_of_block0_.push_back(op_handler);
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
  // resize feed and fetch list
  InitFeedFetchList();

  int count = 0;
  for (auto &op_handler : ops_of_block0_) {
    DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
    op_handler->Init();
  }
}

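// Scans every block for feed/fetch ops, records the `col` index registered
// for each input/output name, and sizes the "feed" and "fetch" tensor arrays
// in the scope accordingly.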
template <typename Device, typename T>
void Executor<Device, T>::InitFeedFetchList() {
  std::unordered_map<std::string, int> feed_indices, fetch_indices;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &op_desc : block->Ops()) {
      if (op_desc->Type() == "feed") {
        std::string name = op_desc->Output("Out")[0];
        feed_indices[name] = op_desc->GetAttr("col").Get<int>();
      } else if (op_desc->Type() == "fetch") {
        std::string name = op_desc->Input("X")[0];
        fetch_indices[name] = op_desc->GetAttr("col").Get<int>();
      }
    }
  }
  feed_indices_.swap(feed_indices);
  fetch_indices_.swap(fetch_indices);

  auto *feed_var = program_.scope->Var("feed");
  auto *feed_list = feed_var->template GetMutable<framework::LoDTensorArray>();
  feed_list->resize(feed_indices_.size());

  auto *fetch_var = program_.scope->Var("fetch");
  auto *fetch_list =
      fetch_var->template GetMutable<framework::LoDTensorArray>();
  fetch_list->resize(fetch_indices_.size());
}

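// Copies one tensor payload out of the in-memory parameter stream, advancing
// the stream pointer. With quant_uint8 set, the payload is two floats
// (min, max) followed by uint8 values that are dequantized as
// value * (max - min) / 255 + min.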
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // TODO: this dequantization should be moved into the operator's Init
    // function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

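// Parses one serialized LoDTensor from the parameter stream. The layout is a
// uint32 version, a uint64 lod level followed by the lod spans, a uint32
// tensor version, an int32 tensor-desc size, the (skipped) tensor desc, and
// finally the raw tensor data, which LoadMemInternal copies out.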
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

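// Loads every persistable variable from its own file under
// program_.model_path; non-persistable variables only get memory prepared.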
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        auto tensor = var->template GetMutable<LoDTensor>();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        DLOG << "init non-persistable var: " << var_desc->Name();
        varInputMemory(var_desc, var);
      }
    }
  }
}

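// Like InitMemory(), but all parameters live in one combined block: either an
// externally provided buffer or a single file read from program_.para_path.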
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr,
                        "origin_data should not be nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();
        auto tensor = var->template GetMutable<LoDTensor>();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        DLOG << " init combine memory non-persistable: " << var_desc->Name();
        varInputMemory(var_desc, var);
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

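// Rebuilds the memory of non-persistable lod tensors after the input shape
// changed: batch and channel dims are kept, height/width are taken from the
// new input, and the "fetch" output is resized to match.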
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        } else {
          PADDLE_MOBILE_THROW_EXCEPTION("Unsupported var type `%d`",
                                        var_desc->Type());
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

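// Prepares scope memory for a non-persistable variable according to its
// declared var type and tensor data type.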
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
#ifdef PADDLE_MOBILE_FPGA
  auto *tensor = var->template GetMutable<LoDTensor>();
  tensor->init(typeid(float));
  return true;
#endif
  auto TypeId = [](const VarType_Type &type) -> std::type_index {
    switch (type) {
      case VARTYPE_TYPE_BOOL:
        return typeid(bool);
      case VARTYPE_TYPE_FP32:
        return typeid(float);
      case VARTYPE_TYPE_INT8:
        return typeid(int8_t);
      case VARTYPE_TYPE_INT32:
        return typeid(int);
      case VARTYPE_TYPE_INT64:
        return typeid(int64_t);
      default:
        PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
    }
  };

  auto type = var_desc->Type();
  if (type == VARTYPE_TYPE_LOD_TENSOR) {
    auto data_type = var_desc->Tensor_desc().DataType();
    framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
    tensor->mutable_data(TypeId(data_type));
  } else if (type == VARTYPE_TYPE_STEP_SCOPES) {
    var->template GetMutable<std::vector<framework::Scope *>>();
  } else if (type == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
    var->template GetMutable<framework::LoDTensorArray>();
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
  }
  return true;
}

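// Feeds every (name, tensor) pair, then runs the whole program once.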
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

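// Convenience overload for single-input, single-output models: wraps the raw
// buffer in a feed tensor, runs prediction, and copies the first fetch
// target back into a plain vector.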
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  PADDLE_MOBILE_ENFORCE(feed_indices_.size() != 0,
                        "We don't know which tensor to assign, since no "
                        "feed op was found in this model");
  PADDLE_MOBILE_ENFORCE(fetch_indices_.size() != 0,
                        "We don't know which tensor to fetch, since no "
                        "fetch op was found in this model");
  std::string input_name = feed_indices_.begin()->first;
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, input_name);
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    std::string output_name = fetch_indices_.begin()->first;
    const auto output_tensor = GetOutput(output_name);
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

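// Binds an input either to its slot in the "feed" tensor array or, for any
// other name, directly to the variable in the scope. When weights are loaded
// lazily, a shape change triggers InitNoPersistableMemory().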
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  framework::LoDTensor *target = nullptr;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    int index = feed_indices_.find(var_name)->second;
    auto *feed_var = program_.scope->Var("feed");
    target = &(
        feed_var->template GetMutable<framework::LoDTensorArray>()->at(index));
  } else {
    auto *target_var = program_.scope->FindVar(var_name);
    PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                          var_name.c_str());
    target = target_var->template GetMutable<LoDTensor>();
  }
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target->Resize(input.dims());
  target->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  framework::LoDTensor *target = nullptr;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    int index = feed_indices_.find(var_name)->second;
    auto *feed_var = program_.scope->Var("feed");
    target = &(
        feed_var->template GetMutable<framework::LoDTensorArray>()->at(index));
  } else {
    auto *target_var = program_.scope->FindVar(var_name);
    PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                          var_name.c_str());
    target = target_var->template GetMutable<LoDTensor>();
  }
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target->Resize(input.dims());
  target->ShareDataWith(input);
  target->set_lod(input.lod());
}

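// Returns a copy of a fetch target, resolved through the "fetch" tensor
// array when the name belongs to a fetch op, or looked up in the scope
// otherwise.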
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  framework::LoDTensor *target = nullptr;
  if (fetch_indices_.find(var_name) != fetch_indices_.end()) {
    int index = fetch_indices_.find(var_name)->second;
    auto *fetch_var = program_.scope->Var("fetch");
    target = &(
        fetch_var->template GetMutable<framework::LoDTensorArray>()->at(index));
  } else {
    auto *target_var = program_.scope->FindVar(var_name);
    PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                          var_name.c_str());
    target = target_var->template GetMutable<LoDTensor>();
  }
  return std::make_shared<LoDTensor>(*target);
}

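// Runs the ops of block 0 in order; in lod mode shapes are re-inferred before
// each op. With PADDLE_MOBILE_PROFILE defined, per-op wall-clock time is
// collected and a summary table is printed afterwards.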
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_of_block0_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &op_handler : ops_of_block0_) {
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (lod_mode_) {
      op_handler->InferShape();
    }
    op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
    ++op_index;
#endif
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_of_block0_[i]->Type() == "conv2d" ||
        ops_of_block0_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_of_block0_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_of_block0_[i]->Type() + "_" + std::to_string(kernel_size)] +=
          timeCost;
    } else {
      _tp[ops_of_block0_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

#ifdef PADDLE_MOBILE_FPGA
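// Copies a tensor into an arbitrary scope variable; used on FPGA to push
// data straight into the graph.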
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

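// Returns a copy of the first output of op `id`, or of the last op when `id`
// is negative.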
template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block0_;

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

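// Runs ops [start, end) of block 0; a negative `end` means "through the
// last op".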
template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block0_;
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
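// GPU_CL specialization: non-persistable lod tensors are backed by CLImages,
// which are resized and re-initialized as empty images when the input shape
// changes.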
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}
template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  input_dim_last_ = input.dims();
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

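// GPU_CL specialization of LoadMemory: walks the same serialized tensor
// layout as the generic loader, dequantizes uint8 payloads when
// quantification is enabled, and copies the floats into tensorInput,
// flushing denormal values to zero.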
template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2. lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
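      // Flush values in the denormal range (|value| < 1e-30) to zero.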
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

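// GPU_CL InitMemory: persistable tensors are read from per-variable files
// and staged into CLImages; other lod-tensor variables get empty CLImages of
// their current shape.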
template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // The CLImage has no backing data yet; attach the raw tensor data.
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

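// GPU_CL InitCombineMemory: like the CPU version, but parameters are staged
// into CLImages after being read from the combined buffer or file.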
template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr,
                        "origin_data should not be nullptr");
  char *data = origin_data;

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // The CLImage has no backing data yet; attach the raw tensor data.
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile