executor.cpp 25.3 KB
Newer Older
W
wangliu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

qnqinan's avatar
qnqinan 已提交
15
#include "framework/executor.h"
D
dolphin8 已提交
16
#include <algorithm>
17
#include <utility>
W
wangliu 已提交
18
#include <vector>
L
liuruilong 已提交
19
#include "common/enforce.h"
L
liuruilong 已提交
20
#include "common/log.h"
L
liuruilong 已提交
21
#include "framework/framework.pb-c.h"
L
liuruilong 已提交
22 23
#include "framework/lod_tensor.h"
#include "framework/operator.h"
L
liuruilong 已提交
24
#include "framework/program/program-optimize/program_optimize.h"
L
liuruilong 已提交
25 26 27 28
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
qnqinan's avatar
qnqinan 已提交
29 30
<<<<<<< HEAD:src/framework/executor.cpp
=======
Z
zhangyang 已提交
31
#include "memory/t_malloc.h"
qnqinan's avatar
qnqinan 已提交
32
>>>>>>> upstream/develop:src/framework/executor.cpp
qnqinan's avatar
qnqinan 已提交
33 34 35 36 37 38 39 40 41 42

#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
W
wangliu 已提交
43 44

namespace paddle_mobile {
qnqinan's avatar
qnqinan 已提交
45
namespace framework {
46

qnqinan's avatar
qnqinan 已提交
47
using framework::Variable;
W
wangliu 已提交
48 49
using framework::Variable;

qnqinan's avatar
qnqinan 已提交
50 51
#pragma mark - executor

W
wangliu 已提交
52
template <typename Dtype, Precision P>
H
hjchen2 已提交
53
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
54
                             const bool use_optimize, const bool loddable)
H
hjchen2 已提交
55 56 57 58
    : program_(p),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      loddable_(loddable) {
W
wangliu 已提交
59
  Variable *variable_ptr = program_.scope->Var("batch_size");
H
hjchen2 已提交
60
  variable_ptr->SetValue<int>(batch_size);
Refine  
陈后江 已提交
61 62
  to_predict_program_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
63 64
  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                        "to_predict_program_ == NULL!");
65
  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
W
wangliu 已提交
66
      to_predict_program_->Blocks();
67 68

  DLOG << "executor in loaddable mode: " << loddable_;
W
wangliu 已提交
69 70 71 72 73
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
74
      DLOG << "create op: " << op->Type();
W
wangliu 已提交
75 76 77
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
Refine  
陈后江 已提交
78 79
      // infer shape to reshape tensor before predict,
      // but for lod tensor, it will need to reshape in runtime
xiebaiyuan's avatar
xiebaiyuan 已提交
80 81 82
      if (!loddable_) {
        op_base->InferShape();
      }
W
wangliu 已提交
83 84 85
      ops_of_block_[*block_desc.get()].push_back(op_base);
    }
  }
W
wangliu 已提交
86
  if (program_.combined) {
L
liuruilong 已提交
87 88 89 90
    InitCombineMemory();
  } else {
    InitMemory();
  }
L
liuruilong 已提交
91
  std::shared_ptr<framework::BlockDesc> to_predict_block =
L
liuruilong 已提交
92
      to_predict_program_->Block(0);
Z
zhangyang 已提交
93
  int i = 0;
L
liuruilong 已提交
94
  auto &ops = ops_of_block_[*to_predict_block.get()];
L
liuruilong 已提交
95
  for (const auto &op : ops) {
Z
zhangyang 已提交
96
    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
L
liuruilong 已提交
97 98
    op->Init();
  }
W
wangliu 已提交
99 100
}

101
template <typename Dtype>
102 103
static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
                            bool quant_uint8 = false) {
Refine  
陈后江 已提交
104
  char **data_buf = reinterpret_cast<char **>(data);
105
  int64_t size = tensor->numel();
106
  Dtype *tensor_data = tensor->mutable_data<Dtype>();
107 108
  if (quant_uint8) {
    // should be moved into operator init function
109 110
    float min_value;
    float max_value;
Z
zhangyang 已提交
111 112
    memory::Copy(&min_value, data_buf, sizeof(float));
    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
113 114
    data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
115
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
116 117
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
W
wangliu 已提交
118
    }
119 120
    data_buf += size * sizeof(uint8_t);
  } else {
Z
zhangyang 已提交
121
    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
Refine  
陈后江 已提交
122
    *data_buf += size * sizeof(Dtype);
L
liuruilong 已提交
123
  }
124
}
W
wangliu 已提交
125

126
template <typename Dtype, Precision P>
Refine  
陈后江 已提交
127
void Executor<Dtype, P>::LoadMemory(
128 129 130
    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
    framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
131
  // version
132
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
Refine  
陈后江 已提交
133
  *data_buf += sizeof(uint32_t);
134
  // lod information
H
hjchen2 已提交
135 136
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
Z
zhangyang 已提交
137
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
Refine  
陈后江 已提交
138
  *data_buf += sizeof(uint64_t);
139 140 141 142

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
143
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
Refine  
陈后江 已提交
144
    *data_buf += sizeof(uint64_t);
145
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
Z
zhangyang 已提交
146
    memory::Copy(tmp_dim.data(), *data_buf, size);
147
    (*lod)[i] = std::move(tmp_dim);
Refine  
陈后江 已提交
148
    *data_buf += size;
W
wangliu 已提交
149
  }
150
  // tensor version
151
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
Refine  
陈后江 已提交
152
  *data_buf += sizeof(uint32_t);
153
  // tensor desc size
154
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
Refine  
陈后江 已提交
155
  *data_buf += sizeof(int32_t);
156
  // skip tensor desc
Refine  
陈后江 已提交
157
  *data_buf += tensor_desc_size;
158

Refine  
陈后江 已提交
159
  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
160 161 162
  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
W
wangliu 已提交
163
    case framework::VARTYPE_TYPE_FP32:
164 165
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
W
wangliu 已提交
166
      break;
167
    case framework::VARTYPE_TYPE_INT8:
168
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
W
wangliu 已提交
169 170
      break;
    case framework::VARTYPE_TYPE_INT32:
171
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
W
wangliu 已提交
172 173
      break;
    default:
174
      LOG(kLOG_ERROR) << "data type is not supported";
L
liuruilong 已提交
175
  }
W
wangliu 已提交
176 177 178 179 180 181 182
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
183
      auto tensor = var->template GetMutable<framework::LoDTensor>();
W
wangliu 已提交
184 185 186 187
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
Refine  
陈后江 已提交
188
        char *origin_data =
Refine  
陈后江 已提交
189
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
Refine  
陈后江 已提交
190
        char *data = origin_data;
191 192
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
W
wangliu 已提交
193 194
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
195
          varInputMemory(var_desc, var, tensor);
W
wangliu 已提交
196 197 198 199 200 201
        }
      }
    }
  }
}

L
liuruilong 已提交
202
template <typename Dtype, Precision P>
L
liuruilong 已提交
203
void Executor<Dtype, P>::InitCombineMemory() {
Refine  
陈后江 已提交
204
  char *origin_data = nullptr;
Refine  
陈后江 已提交
205
  bool self_alloc = false;
206
  if (program_.combined_params_buf && program_.combined_params_len) {
207 208
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
209
  } else {
Refine  
陈后江 已提交
210
    self_alloc = true;
Refine  
陈后江 已提交
211
    origin_data = ReadFileToBuff(program_.para_path);
212
  }
Refine  
陈后江 已提交
213 214
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
L
liuruilong 已提交
215 216 217
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
218
      auto tensor = var->template GetMutable<framework::LoDTensor>();
L
liuruilong 已提交
219 220 221 222
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
223
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
L
liuruilong 已提交
224 225
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
226
          varInputMemory(var_desc, var, tensor);
L
liuruilong 已提交
227 228 229 230
        }
      }
    }
  }
Refine  
陈后江 已提交
231
  if (self_alloc) {
232
    delete[] origin_data;
Refine  
陈后江 已提交
233 234
  }
  LOG(kLOG_INFO) << "init combine memory finish";
L
liuruilong 已提交
235
}
236

xiebaiyuan's avatar
xiebaiyuan 已提交
237 238 239 240
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
    framework::LoDTensor *tensor) const {
241 242
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
Refine  
陈后江 已提交
243
    case framework::VARTYPE_TYPE_FP32:
244
      tensor->mutable_data<float>();
xiebaiyuan's avatar
xiebaiyuan 已提交
245
      break;
Refine  
陈后江 已提交
246
    case framework::VARTYPE_TYPE_INT8:
247
      tensor->mutable_data<int8_t>();
Refine  
陈后江 已提交
248 249
      break;
    case framework::VARTYPE_TYPE_INT32:
250
      tensor->mutable_data<int32_t>();
xiebaiyuan's avatar
xiebaiyuan 已提交
251
      break;
Refine  
陈后江 已提交
252
    case framework::VARTYPE_TYPE_INT64:
253
      tensor->mutable_data<int64_t>();
xiebaiyuan's avatar
xiebaiyuan 已提交
254
      break;
Refine  
陈后江 已提交
255
    default:
xiebaiyuan's avatar
xiebaiyuan 已提交
256 257
      break;
  }
Refine  
陈后江 已提交
258
  bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
259 260 261
                       (type == framework::VARTYPE_TYPE_INT8) ||
                       (type == framework::VARTYPE_TYPE_INT32) ||
                       (type == framework::VARTYPE_TYPE_INT64);
Refine  
陈后江 已提交
262
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
xiebaiyuan's avatar
xiebaiyuan 已提交
263 264
  return is_mute_match;
}
L
liuruilong 已提交
265

W
wangliu 已提交
266
template <typename Dtype, Precision P>
W
wangliu 已提交
267 268
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t) {
W
wangliu 已提交
269 270 271 272 273 274
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  std::shared_ptr<framework::BlockDesc> to_predict_block =
W
wangliu 已提交
275
      to_predict_program_->Block(0);
D
dolphin8 已提交
276
  auto &ops = ops_of_block_[*to_predict_block.get()];
xiebaiyuan's avatar
xiebaiyuan 已提交
277

D
dolphin8 已提交
278
#ifdef PADDLE_MOBILE_PROFILE
D
dolphin8 已提交
279
  std::vector<ProfInfo> profile(ops.size());
D
dolphin8 已提交
280
#endif
D
dolphin8 已提交
281
  for (int i = 0; i < ops.size(); i++) {
D
dolphin8 已提交
282
#ifdef PADDLE_MOBILE_PROFILE
D
dolphin8 已提交
283 284 285 286
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
L
liuruilong 已提交
287
    // to Run
D
dolphin8 已提交
288 289 290 291 292
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
D
dolphin8 已提交
293
  }
W
wangliu 已提交
294 295 296 297 298 299 300
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
D
dolphin8 已提交
301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
321 322 323
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
D
dolphin8 已提交
324 325 326
  }
  printf("====================[---------]======================\n");
#endif
L
liuruilong 已提交
327
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
W
wangliu 已提交
328
}
xiebaiyuan's avatar
xiebaiyuan 已提交
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400

template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::LoDTensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  feed_tensor->set_lod(t.lod());

  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);

  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (loddable_) {
      ops[i]->InferShape();
    }
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();

  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::LoDTensor>(
      framework::LoDTensor(*output_tensor));
}

W
wangliu 已提交
401 402 403 404
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t, int block_id) {
  return Predict(t);
W
wangliu 已提交
405 406 407
}

template <typename Dtype, Precision P>
L
liuruilong 已提交
408
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
W
wangliu 已提交
409 410
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
W
wangliu 已提交
411
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
qnqinan's avatar
qnqinan 已提交
412 413 414 415 416 417 418 419 420 421 422
  if (output_tensor != nullptr) {
    Executor<Dtype, P>::Ptype *output_ptr =
        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
    for (int j = 0; j < output_tensor->numel(); ++j) {
      result_vector.push_back(output_ptr[j]);
    }
    return result_vector;
  } else {
    DLOG << "return  empty vector";
    return {};
W
wangliu 已提交
423
  }
W
wangliu 已提交
424 425
}

426 427
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
428
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
H
hjchen2 已提交
429
                                        std::string var_name) {
430
  framework::Variable *g_feed_value = program_.scope->Var(var_name);
431 432 433 434
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
435
}
436

437 438 439
template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
  InjectVariable(t, "feed");
440
}
441

442
template <typename Dtype, Precision P>
443
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
444 445 446
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
447

Z
zhangyang 已提交
448 449 450 451 452
  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
453 454 455
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
456
}
457 458 459 460 461 462

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
463
  end = end < 0 ? static_cast<int>(ops.size()) : end;
464 465 466 467 468 469 470 471 472 473 474 475
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
Z
zhangyang 已提交
476
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
477 478 479 480 481 482 483
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
484
}
485 486 487 488

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
  Predict_From_To(start);
489
}
490 491 492 493

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
  Predict_From_To(0, end);
494
}
495 496
#endif

qnqinan's avatar
qnqinan 已提交
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719
#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                    float *tensorInput, char **data) {}

template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
    const framework::VarDesc var_desc, float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  //            int type_size = 0;
  //            switch (desc.DataType()) {
  //                case framework::VARTYPE_TYPE_FP16:
  //                    type_size = 2;
  //                    break;
  //                case framework::VARTYPE_TYPE_FP32:
  //                    type_size = 4;
  //                    memory = tensor->mutable_data<float>();
  //                    break;
  //                case framework::VARTYPE_TYPE_FP64:
  //                    type_size = 8;
  //                    break;
  //                case framework::VARTYPE_TYPE_INT32:
  //                    memory = tensor->mutable_data<int32_t>();
  //                    type_size = 4;
  //                    break;
  //                case framework::VARTYPE_TYPE_INT64:
  //                    type_size = 8;
  //                    break;
  //                case framework::VARTYPE_TYPE_BOOL:
  //                    type_size = 1;
  //                    break;
  //                default:
  //                    break;
  //            }
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        framework::DDim ddim = framework::make_ddim(desc.Dims());

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        delete origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<framework::CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const framework::TensorDesc &desc = var_desc->Tensor_desc();
          //          framework::DDim ddim = framework::make_ddim(desc.Dims());
          framework::DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  float *data = reinterpret_cast<float *>(origin_data);

  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = framework::make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<framework::CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = cl_image->dims();
        //        framework::DDim ddim = framework::make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

W
wangliu 已提交
720
template class Executor<CPU, Precision::FP32>;
qnqinan's avatar
qnqinan 已提交
721

L
liuruilong 已提交
722
template class Executor<FPGA, Precision::FP32>;
W
wangliu 已提交
723

qnqinan's avatar
qnqinan 已提交
724 725 726 727 728
template class Executor<GPU_CL, Precision::FP32>;

template class Executor<GPU_MALI, Precision::FP32>;

}  // namespace framework
W
wangliu 已提交
729
}  // namespace paddle_mobile