/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include "common/threadpool.h"
#endif

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

using framework::Variable;

#pragma mark - executor

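// Builds an executor for a loaded program: registers the batch size in the
// scope, creates every op described by the program desc, pre-runs
// InferShape() when the graph is not loddable, loads the persistable
// parameters (combined file or one file per variable), and finally calls
// Init() on each op of block 0.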
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                             const bool use_optimize, const bool loddable)
    : program_(p),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      loddable_(loddable) {
  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);
  to_predict_program_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                        "to_predict_program_ == NULL!");
  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
      to_predict_program_->Blocks();

  DLOG << "executor in loddable mode: " << loddable_;
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
      DLOG << "create op: " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
      // infer shape here to reshape the tensor before prediction; a LoD
      // tensor must instead be reshaped at runtime
      if (!loddable_) {
        op_base->InferShape();
      }
      ops_of_block_[*block_desc.get()].push_back(op_base);
    }
  }
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  int i = 0;
  auto &ops = ops_of_block_[*to_predict_block.get()];
  for (const auto &op : ops) {
    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
    op->Init();
  }
}

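// Copies one tensor's payload out of the parameter byte stream and
// advances the stream pointer. When quant_uint8 is set, the payload is
// stored as uint8 values plus a (min, max) range and is dequantized as
//   value = uint8 * (max - min) / 255 + min.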
template <typename Dtype>
static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  Dtype *tensor_data = tensor->mutable_data<Dtype>();
  if (quant_uint8) {
    // should be moved into the operator's init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
    *data_buf += size * sizeof(Dtype);
  }
}

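// Parses one serialized parameter record: a uint32 version, the LoD level
// and LoD vectors, a uint32 tensor version, a serialized TensorDesc
// (skipped here; the dims are taken from the VarDesc instead), and finally
// the raw tensor data, dispatched on the declared data type.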
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(
    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
    framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case framework::VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case framework::VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case framework::VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

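// Loads every persistable parameter from its own file under
// program_.model_path, and allocates memory for the remaining LoD-tensor
// variables.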
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

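// Same as InitMemory(), but reads all parameters sequentially from a
// single combined file (program_.para_path) or from an externally
// supplied buffer (program_.combined_params_buf).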
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

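// Allocates backing storage for a non-persistable (input/output) tensor
// according to its declared data type; only FP32/INT8/INT32/INT64 are
// supported.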
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
    framework::LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case framework::VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case framework::VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case framework::VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case framework::VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
                       (type == framework::VARTYPE_TYPE_INT8) ||
                       (type == framework::VARTYPE_TYPE_INT32) ||
                       (type == framework::VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

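// Shares the input tensor into the "feed" variable, runs every op of
// block 0 in order (collecting per-op timings when PADDLE_MOBILE_PROFILE
// is defined), and returns a copy of the first output of the last op.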
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    // run this op
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}

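// LoD-aware variant of Predict(): also propagates the input's LoD to the
// feed tensor and, in loddable mode, re-runs InferShape() before each op
// because shapes can change at runtime.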
template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::LoDTensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  feed_tensor->set_lod(t.lod());

  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);

  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (loddable_) {
      ops[i]->InferShape();
    }
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();

  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::LoDTensor>(
      framework::LoDTensor(*output_tensor));
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t, int block_id) {
  return Predict(t);
}

template <typename Dtype, Precision P>
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
  if (output_tensor != nullptr) {
    Executor<Dtype, P>::Ptype *output_ptr =
        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
    for (int j = 0; j < output_tensor->numel(); ++j) {
      result_vector.push_back(output_ptr[j]);
    }
    return result_vector;
  } else {
    DLOG << "return empty vector";
    return {};
  }
}
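
// Minimal usage sketch for the Predict() overloads above. Paths and shapes
// are illustrative only, and the loading step assumes this project's
// companion framework::Loader class:
//
//   framework::Loader<CPU> loader;
//   auto program = loader.Load("./model_dir");
//   framework::Executor<CPU, Precision::FP32> executor(program, 1, true,
//                                                      false);
//   std::vector<float> input(1 * 3 * 224 * 224, 0.f);
//   std::vector<float> output = executor.Predict(input, {1, 3, 224, 224});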

#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
                                        std::string var_name) {
  framework::Variable *g_feed_value = program_.scope->Var(var_name);
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}

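// Runs only the ops in [start, end) of block 0; a negative end means "up
// to the last op". Together with FeedData()/FetchResult() this supports
// running the model in stages on the FPGA build.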
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                    float *tensorInput, char **data) {}

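// GPU_CL specialization: parses the same parameter record as the CPU path
// but writes the (optionally dequantized) weights into a host-side float
// buffer that is later uploaded into a CLImage; values with magnitude
// below 1e-30 are flushed to zero.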
template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
    const framework::VarDesc var_desc, float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2. LoD information
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data, sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  // int type_size = 0;
  // switch (desc.DataType()) {
  //   case framework::VARTYPE_TYPE_FP16:
  //     type_size = 2;
  //     break;
  //   case framework::VARTYPE_TYPE_FP32:
  //     type_size = 4;
  //     memory = tensor->mutable_data<float>();
  //     break;
  //   case framework::VARTYPE_TYPE_FP64:
  //     type_size = 8;
  //     break;
  //   case framework::VARTYPE_TYPE_INT32:
  //     memory = tensor->mutable_data<int32_t>();
  //     type_size = 4;
  //     break;
  //   case framework::VARTYPE_TYPE_INT64:
  //     type_size = 8;
  //     break;
  //   case framework::VARTYPE_TYPE_BOOL:
  //     type_size = 1;
  //     break;
  //   default:
  //     break;
  // }
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        framework::DDim ddim = framework::make_ddim(desc.Dims());

        // the CLImage has not been initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<framework::CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const framework::TensorDesc &desc = var_desc->Tensor_desc();
          //          framework::DDim ddim = framework::make_ddim(desc.Dims());
          framework::DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

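// GPU_CL specialization of InitCombineMemory(): reads all parameters from
// the combined buffer/file, stages each one in a host float buffer, and
// hands it to the corresponding CLImage; non-persistable variables get an
// empty CLImage sized from cl_image->dims().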
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  // keep the start address; origin_data itself is advanced by LoadMemory
  char *data = origin_data;

  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = framework::make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // the CLImage has not been initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<framework::CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = cl_image->dims();
        //        framework::DDim ddim = framework::make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, Precision::FP32>;

template class Executor<FPGA, Precision::FP32>;

template class Executor<GPU_CL, Precision::FP32>;

template class Executor<GPU_MALI, Precision::FP32>;

}  // namespace framework
}  // namespace paddle_mobile