executor.cpp 25.9 KB
Newer Older
W
wangliu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include "framework/executor.h"
D
dolphin8 已提交
16
#include <algorithm>
17
#include <utility>
W
wangliu 已提交
18
#include <vector>
L
liuruilong 已提交
19
#include "common/enforce.h"
L
liuruilong 已提交
20
#include "common/log.h"
L
liuruilong 已提交
21
#include "framework/framework.pb-c.h"
L
liuruilong 已提交
22 23
#include "framework/lod_tensor.h"
#include "framework/operator.h"
L
liuruilong 已提交
24
#include "framework/program/program-optimize/program_optimize.h"
L
liuruilong 已提交
25 26 27 28
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
Z
zhangyang 已提交
29
#include "memory/t_malloc.h"
L
update  
liuruilong 已提交
30

D
dolphin8 已提交
31
#ifdef PADDLE_EXECUTOR_MULTITHREAD
D
dolphin8 已提交
32 33 34
#include <queue>
#include "common/threadpool.h"
#endif
W
wangliu 已提交
35

L
update  
liuruilong 已提交
36 37 38
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
W
wangliu 已提交
39 40

namespace paddle_mobile {
41
namespace framework {
42

W
wangliu 已提交
43
using framework::Variable;
L
liuruilong 已提交
44
using framework::Variable;
W
wangliu 已提交
45 46 47 48

#pragma mark - executor

template <typename Dtype, Precision P>
H
hjchen2 已提交
49
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
50
                             const bool use_optimize, const bool loddable)
H
hjchen2 已提交
51 52 53 54
    : program_(p),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      loddable_(loddable) {
W
wangliu 已提交
55
  Variable *variable_ptr = program_.scope->Var("batch_size");
H
hjchen2 已提交
56
  variable_ptr->SetValue<int>(batch_size);
Refine  
陈后江 已提交
57 58
  to_predict_program_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
59 60
  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                        "to_predict_program_ == NULL!");
61
  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
W
wangliu 已提交
62
      to_predict_program_->Blocks();
63 64

  DLOG << "executor in loaddable mode: " << loddable_;
W
wangliu 已提交
65 66 67 68 69
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
70
      DLOG << "create op: " << op->Type();
W
wangliu 已提交
71 72 73
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
Refine  
陈后江 已提交
74
      // infer shape to reshape tensor before predict,
75
      // but for lod tensor, it will still need to reshape in runtime
xiebaiyuan's avatar
xiebaiyuan 已提交
76 77 78
      if (!loddable_) {
        op_base->InferShape();
      }
W
wangliu 已提交
79 80 81
      ops_of_block_[*block_desc.get()].push_back(op_base);
    }
  }
W
wangliu 已提交
82
  if (program_.combined) {
L
liuruilong 已提交
83 84 85 86
    InitCombineMemory();
  } else {
    InitMemory();
  }
L
liuruilong 已提交
87
  std::shared_ptr<framework::BlockDesc> to_predict_block =
L
liuruilong 已提交
88
      to_predict_program_->Block(0);
Z
zhangyang 已提交
89
  int i = 0;
L
liuruilong 已提交
90
  auto &ops = ops_of_block_[*to_predict_block.get()];
L
liuruilong 已提交
91
  for (const auto &op : ops) {
Z
zhangyang 已提交
92
    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
L
liuruilong 已提交
93 94
    op->Init();
  }
W
wangliu 已提交
95 96
}

97
template <typename Dtype>
98 99
static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
                            bool quant_uint8 = false) {
Refine  
陈后江 已提交
100
  char **data_buf = reinterpret_cast<char **>(data);
101
  int64_t size = tensor->numel();
102
  Dtype *tensor_data = tensor->mutable_data<Dtype>();
103 104
  if (quant_uint8) {
    // should be moved into operator init function
105 106
    float min_value;
    float max_value;
Z
zhangyang 已提交
107 108
    memory::Copy(&min_value, data_buf, sizeof(float));
    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
109 110
    data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
111
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
112 113
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
W
wangliu 已提交
114
    }
115 116
    data_buf += size * sizeof(uint8_t);
  } else {
Z
zhangyang 已提交
117
    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
Refine  
陈后江 已提交
118
    *data_buf += size * sizeof(Dtype);
L
liuruilong 已提交
119
  }
120
}
W
wangliu 已提交
121

122
template <typename Dtype, Precision P>
Refine  
陈后江 已提交
123
void Executor<Dtype, P>::LoadMemory(
124 125 126
    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
    framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
127
  // version
128
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
Refine  
陈后江 已提交
129
  *data_buf += sizeof(uint32_t);
130
  // lod information
H
hjchen2 已提交
131 132
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
Z
zhangyang 已提交
133
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
Refine  
陈后江 已提交
134
  *data_buf += sizeof(uint64_t);
135 136 137 138

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
139
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
Refine  
陈后江 已提交
140
    *data_buf += sizeof(uint64_t);
141
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
Z
zhangyang 已提交
142
    memory::Copy(tmp_dim.data(), *data_buf, size);
143
    (*lod)[i] = std::move(tmp_dim);
Refine  
陈后江 已提交
144
    *data_buf += size;
W
wangliu 已提交
145
  }
146
  // tensor version
147
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
Refine  
陈后江 已提交
148
  *data_buf += sizeof(uint32_t);
149
  // tensor desc size
150
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
Refine  
陈后江 已提交
151
  *data_buf += sizeof(int32_t);
152
  // skip tensor desc
Refine  
陈后江 已提交
153
  *data_buf += tensor_desc_size;
154

Refine  
陈后江 已提交
155
  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
156 157 158
  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
W
wangliu 已提交
159
    case framework::VARTYPE_TYPE_FP32:
160 161
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
W
wangliu 已提交
162
      break;
163
    case framework::VARTYPE_TYPE_INT8:
164
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
W
wangliu 已提交
165 166
      break;
    case framework::VARTYPE_TYPE_INT32:
167
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
W
wangliu 已提交
168 169
      break;
    default:
170
      LOG(kLOG_ERROR) << "data type is not supported";
L
liuruilong 已提交
171
  }
W
wangliu 已提交
172 173 174 175 176 177 178
}

// Prepares the tensor of every variable of the program when each parameter is
// stored in its own file under program_.model_path.
//
// Persistable variables other than "feed"/"fetch" are loaded from
// "<model_path>/<variable name>"; non-persistable LoD tensors only get their
// buffer allocated through varInputMemory().
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      // The LoDTensor is created for every variable, including feed/fetch.
      auto tensor = var->template GetMutable<framework::LoDTensor>();

      if (!var_desc->Persistable()) {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
        continue;
      }

      if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
        continue;
      }

      // `file_start` keeps the address to free; `cursor` is moved forward by
      // LoadMemory while it parses.
      char *file_start =
          ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
      char *cursor = file_start;
      LoadMemory(reinterpret_cast<void **>(&cursor), var_desc, tensor);
      delete[] file_start;
    }
  }
}

L
liuruilong 已提交
198
template <typename Dtype, Precision P>
L
liuruilong 已提交
199
void Executor<Dtype, P>::InitCombineMemory() {
Refine  
陈后江 已提交
200
  char *origin_data = nullptr;
Refine  
陈后江 已提交
201
  bool self_alloc = false;
202
  if (program_.combined_params_buf && program_.combined_params_len) {
203 204
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
205
  } else {
Refine  
陈后江 已提交
206
    self_alloc = true;
Refine  
陈后江 已提交
207
    origin_data = ReadFileToBuff(program_.para_path);
208
  }
Refine  
陈后江 已提交
209 210
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
L
liuruilong 已提交
211 212 213
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
214
      auto tensor = var->template GetMutable<framework::LoDTensor>();
L
liuruilong 已提交
215 216 217 218
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
219
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
L
liuruilong 已提交
220 221
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
222
          varInputMemory(var_desc, var, tensor);
L
liuruilong 已提交
223 224 225 226
        }
      }
    }
  }
Refine  
陈后江 已提交
227
  if (self_alloc) {
228
    delete[] origin_data;
Refine  
陈后江 已提交
229 230
  }
  LOG(kLOG_INFO) << "init combine memory finish";
L
liuruilong 已提交
231
}
232

xiebaiyuan's avatar
xiebaiyuan 已提交
233 234 235 236
// Allocates the buffer of a non-persistable tensor according to the data type
// recorded in its descriptor.
//
// @param var_desc  descriptor whose tensor data type selects the element type
// @param var       scope variable owning the tensor (not used here)
// @param tensor    tensor whose buffer is requested through mutable_data<>()
// @return          true for FP32, INT8, INT32 and INT64; any other type fails
//                  the enforce below and yields false
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
    framework::LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();

  // Assume a supported type; the default label clears the flag.
  bool is_mute_match = true;
  switch (type) {
    case framework::VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case framework::VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case framework::VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case framework::VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      is_mute_match = false;
      break;
  }

  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}
L
liuruilong 已提交
261

W
wangliu 已提交
262
template <typename Dtype, Precision P>
W
wangliu 已提交
263 264
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t) {
W
wangliu 已提交
265 266 267 268 269 270
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  std::shared_ptr<framework::BlockDesc> to_predict_block =
W
wangliu 已提交
271
      to_predict_program_->Block(0);
D
dolphin8 已提交
272
  auto &ops = ops_of_block_[*to_predict_block.get()];
xiebaiyuan's avatar
xiebaiyuan 已提交
273

D
dolphin8 已提交
274
#ifdef PADDLE_MOBILE_PROFILE
D
dolphin8 已提交
275
  std::vector<ProfInfo> profile(ops.size());
D
dolphin8 已提交
276
#endif
D
dolphin8 已提交
277
  for (int i = 0; i < ops.size(); i++) {
D
dolphin8 已提交
278
#ifdef PADDLE_MOBILE_PROFILE
D
dolphin8 已提交
279 280 281 282
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
283 284 285
    if (loddable_) {
      ops[i]->InferShape();
    }
L
liuruilong 已提交
286
    // to Run
D
dolphin8 已提交
287 288 289 290 291
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
D
dolphin8 已提交
292
  }
W
wangliu 已提交
293 294 295 296 297 298 299
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
D
dolphin8 已提交
300 301 302 303 304
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
305 306 307 308 309 310 311 312 313
    if (ops[i]->Type() == "conv2d") {
      auto inputs = ops[i]->Inputs();
      auto *filter = framework::GetVarValue<framework::LoDTensor>(
          "Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops[i]->Type()] += timeCost;
    }
D
dolphin8 已提交
314 315 316 317 318 319 320 321 322 323 324 325 326 327
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
328 329 330
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
D
dolphin8 已提交
331 332 333
  }
  printf("====================[---------]======================\n");
#endif
L
liuruilong 已提交
334
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
W
wangliu 已提交
335
}
xiebaiyuan's avatar
xiebaiyuan 已提交
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382

// Runs one forward pass of block 0 on a LoD input tensor.
//
// Same flow as Predict(const Tensor &), except that the LoD of `t` is also
// copied onto the "feed" tensor and the result is returned as a LoDTensor.
// With PADDLE_MOBILE_PROFILE defined, the accumulated run time per op type is
// printed afterwards.
//
// @param t  input tensor with its LoD
// @return   a new LoDTensor copy-constructed from the output tensor, or
//           nullptr when block 0 contains no op
template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::LoDTensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  feed_tensor->set_lod(t.lod());

  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);

  auto &ops = ops_of_block_[*to_predict_block.get()];

  // Without this guard the "last op" lookup below would dereference the
  // reverse-begin iterator of an empty container.
  if (ops.size() == 0) {
    LOG(kLOG_ERROR) << "block 0 contains no op to run";
    return nullptr;
  }

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (size_t i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    // integer nanoseconds; avoids a round trip through double
    profile[i].runBegin =
        static_cast<uint64_t>(ts.tv_sec) * 1000000000ULL + ts.tv_nsec;
#endif
    if (loddable_) {
      ops[i]->InferShape();
    }
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd =
        static_cast<uint64_t>(ts.tv_sec) * 1000000000ULL + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();

  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  // Accumulate the time per op type; for conv2d the value of dims()[2] of its
  // second input is printed as well.
  std::unordered_map<std::string, uint64_t> _tp;
  for (size_t i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops[i]->Type() == "conv2d") {
      auto inputs = ops[i]->Inputs();
      auto input_keys = ops[i]->GetInputKeys();
      auto *filter = framework::GetVarValue<framework::LoDTensor>(
          input_keys[1], inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      printf("kernel size: %d\n", kernel_size);
    }
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  // most expensive first, then a trailing "total" row
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::LoDTensor>(
      framework::LoDTensor(*output_tensor));
}

W
wangliu 已提交
416 417 418 419
// Overload kept for callers that pass a block id.
//
// @param t         input tensor
// @param block_id  accepted but not consulted; the call is forwarded to
//                  Predict(const Tensor &) unchanged
// @return          whatever Predict(t) returns
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t, int block_id) {
  std::shared_ptr<framework::Tensor> result = Predict(t);
  return result;
}

template <typename Dtype, Precision P>
L
liuruilong 已提交
423
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
W
wangliu 已提交
424 425
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
W
wangliu 已提交
426
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
L
liuruilong 已提交
427 428
  if (output_tensor != nullptr) {
    Executor<Dtype, P>::Ptype *output_ptr =
L
liuruilong 已提交
429
        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
L
liuruilong 已提交
430 431 432 433 434 435 436 437
    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
    for (int j = 0; j < output_tensor->numel(); ++j) {
      result_vector.push_back(output_ptr[j]);
    }
    return result_vector;
  } else {
    DLOG << "return  empty vector";
    return {};
W
wangliu 已提交
438
  }
W
wangliu 已提交
439 440
}

441 442
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
443
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
H
hjchen2 已提交
444
                                        std::string var_name) {
445
  framework::Variable *g_feed_value = program_.scope->Var(var_name);
446 447 448 449
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
450
}
451

452 453 454
// Binds `t` to the "feed" scope variable via InjectVariable().
//
// @param t  tensor to feed
template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
  this->InjectVariable(t, "feed");
}
456

457
template <typename Dtype, Precision P>
458
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
459 460 461
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
462

Z
zhangyang 已提交
463 464 465 466 467
  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
468 469 470
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
471
}
472 473 474 475 476 477

// Runs the ops of block 0 whose index lies in [start, end).
//
// @param start  index of the first op to run
// @param end    one past the index of the last op to run; a negative value
//               means "up to and including the last op"
//
// With PADDLE_MOBILE_PROFILE defined, begin/end timestamps are recorded per
// op, but nothing is printed here.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

  if (end < 0) {
    end = static_cast<int>(ops.size());
  }
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int op_index = start; op_index < end; ++op_index) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << op_index << "  " << ops[op_index]->Type();
    ops[op_index]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}
500 501 502 503

// Runs block 0 starting at op `start`; the end of the range is whatever the
// default `end` argument of Predict_From_To() selects.
//
// @param start  index of the first op to run
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
  this->Predict_From_To(start);
}
505 506 507 508

// Runs block 0 from its first op up to, but not including, op `end`.
//
// @param end  one past the index of the last op to run
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
  const int first_op = 0;
  this->Predict_From_To(first_op, end);
}
510 511
#endif

Y
yangfei 已提交
512
#ifdef PADDLE_MOBILE_CL
L
liuruilong 已提交
513 514 515 516
// Generic definition of the OpenCL parameter loader: intentionally empty.
// The actual parsing is provided by the explicit specialization for
// Executor<GPU_CL, Precision::FP32> that follows in this file.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                    float *tensorInput, char **data) {}

Y
yangfei 已提交
517
template <>
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
    const framework::VarDesc var_desc, float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  //            int type_size = 0;
  //            switch (desc.DataType()) {
  //                case framework::VARTYPE_TYPE_FP16:
  //                    type_size = 2;
  //                    break;
  //                case framework::VARTYPE_TYPE_FP32:
  //                    type_size = 4;
  //                    memory = tensor->mutable_data<float>();
  //                    break;
  //                case framework::VARTYPE_TYPE_FP64:
  //                    type_size = 8;
  //                    break;
  //                case framework::VARTYPE_TYPE_INT32:
  //                    memory = tensor->mutable_data<int32_t>();
  //                    type_size = 4;
  //                    break;
  //                case framework::VARTYPE_TYPE_INT64:
  //                    type_size = 8;
  //                    break;
  //                case framework::VARTYPE_TYPE_BOOL:
  //                    type_size = 1;
  //                    break;
  //                default:
  //                    break;
  //            }
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}
617

Y
yangfei 已提交
618 619 620 621 622 623
// OpenCL variant of InitMemory(): one parameter file per variable.
//
// Persistable variables other than "feed"/"fetch" are read from
// "<model_path>/<variable name>" into a temporary float buffer that is handed
// to the variable's CLImage through SetTensorData(). "feed" and "fetch" only
// get a LoDTensor created. Non-persistable LoD tensors get an empty image
// initialized with the dims their CLImage currently reports.
template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        // origin_data keeps the address to free; data is the read cursor
        // that LoadMemory moves forward.
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        framework::DDim ddim = framework::make_ddim(desc.Dims());

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        // Array form of delete, as in the CPU InitMemory() which releases the
        // buffer returned by ReadFileToBuff() the same way.
        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<framework::CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const framework::TensorDesc &desc = var_desc->Tensor_desc();
          //          framework::DDim ddim = framework::make_ddim(desc.Dims());
          framework::DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}
670

Y
yangfei 已提交
671 672
// OpenCL variant of InitCombineMemory(): all parameters in one buffer.
//
// The buffer is program_.combined_params_buf when both it and
// program_.combined_params_len are set; otherwise it is read from
// program_.para_path and released again before returning. Persistable
// variables other than "feed"/"fetch" are parsed one after another into a
// temporary float buffer that is handed to the variable's CLImage; every
// non-persistable variable gets an empty image initialized with the dims its
// CLImage currently reports.
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
    // same cast chain as the CPU InitCombineMemory()
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");

  // origin_data stays at the start of the buffer so that it can be freed with
  // its real type; `data` is the read cursor that LoadMemory moves forward.
  char *data = origin_data;

  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = framework::make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<framework::CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = cl_image->dims();
        //        framework::DDim ddim = framework::make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    // Array form of delete on the original char pointer, as in the CPU path
    // which releases the buffer returned by ReadFileToBuff() the same way.
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}
Y
yangfei 已提交
732 733 734

#endif

W
wangliu 已提交
735
template class Executor<CPU, Precision::FP32>;
Y
yangfei 已提交
736

L
liuruilong 已提交
737
template class Executor<FPGA, Precision::FP32>;
W
wangliu 已提交
738

Y
yangfei 已提交
739 740 741 742 743
template class Executor<GPU_CL, Precision::FP32>;

template class Executor<GPU_MALI, Precision::FP32>;

}  // namespace framework
W
wangliu 已提交
744
}  // namespace paddle_mobile