/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include "common/threadpool.h"
#endif

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

using framework::Variable;

#pragma mark - executor

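// The constructor does all setup eagerly: it instantiates every op in the
// program through OpRegistry, runs InferShape ahead of time when the model
// carries no lod inputs, loads parameters (per-file or combined), and calls
// Init() on each op so that Predict() only has to Run() them.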
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                             const bool use_optimize, const bool loddable)
    : program_(p),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      loddable_(loddable) {
  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);
  to_predict_program_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                        "to_predict_program_ == NULL!");
  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
      to_predict_program_->Blocks();

  DLOG << "executor in loddable mode: " << loddable_;
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
      DLOG << "create op: " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
      // infer shapes here to reshape tensors before prediction;
      // lod tensors must instead be reshaped at runtime
      if (!loddable_) {
        op_base->InferShape();
      }
      ops_of_block_[*block_desc.get()].push_back(op_base);
    }
  }
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  int i = 0;
  auto &ops = ops_of_block_[*to_predict_block.get()];
  for (const auto &op : ops) {
    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
    op->Init();
  }
}

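// LoadMemInternal copies tensor->numel() elements of Dtype from the
// parameter stream into the tensor and advances the stream cursor. The
// disabled branch is a uint8 dequantization path (min/max scaling) that is
// slated to move into operator initialization.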
template <typename Dtype>
void LoadMemInternal(void **data, framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  Dtype *tensor_data = tensor->mutable_data<Dtype>();
  if (0) {
    // TODO(hjchen2) should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
    *data_buf += size * sizeof(Dtype);
  }
}

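// LoadMemory parses one serialized variable from the stream pointed to by
// *data, advancing the cursor past everything it consumes. The layout:
//   [uint32_t version]
//   [uint64_t lod_level] then lod_level size-prefixed arrays of size_t
//   [uint32_t tensor version]
//   [int32_t  tensor desc size][tensor desc bytes]  (skipped here; dims
//    are taken from the VarDesc instead)
//   [raw tensor data, numel * sizeof(dtype)]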
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(
    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
    framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case framework::VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case framework::VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case framework::VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

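// InitMemory loads every persistable variable (except feed/fetch) from its
// own file under program_.model_path, and allocates buffers for
// non-persistable lod tensors via varInputMemory.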
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

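// InitCombineMemory is the counterpart of InitMemory for combined-param
// models: all parameters live in a single buffer, either supplied in memory
// (combined_params_buf) or read from program_.para_path; self_alloc records
// whether this function owns that buffer.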
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

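// varInputMemory allocates the backing buffer of a non-persistable input
// tensor according to its declared data type and reports whether that type
// is one the executor handles.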
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
    framework::LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case framework::VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case framework::VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case framework::VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case framework::VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
                       (type == framework::VARTYPE_TYPE_INT8) ||
                       (type == framework::VARTYPE_TYPE_INT32) ||
                       (type == framework::VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

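// Predict shares the input tensor with the "feed" variable, runs every op
// of block 0 in order, and returns a copy of the first output of the last
// op. With PADDLE_MOBILE_PROFILE defined, per-op wall time is aggregated by
// op type (conv2d is further split by kernel size) and printed.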
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  DLOG << "feed_tensor dim: " << feed_tensor->dims();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops[i]->Type() == "conv2d") {
      auto inputs = ops[i]->Inputs();
      auto *filter = framework::GetVarValue<framework::LoDTensor>(
          "Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}

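// PredictLod is the lod-tensor variant of Predict: it additionally copies
// the input's lod, and when the executor was constructed in loddable mode
// it re-runs InferShape before each op so shapes can track the runtime lod.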
template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::LoDTensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  feed_tensor->set_lod(t.lod());

  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);

  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (loddable_) {
      ops[i]->InferShape();
    }
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();

  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops[i]->Type() == "conv2d") {
      auto inputs = ops[i]->Inputs();
      auto input_keys = ops[i]->GetInputKeys();
      auto *filter = framework::GetVarValue<framework::LoDTensor>(
          input_keys[1], inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      printf("kernel size: %d\n", kernel_size);
    }
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::LoDTensor>(
      framework::LoDTensor(*output_tensor));
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t, int block_id) {
  return Predict(t);
}

template <typename Dtype, Precision P>
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
  if (output_tensor != nullptr) {
    Executor<Dtype, P>::Ptype *output_ptr =
        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
    for (int j = 0; j < output_tensor->numel(); ++j) {
      result_vector.push_back(output_ptr[j]);
    }
    return result_vector;
  } else {
    DLOG << "return  empty vector";
    return {};
  }
}

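// The FPGA build exposes finer-grained control: feed data into any scope
// variable, run an arbitrary [start, end) slice of the op list, and fetch
// an intermediate op's first output, which allows stage-by-stage execution
// on the device.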
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
                                        std::string var_name) {
  framework::Variable *g_feed_value = program_.scope->Var(var_name);
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                    float *tensorInput, char **data) {}

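// The GPU_CL specialization decodes one variable's payload into the
// host-side float buffer tensorInput, which is later uploaded to an OpenCL
// image. It mirrors the CPU LoadMemory layout, dequantizes uint8 weights
// when program_.quantification is set, and flushes magnitudes below 1e-30
// to zero.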
template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
    const framework::VarDesc var_desc, float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  //            int type_size = 0;
  //            switch (desc.DataType()) {
  //                case framework::VARTYPE_TYPE_FP16:
  //                    type_size = 2;
  //                    break;
  //                case framework::VARTYPE_TYPE_FP32:
  //                    type_size = 4;
  //                    memory = tensor->mutable_data<float>();
  //                    break;
  //                case framework::VARTYPE_TYPE_FP64:
  //                    type_size = 8;
  //                    break;
  //                case framework::VARTYPE_TYPE_INT32:
  //                    memory = tensor->mutable_data<int32_t>();
  //                    type_size = 4;
  //                    break;
  //                case framework::VARTYPE_TYPE_INT64:
  //                    type_size = 8;
  //                    break;
  //                case framework::VARTYPE_TYPE_BOOL:
  //                    type_size = 1;
  //                    break;
  //                default:
  //                    break;
  //            }
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

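// GPU_CL InitMemory loads each persistable variable into a host float
// buffer and hands it to CLImage::SetTensorData for later upload, while
// non-persistable lod-tensor variables get an empty CL image sized from the
// image's current dims.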
template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        framework::DDim ddim = framework::make_ddim(desc.Dims());

        // the image has not been initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<framework::CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const framework::TensorDesc &desc = var_desc->Tensor_desc();
          framework::DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

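// Combined-parameter variant for GPU_CL: one buffer (external or read from
// program_.para_path) is decoded variable by variable into host floats and
// uploaded via SetTensorData.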
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  // keep the base pointer; LoadMemory advances origin_data below
  float *data = reinterpret_cast<float *>(origin_data);

  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<framework::CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = framework::make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // the image has not been initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<framework::CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = cl_image->dims();
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] reinterpret_cast<char *>(data);
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}
yangfei 已提交
728 729 730

#endif

W
wangliu 已提交
731
template class Executor<CPU, Precision::FP32>;
Y
yangfei 已提交
732

L
liuruilong 已提交
733
template class Executor<FPGA, Precision::FP32>;
W
wangliu 已提交
734

Y
yangfei 已提交
735 736 737 738 739
template class Executor<GPU_CL, Precision::FP32>;

template class Executor<GPU_MALI, Precision::FP32>;

}  // namespace framework
W
wangliu 已提交
740
}  // namespace paddle_mobile