/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <cstring>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

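// The Executor constructor prepares everything needed for inference: it
// records the batch size in the scope, selects the optimized or original
// ProgramDesc, creates an operator handler for every OpDesc in every block
// (running InferShape up front unless lod_mode defers it to Predict),
// loads the parameters, and finally calls Init() on each operator.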
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
                              const bool use_optimize, const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();
      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // infer shape to reshape inputs and outputs before predict,
      // but in lod mode shapes still need to be inferred at runtime
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}

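// Copies one tensor's raw data from the serialized parameter buffer into
// `tensor` and advances the buffer pointer. With quant_uint8 set, the buffer
// holds a float min/max pair followed by uint8 values that are dequantized
// back to the element type.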
template <typename Device>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  Device *tensor_data = tensor->mutable_data<Device>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(Device));
    *data_buf += size * sizeof(Device);
  }
}

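// Deserializes a single variable stored in the Paddle parameter format:
// a version word, the LoD information, a tensor version, a serialized
// TensorDesc (skipped here), and finally the raw tensor data, which is
// dispatched to LoadMemInternal according to the declared data type.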
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

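// Loads every persistable variable (except the feed/fetch placeholders) from
// its own file under model_path and allocates memory for the remaining
// LoDTensor variables.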
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

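// Same as InitMemory, but all parameters live in a single combined buffer:
// either a caller-supplied memory region (combined_params_buf) or one file
// read from para_path, which is consumed sequentially while loading every
// persistable variable.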
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

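// Allocates backing memory for a non-persistable LoDTensor according to the
// data type recorded in its VarDesc; unsupported types trigger an enforce
// failure.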
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

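// Convenience overloads: feed a set of named input tensors (with or without
// LoD) into the scope, then run the whole program.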
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

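// Single-shot prediction on a flat input vector: the data is wrapped into the
// "feed" tensor with the given dims and the "fetch" output is copied back out
// as a flat vector.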
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

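// SetInput shares the caller's tensor data with the named variable in the
// scope; the LoDTensor overload also carries the LoD over.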
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

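// Core prediction loop: runs every operator of every block in order,
// re-running InferShape per op in lod mode. When PADDLE_MOBILE_PROFILE is
// defined, per-op wall-clock times are accumulated and printed as a summary
// table, with conv ops broken down by kernel size.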
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

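// Returns a copy of the named output variable as a LoDTensor.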
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

#ifdef PADDLE_MOBILE_FPGA
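// FPGA-only helpers: inject data directly into a scope variable, fetch an
// intermediate result by operator index, and run an arbitrary [start, end)
// slice of the operator list.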
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
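// OpenCL specializations: parameters are parsed into host-side float buffers
// and then staged into CLImage objects instead of plain LoDTensors.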
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

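// OpenCL InitMemory: persistable variables are read from disk into a
// temporary float buffer and handed to their CLImage, while non-persistable
// LoDTensor variables get an empty CLImage of the matching shape.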
template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // the CLImage has not been initialized yet; stage the host data first
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

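// OpenCL InitCombineMemory: walks the single combined parameter buffer,
// staging each persistable variable into its CLImage and creating empty
// CLImages for everything else.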
template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  char *data = origin_data;

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        // the CLImage has not been initialized yet; stage the host data first
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

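// Explicit template instantiations for the supported backends.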
template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile