/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <unordered_map>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/context.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"
#include "pass/memory_optimize.h"
#include "pass/model_obfuscate.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#include "pass/memory_optimize_super.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
void Executor<Device, T>::SetThreadNum(int thread_num, PowerMode power_mode) {
  CPUContext::Context()->set_thread_num(thread_num, power_mode);
}

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
#if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_FPGA_KD) && \
    !defined(PADDLE_MOBILE_CL)
  if (config_.memory_optimization_level != NoMemoryOptimization) {
    pass::MemoryOptPass()(program_desc_.get(), program_.scope.get(),
                          config_.memory_optimization_level);
  }
#endif
  // resize feed and fetch list
  // should init feed and fetch variables before infer shape
  InitFeedFetchList();
  const auto &blocks = program_desc_->Blocks();
  std::shared_ptr<BlockDesc> block_desc = blocks[0];
  std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
  for (int j = 0; j < ops.size(); ++j) {
    std::shared_ptr<OpDesc> op_desc = ops[j];
    DLOG << "create op: " << op_desc->Type();

    auto op_handler = OpRegistry<Device>::CreateOp(
        op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
        op_desc->GetAttrMap(), program_.scope.get());
    // infer shape to reshape inputs and outputs before predict,
    // but in lod mode shapes still need to be inferred at runtime
    if (!lod_mode) {
      op_handler->InferShape();
    }
    ops_of_block0_.push_back(op_handler);
  }
#ifdef PADDLE_MOBILE_FPGA_V2
  InitQuantMemory();
#endif
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
  int count = 0;
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_of_block0_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &op_handler : ops_of_block0_) {
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
    op_handler->Init();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
    ++op_index;
#endif
  }
#ifdef PADDLE_MOBILE_PROFILE
  printf("================[ op init profile ]==================\n");
  PrintProfile(profile);
#endif
}

template <typename Device, typename T>
void Executor<Device, T>::InitFeedFetchList() {
  std::unordered_map<std::string, int> feed_indices, fetch_indices;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &op_desc : block->Ops()) {
      if (op_desc->Type() == "feed") {
        std::string name = op_desc->Output("Out")[0];
        feed_indices[name] = op_desc->GetAttr("col").Get<int>();
      } else if (op_desc->Type() == "fetch") {
        std::string name = op_desc->Input("X")[0];
        fetch_indices[name] = op_desc->GetAttr("col").Get<int>();
      }
    }
  }
  feed_indices_.swap(feed_indices);
  fetch_indices_.swap(fetch_indices);

  auto *feed_var = program_.scope->Var("feed");
  auto *feed_list = feed_var->template GetMutable<framework::LoDTensorArray>();
  feed_list->resize(feed_indices_.size());

  auto *fetch_var = program_.scope->Var("fetch");
  auto *fetch_list =
      fetch_var->template GetMutable<framework::LoDTensorArray>();
  fetch_list->resize(fetch_indices_.size());
}

template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}
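// A worked instance of the dequantization above (illustrative numbers, not
// from any real model): with min_value = -1.0f and max_value = 1.0f the
// factor is (1.0 - (-1.0)) / 255.0 ≈ 0.00784, so a stored byte 0 decodes to
// -1.0, 128 to ≈ 0.0039, and 255 back to 1.0. This is plain affine uint8
// quantization: x ≈ q * factor + min_value.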

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}
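// For reference, the parameter stream consumed by LoadMemory() above has the
// following layout (this is a summary of the parsing code, not a separate
// spec):
//   uint32   version
//   uint64   lod_level, then per level: uint64 byte count + size_t entries
//   uint32   tensor version
//   int32    tensor-desc length, followed by that many bytes (skipped; the
//            dims are taken from the VarDesc instead)
//   payload  [float min, float max, uint8 data] if the model is quantified,
//            otherwise numel * sizeof(T) raw bytes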

template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }
        DLOG << "init persistable var: " << var_desc->Name();
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        auto tensor = var->template GetMutable<LoDTensor>();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        DLOG << "init no persistable var: " << var_desc->Name();
        varInputMemory(var_desc, var);
      }
    }
  }
}

template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
    if (config_.model_obfuscate_key != "") {
      auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key);
      obfuscator.convert_data(origin_data, program_.combined_params_len);
    }
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
    if (config_.model_obfuscate_key != "") {
      auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key);
      obfuscator.convert_data(origin_data, GetFileLength(program_.para_path));
    }
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();
        auto tensor = var->template GetMutable<LoDTensor>();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        DLOG << " init combine memory no persistable: " << var_desc->Name();
        varInputMemory(var_desc, var);
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

static void ClearNoPersistableTensorArray(const framework::ProgramDesc *program,
                                          framework::Scope *scope) {
  for (const auto &block : program->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      if (!var_desc->Persistable() &&
          var_desc->Type() == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
        auto var = scope->Var(var_desc->Name());
        auto array = var->template GetMutable<framework::LoDTensorArray>();
        array->resize(1);
      }
    }
  }
}

template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  if (input_tensor.dims().size() != 4) {
    return;
  }
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (!var_desc->Persistable() &&
          var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
        DLOG << "InitNoPersistableMemory var " << var_desc->Name();
        auto tensor = var->template GetMutable<LoDTensor>();
        if (tensor->IsInitialized() && tensor->dims().size() == 4) {
          DLOG << "var's tensor is initialized and dims size == 4";
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data_new<T>();
          DLOG << "var's tensor dims " << tensor_dim;
          DLOG << "var's tensor new dims " << new_dim;
        } else {
          DLOG << "var's tensor is not initialized";
        }
      }
    }
  }
}
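// Note: InitNoPersistableMemory() is reached from SetInput() when a model
// with a single feed runs in lod mode and the new input's element count drops
// below 0.9x the previous one; it rewrites the H/W of every initialized 4-D
// activation tensor to match the new input so the buffers are re-allocated at
// the smaller size.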

template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
#ifdef PADDLE_MOBILE_FPGA
  framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
#ifdef PADDLE_MOBILE_FPGA_V2
  tensor->init(type_id<int8_t>().hash_code());
#else
  tensor->init(type_id<float>().hash_code());
#endif
  return true;
#endif

  // GetMutable() is called for its side effect here: it materializes the
  // variable's payload with the proper runtime type.
  auto type = var_desc->Type();
  if (type == VARTYPE_TYPE_LOD_TENSOR) {
    auto data_type = var_desc->Tensor_desc().DataType();
    framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
  } else if (type == VARTYPE_TYPE_STEP_SCOPES) {
    std::vector<framework::Scope *> *step_scopes =
        var->template GetMutable<std::vector<framework::Scope *>>();
  } else if (type == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
    framework::LoDTensorArray *tensor_array =
        var->template GetMutable<framework::LoDTensorArray>();
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
  }
  return true;
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  PADDLE_MOBILE_ENFORCE(feed_indices_.size() != 0,
                        "We don't know which tensor should be assigned, since "
                        "no feed op was found in this model");
  PADDLE_MOBILE_ENFORCE(fetch_indices_.size() != 0,
                        "We don't know which tensor should be fetched, since "
                        "no fetch op was found in this model");
  std::string input_name = feed_indices_.begin()->first;
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, input_name);
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    std::string output_name = fetch_indices_.begin()->first;
    const auto output_tensor = GetOutput(output_name);
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}
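// A minimal caller-side sketch of the API above. The Loader name, model path
// and input shape are assumptions for illustration; only the Executor calls
// are taken from this file:
//
//   framework::Loader<CPU> loader;
//   auto program = loader.Load("/path/to/model", /*optimize=*/true);
//   PaddleMobileConfigInternal config;
//   framework::Executor<CPU, float> executor(program, config,
//                                            /*batch_size=*/1,
//                                            /*use_optimize=*/true,
//                                            /*lod_mode=*/false);
//   std::vector<float> input(1 * 3 * 224 * 224);
//   std::vector<float> output = executor.Predict(input, {1, 3, 224, 224});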

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  int index = 0;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    index = feed_indices_.find(var_name)->second;
  }
  auto *feed_var = program_.scope->Var("feed");
  framework::LoDTensor &target =
      feed_var->template GetMutable<framework::LoDTensorArray>()->at(index);

  target.Resize(input.dims());
  target.ShareDataWith(input);
  if (feed_indices_.size() == 1) {
    auto &dim = input.dims();
    if (lod_mode_ && product(dim) < 0.9 * product(input_dim_last_)) {
      InitNoPersistableMemory(target);
    }
    input_dim_has_changed_ = input_dim_last_ != dim;
    input_dim_last_ = static_cast<DDim>(dim);
  }
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  int index = 0;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    index = feed_indices_.find(var_name)->second;
  }
  auto *feed_var = program_.scope->Var("feed");
  framework::LoDTensor &target =
      feed_var->template GetMutable<framework::LoDTensorArray>()->at(index);

  target.Resize(input.dims());
  target.ShareDataWith(input);
  target.set_lod(input.lod());
  if (feed_indices_.size() == 1) {
    auto &dim = input.dims();
    if (lod_mode_ && product(dim) < 0.9 * product(input_dim_last_)) {
      InitNoPersistableMemory(target);
    }
    input_dim_has_changed_ = input_dim_last_ != dim;
    input_dim_last_ = static_cast<DDim>(dim);
  }
}

template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  const auto &iter = fetch_indices_.find(var_name);
  if (var_name == "fetch" || iter != fetch_indices_.end()) {
    int index = 0;
    if (iter != fetch_indices_.end()) {
      index = iter->second;
    }
    auto *fetch_var = program_.scope->Var("fetch");
    framework::LoDTensor &target =
        fetch_var->template GetMutable<framework::LoDTensorArray>()->at(index);

    return std::make_shared<LoDTensor>(target);
  } else {
    auto *fetch_var = program_.scope->Var(var_name);
    framework::LoDTensor *target =
        fetch_var->template GetMutable<framework::LoDTensor>();
    return std::make_shared<LoDTensor>(*target);
  }
}

#ifdef PADDLE_MOBILE_CL
template <typename Device, typename T>
const CLImage *Executor<Device, T>::GetOutputImage(
    const std::string &var_name) {
  auto var = program_.scope->FindVar(var_name);
  if (var->IsInitialized() && var->template IsType<framework::CLImage>()) {
    const CLImage *cl_image = var->template Get<framework::CLImage>();
    return cl_image;
  } else {
    return nullptr;
  }
}
#endif

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
  try {
#if _OPENMP
    omp_set_num_threads(CPUContext::Context()->get_thread_num());
#endif
    // clear all no-persistable tensor arrays, since write_to_array
    // always pushes back a new tensor
    ClearNoPersistableTensorArray(program_desc_.get(), program_.scope.get());

#ifdef PADDLE_MOBILE_PROFILE
    std::vector<ProfInfo> profile(ops_of_block0_.size());
    struct timespec ts;
    int op_index = 0;
#endif
    for (int i = 0; i < ops_of_block0_.size(); ++i) {
      auto &op_handler = ops_of_block0_[i];
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      DLOG << i << "th, "
           << "run op: " << op_handler->Type();
      if (lod_mode_ && input_dim_has_changed_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
    if (feed_indices_.size() == 1) {
      input_dim_has_changed_ = false;
    }

#ifdef PADDLE_MOBILE_PROFILE
    PrintProfile(profile);
#endif
    return PMSuccess;
  } catch (PaddleMobileException &e) {
    exception_msg_ = e.what();
    return PMException;
  } catch (std::exception &e) {
    exception_msg_ = e.what();
    return PMException;
  }
}

#ifdef PADDLE_MOBILE_PROFILE
template <typename Device, typename T>
void Executor<Device, T>::PrintProfile(
    const vector<Executor<Device, T>::ProfInfo> &profile) const {
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (this->ops_of_block0_[i]->Type() == "conv2d" ||
        this->ops_of_block0_[i]->Type() == "depthwise_conv2d") {
      auto inputs = this->ops_of_block0_[i]->Inputs();

      auto *filter = GetVarValue<ProfileTensorType>("Filter", inputs,
                                                    *(this->program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[this->ops_of_block0_[i]->Type() + "_" +
          std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[this->ops_of_block0_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
}
#endif

template <typename Device, typename T>
void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
  auto input_size = v.size();
  auto *feed_var = program_.scope->Var("feed");

  PADDLE_MOBILE_ENFORCE(input_size == feed_indices_.size(),
                        "input data number not correct");
  for (int i = 0; i < input_size; i++) {
    framework::LoDTensor &target =
        feed_var->template GetMutable<framework::LoDTensorArray>()->at(i);
    target.ShareDataWith(v[input_size - i - 1]);
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetTensorResults(
    std::vector<framework::Tensor *> *v) {
  auto *fetch_var = program_.scope->Var("fetch");
  auto output_size = fetch_indices_.size();
  for (int i = 0; i < output_size; i++) {
    framework::LoDTensor &target =
        fetch_var->template GetMutable<framework::LoDTensorArray>()->at(i);
    v->push_back(&target);
  }
}

template <typename Device, typename T>
std::string Executor<Device, T>::GetExceptionMsg() {
  return exception_msg_;
}

#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed0");
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
  auto input_size = v.size();
  int index = 0;
  // auto vars = program_.scope->VarContain("feed", &index);
  // PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
  //                    "input data number not correct");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i + index);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->external_data = v[i];
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  int index = 0;
  auto vars = program_.scope->VarContain("fetch", &index);
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output data number not correct");

  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i + index);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor->template data<float>();
  }
}

template <typename Device, typename T>
framework::Tensor *Executor<Device, T>::GetTensorByName(
    const std::string &name) {
  auto var = program_.scope->Var(name);
  return var->template GetMutable<LoDTensor>();
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block0_;

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block0_;
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}

#ifdef PADDLE_MOBILE_FPGA_V2
std::map<std::string, float> LoadQuantValFromFile(std::string filename) {
  std::map<std::string, float> quantValList;
  std::ifstream in;
  in.open(filename, std::ios::in);
  if (!in.is_open()) {
    // std::cout << "open File Failed." << std::endl;
    DLOG << "open File Failed.";
    exit(-1);
  }

  std::string line;
  while (getline(in, line)) {
    std::string splitStr = " : ";
    std::string::size_type pos;
    pos = line.find(splitStr);
    std::string subStr[2];
    subStr[0] = line.substr(0, pos);
    subStr[1] = line.substr(pos + splitStr.size(), line.size());
    quantValList.insert(std::make_pair(subStr[0], atof(subStr[1].c_str())));
  }
  in.close();
  return quantValList;
}
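// The scale file parsed above is expected to hold one "<name> : <value>"
// pair per line, for example (variable names made up for illustration):
//   conv1_weights : 0.0271
//   fc_0.w_0 : 0.0008
// Everything before " : " is taken as the variable name and everything after
// it is parsed with atof() as that variable's quantization scale.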

template <typename Device, typename T>
void Executor<Device, T>::InitQuantMemory() {
  std::string quantValFilePath;
  if (program_.combined) {
    quantValFilePath = program_.para_path;
    quantValFilePath =
        quantValFilePath.substr(0, (quantValFilePath.length() - 6));
    quantValFilePath = quantValFilePath + "scale";
  } else {
    quantValFilePath = program_.model_path + "/scale";
  }
  std::map<std::string, float> quantValList =
      LoadQuantValFromFile(quantValFilePath);
  auto ops = ops_of_block0_;
  for (int id = 0; id < ops.size(); id++) {
    auto op = ops[id];
    auto input_keys = op->GetInputKeys();
    auto inputs = op->Inputs();
    for (auto key = input_keys.begin(); key != input_keys.end(); key++) {
      auto inputs_vars = inputs[*key];
      int count = inputs_vars.size();
      for (int i = 0; i < count; i++) {
        if (inputs_vars[i] != "feed") {
          auto tensor = GetTensorByName(inputs_vars[i]);
          tensor->scale[0] = quantValList[inputs_vars[i]];
          DLOG << "input variable name : " << inputs_vars[i]
               << ", scale value : " << tensor->scale[0];
        }
      }
    }
    auto output_keys = op->GetOutKeys();
    auto outputs = op->Outputs();
    for (auto key = output_keys.begin(); key != output_keys.end(); key++) {
      auto outputs_vars = outputs[*key];
      int count = outputs_vars.size();
      for (int i = 0; i < count; i++) {
        if (outputs_vars[i] != "fetch") {
          auto tensor = GetTensorByName(outputs_vars[i]);
          tensor->scale[0] = quantValList[outputs_vars[i]];
          DLOG << "output variable name : " << outputs_vars[i]
               << ", scale value : " << tensor->scale[0];
        }
      }
    }
  }
}
#endif
#endif
#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}

template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  int index = 0;
  if (feed_indices_.find(var_name) != feed_indices_.end()) {
    index = feed_indices_.find(var_name)->second;
  }
  auto *feed_var = program_.scope->Var("feed");
  framework::LoDTensor *input_tensor =
      &(feed_var->template GetMutable<framework::LoDTensorArray>()->at(index));

  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << input_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << input_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      input_tensor->Resize(input.dims());
      input_tensor->mutable_data<float>();
      //     InitNoPersistableMemory(*input_tensor);
      pass::MemoryOptPassSuper()(program_desc_.get(), program_.scope.get(),
                                 config_.memory_optimization_level,
                                 input.dims());
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    input_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  input_tensor->ShareDataWith(input);
  if (feed_indices_.size() == 1) {
    input_dim_has_changed_ = input_dim_last_ != input.dims();
  }
  auto &dim = input.dims();
  input_dim_last_ = static_cast<DDim>(dim);
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
    if (config_.model_obfuscate_key != "") {
      auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key);
      obfuscator.convert_data(origin_data, program_.combined_params_len);
    }
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
    if (config_.model_obfuscate_key != "") {
      auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key);
      obfuscator.convert_data(origin_data, GetFileLength(program_.para_path));
    }
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data == nullptr!!!");
  float *data = reinterpret_cast<float *>(origin_data);

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<framework::LoDTensorArray>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        bool shouldResize = true;
        if (ddim.size() > 4) {
          for (int i = 0; i < ddim.size() - 4; ++i) {
            if (ddim[i] != 0 && ddim[i] != 1) {
              shouldResize = false;
              break;
            }
          }
          if (shouldResize) {
            std::vector<int64_t> temp_input_dims;
            temp_input_dims.reserve(static_cast<size_t>(4));
            for (int i = ddim.size() - 4; i < ddim.size(); ++i) {
              temp_input_dims.push_back(ddim[i]);
            }
            ddim = framework::make_ddim(temp_input_dims);
          }
        }
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    // `data` still points at the start of the buffer; LoadMemory advances
    // `origin_data` as it consumes the stream.
    delete[] reinterpret_cast<char *>(data);
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

}  // namespace framework
}  // namespace paddle_mobile