/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <cstring>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

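// Construct an executor for the given program: register the batch size in
// the scope, create an operator for every OpDesc in block 0 (inferring
// shapes ahead of time unless running in lod mode), load the parameters,
// and call Init() on every operator.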
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();

  std::shared_ptr<BlockDesc> block_desc = blocks[0];
  std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
  for (size_t j = 0; j < ops.size(); ++j) {
    std::shared_ptr<OpDesc> op_desc = ops[j];
    DLOG << "create op: " << op_desc->Type();

    auto op_handler = OpRegistry<Device>::CreateOp(
        op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
        op_desc->GetAttrMap(), program_.scope);
    // infer shape to reshape inputs and outputs before predict;
    // in lod mode, shapes must still be inferred again at runtime
    if (!lod_mode) {
      op_handler->InferShape();
    }
    ops_of_block0_.push_back(op_handler);
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

  int count = 0;
  for (auto &op_handler : ops_of_block0_) {
    DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
    op_handler->Init();
  }
}

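// Copy tensor->numel() elements of type T from the serialized buffer into
// the tensor. When quant_uint8 is set, the buffer holds a (min, max) float
// pair followed by uint8 values that are dequantized on the fly.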
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

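// Parse one serialized variable: version, LoD information, tensor version,
// tensor desc (skipped), then the raw data, dispatching on the declared
// data type. *data is advanced past everything consumed.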
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

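// Separate-parameter mode: read every persistable variable from its own
// file under model_path; non-persistable LoD tensors only get their input
// memory prepared.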
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

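// Combined-parameter mode: all parameters live in a single buffer, either
// supplied by the caller or read from para_path.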
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

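// Resize every non-persistable LoD tensor (and the fetch output) to match
// the height and width of a new input, re-allocating its memory.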
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

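// Allocate memory for an input variable according to its declared data
// type; FPGA builds simply initialize the tensor as float.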
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

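// Run all operators of block 0 in order, re-inferring shapes first when in
// lod mode. With PADDLE_MOBILE_PROFILE defined, per-op wall time is
// collected and a summary table is printed (conv ops keyed by kernel size).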
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_of_block0_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &op_handler : ops_of_block0_) {
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (lod_mode_) {
      op_handler->InferShape();
    }
    op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
    ++op_index;
#endif
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_of_block0_[i]->Type() == "conv2d" ||
        ops_of_block0_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_of_block0_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_of_block0_[i]->Type() + "_" + std::to_string(kernel_size)] +=
          timeCost;
    } else {
      _tp[ops_of_block0_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

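// Return a copy of the named output tensor from the scope.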
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

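// Return the first output of the id-th operator, or of the last operator
// when id < 0.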
template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block0_;

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block0_;
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}
template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  auto &dim = input.dims();
  input_dim_last_ = static_cast<DDim>(dim);
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

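// GPU_CL variant: walk the same serialized layout as the CPU LoadMemory,
// but decode the weights into a host-side float buffer (flushing tiny
// values to zero) for a later upload into a CLImage.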
template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

613
  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

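// GPU_CL variant of InitMemory: persistable variables are decoded into a
// host float buffer and attached to a CLImage; non-persistable LoD tensors
// get an empty image of their current dims.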
template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

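// GPU_CL variant of InitCombineMemory: decode the combined parameter
// buffer into CLImages; non-persistable variables get empty images.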
template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  char *data = origin_data;  // keep the buffer start so it can be freed below

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile