cxx_api_impl.cc 9.3 KB
Newer Older
Y
Yan Chunwei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/api/cxx_api.h"
16 17
#include <memory>
#include <mutex>  //NOLINT
18
#include <string>
Y
Yan Chunwei 已提交
19
#include "lite/api/paddle_api.h"
20
#include "lite/core/device_info.h"
21
#include "lite/core/version.h"
22 23 24 25

#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
J
jiweibo 已提交
26 27 28
#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/cuda_utils.h"
#endif
29
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
30
    !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
31 32 33
#include <omp.h>
#include "lite/backends/x86/mklml.h"
#endif
Y
Yan Chunwei 已提交
34 35 36 37
namespace paddle {
namespace lite {

// Initializes this predictor from `config`.
//
// A copy of `config` is kept in config_ because Clone() and InitCudaEnv()
// read it later. The steps are:
//   1. call config_.check_valid();
//   2. when built with CUDA and kCUDA is one of the valid places, set up the
//      CUDA environment (this may append passes to the pass list);
//   3. for a fresh predictor, build the raw predictor; for a cloned one, only
//      call PrepareFeedFetch() on the already-cloned raw predictor;
//   4. remember the power mode / thread count and apply the NPU and x86
//      math-library settings when those backends are compiled in.
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
  config_ = config;
  config_.check_valid();
  auto places = config.valid_places();
  std::vector<std::string> passes = config.get_passes_internal();
#ifdef LITE_WITH_CUDA
  // if kCUDA is included in valid places, it should be initialized first,
  // otherwise skip this step.
  for (auto &p : places) {
    if (p.target == TARGET(kCUDA)) {
      InitCudaEnv(&passes);
      break;
    }
  }
#endif

  if (!status_is_cloned_) {
#ifdef LITE_WITH_MLU
    Env<TARGET(kMLU)>::Init();
    lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
                                          config.mlu_core_number(),
                                          config.mlu_input_layout(),
                                          config.mlu_firstconv_param());
#endif  // LITE_WITH_MLU
    // A model directory containing this marker string switches an
    // OpenCL-first configuration to the layout-preprocess pass only.
    auto use_layout_preprocess_pass =
        config.model_dir().find("OPENCL_PRE_PRECESS");
    VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
    // Guard against an empty place list before looking at places[0].
    if (!places.empty() && places[0].target == TARGET(kOpenCL) &&
        use_layout_preprocess_pass != std::string::npos) {
      passes = {"type_layout_cast_preprocess_pass"};
      VLOG(1) << "add pass:" << passes[0];
    }
    raw_predictor_->Build(config, places, passes);
  } else {
    // The null check has to come before the first dereference, otherwise it
    // can never report anything.
    CHECK(raw_predictor_) << "The Predictor can not be nullptr in Clone mode.";
    raw_predictor_->PrepareFeedFetch();
  }

  mode_ = config.power_mode();
  threads_ = config.threads();
#ifdef LITE_WITH_NPU
  // Store the model-level configuration into scope for kernels, and use
  // exe_scope to store the execution-level configuration
  Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
      raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
// This guard mirrors the one around the <omp.h> / mklml.h includes at the top
// of this file, so the calls below are only compiled when their declarations
// are.
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
    !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
  int num_threads = config.x86_math_library_num_threads();
  // Never hand a thread count below 1 to MKL / OpenMP.
  int real_num_threads = num_threads > 1 ? num_threads : 1;
  paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
  omp_set_num_threads(real_num_threads);
  VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the "
             "number of threads is:"
          << real_num_threads;
#endif
}

J
jiweibo 已提交
95
#ifdef LITE_WITH_CUDA
96
void CxxPaddleApiImpl::InitCudaEnv(std::vector<std::string> *passes) {
J
jiweibo 已提交
97 98 99
  Env<TARGET(kCUDA)>::Init();

  // init two streams for each predictor.
J
update  
jiweibo 已提交
100 101
  if (config_.cuda_exec_stream()) {
    cuda_exec_stream_.reset(
J
jiweibo 已提交
102
        new lite::CudaStreamGuard(*config_.cuda_exec_stream()));
J
jiweibo 已提交
103
  } else {
J
jiweibo 已提交
104
    cuda_exec_stream_.reset(new lite::CudaStreamGuard());
J
jiweibo 已提交
105
  }
J
update  
jiweibo 已提交
106
  if (config_.cuda_io_stream()) {
J
jiweibo 已提交
107
    cuda_io_stream_.reset(new lite::CudaStreamGuard(*config_.cuda_io_stream()));
J
jiweibo 已提交
108
  } else {
J
jiweibo 已提交
109
    cuda_io_stream_.reset(new lite::CudaStreamGuard());
J
jiweibo 已提交
110 111
  }

J
update  
jiweibo 已提交
112 113
  raw_predictor_->set_cuda_exec_stream(cuda_exec_stream_->stream());
  raw_predictor_->set_cuda_io_stream(cuda_io_stream_->stream());
J
jiweibo 已提交
114 115

  // init sync events.
J
update  
jiweibo 已提交
116 117 118
  if (config_.cuda_use_multi_stream()) {
    cuda_use_multi_stream_ = true;
    raw_predictor_->set_cuda_use_multi_stream(cuda_use_multi_stream_);
J
jiweibo 已提交
119 120 121 122 123
    passes->push_back("multi_stream_analysis_pass");
    VLOG(3) << "add pass: " << (*passes)[0];
    Env<TargetType::kCUDA>::Devs &devs = Env<TargetType::kCUDA>::Global();
    int dev_id = TargetWrapperCuda::GetCurDevice();
    for (size_t i = 0; i < lite::kMaxStream; ++i) {
J
update  
jiweibo 已提交
124
      cuda_exec_streams_.emplace_back(devs[dev_id].exec_streams()[i]);
J
jiweibo 已提交
125 126
      cudaEvent_t out_event;
      TargetWrapperCuda::CreateEventWithFlags(&out_event);
J
update  
jiweibo 已提交
127
      cuda_output_events_.push_back(out_event);
J
jiweibo 已提交
128 129 130 131
    }
  } else {
    cudaEvent_t out_event;
    TargetWrapperCuda::CreateEventWithFlags(&out_event);
J
update  
jiweibo 已提交
132
    cuda_output_events_.push_back(out_event);
J
jiweibo 已提交
133
  }
J
update  
jiweibo 已提交
134
  TargetWrapperCuda::CreateEventWithFlags(&cuda_input_event_);
J
jiweibo 已提交
135 136
}

J
update  
jiweibo 已提交
137 138 139
void CxxPaddleApiImpl::SyncCudaInputs() {
  TargetWrapperCuda::RecordEvent(cuda_input_event_, cuda_io_stream_->stream());
  if (cuda_use_multi_stream_) {
J
jiweibo 已提交
140
    for (int i = 0; i < lite::kMaxStream; ++i) {
J
update  
jiweibo 已提交
141 142
      TargetWrapperCuda::StreamSync(cuda_exec_streams_[i].stream(),
                                    cuda_input_event_);
J
jiweibo 已提交
143 144
    }
  } else {
J
update  
jiweibo 已提交
145 146
    TargetWrapperCuda::StreamSync(cuda_exec_stream_->stream(),
                                  cuda_input_event_);
J
jiweibo 已提交
147 148 149
  }
}

J
update  
jiweibo 已提交
150 151 152 153 154 155 156
void CxxPaddleApiImpl::SyncCudaOutputs() {
  if (cuda_use_multi_stream_) {
    for (size_t i = 0; i < cuda_output_events_.size(); ++i) {
      TargetWrapperCuda::RecordEvent(cuda_output_events_[i],
                                     cuda_exec_streams_[i].stream());
      TargetWrapperCuda::StreamSync(cuda_io_stream_->stream(),
                                    cuda_output_events_[i]);
J
jiweibo 已提交
157 158
    }
  } else {
J
update  
jiweibo 已提交
159 160 161 162
    TargetWrapperCuda::RecordEvent(cuda_output_events_[0],
                                   cuda_exec_stream_->stream());
    TargetWrapperCuda::StreamSync(cuda_io_stream_->stream(),
                                  cuda_output_events_[0]);
J
jiweibo 已提交
163 164 165 166
  }
}
#endif

Y
Yan Chunwei 已提交
167
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
168
  auto *x = raw_predictor_->GetInput(i);
169
#ifdef LITE_WITH_CUDA
J
jiweibo 已提交
170
  return std::unique_ptr<lite_api::Tensor>(
J
update  
jiweibo 已提交
171
      new lite_api::Tensor(x, cuda_io_stream_->stream()));
172 173 174
#else
  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
#endif
Y
Yan Chunwei 已提交
175 176 177 178
}

std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
    int i) const {
179
  const auto *x = raw_predictor_->GetOutput(i);
180
#ifdef LITE_WITH_CUDA
J
jiweibo 已提交
181
  return std::unique_ptr<lite_api::Tensor>(
J
update  
jiweibo 已提交
182
      new lite_api::Tensor(x, cuda_io_stream_->stream()));
183 184 185
#else
  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
#endif
Y
Yan Chunwei 已提交
186 187
}

S
sangoly 已提交
188
std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
189
  return raw_predictor_->GetInputNames();
190 191
}

192
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
193
  return raw_predictor_->GetParamNames();
194 195
}

S
sangoly 已提交
196
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
197
  return raw_predictor_->GetOutputNames();
198 199
}

T
TianXiaogang 已提交
200 201 202 203
void CxxPaddleApiImpl::Run() {
#ifdef LITE_WITH_ARM
  lite::DeviceInfo::Global().SetRunMode(mode_, threads_);
#endif
J
jiweibo 已提交
204
#ifdef LITE_WITH_CUDA
J
update  
jiweibo 已提交
205
  SyncCudaInputs();
J
jiweibo 已提交
206 207
#endif

208
  raw_predictor_->Run();
J
jiweibo 已提交
209 210

#ifdef LITE_WITH_CUDA
J
update  
jiweibo 已提交
211
  SyncCudaOutputs();
J
jiweibo 已提交
212
#endif
T
TianXiaogang 已提交
213
}
Y
Yan Chunwei 已提交
214

215 216
std::shared_ptr<lite_api::PaddlePredictor> CxxPaddleApiImpl::Clone() {
  std::lock_guard<std::mutex> lock(mutex_);
217 218 219 220 221 222 223 224 225 226 227
  auto predictor =
      std::make_shared<lite::CxxPaddleApiImpl>(raw_predictor_->Clone());
  predictor->Init(config_);
  return predictor;
}

std::shared_ptr<lite_api::PaddlePredictor> CxxPaddleApiImpl::Clone(
    const std::vector<std::string> &var_names) {
  std::lock_guard<std::mutex> lock(mutex_);
  auto predictor = std::make_shared<lite::CxxPaddleApiImpl>(
      raw_predictor_->Clone(var_names));
228 229 230 231
  predictor->Init(config_);
  return predictor;
}

232 233
// Returns whatever version() reports.
std::string CxxPaddleApiImpl::GetVersion() const {
  return version();
}

Y
Yan Chunwei 已提交
234 235
std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
    const std::string &name) const {
236
  auto *x = raw_predictor_->GetTensor(name);
Y
Yan Chunwei 已提交
237 238 239
  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
}

240 241 242
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
    const std::string &name) {
  return std::unique_ptr<lite_api::Tensor>(
243
      new lite_api::Tensor(raw_predictor_->GetMutableTensor(name)));
244 245
}

246 247 248
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
    const std::string &name) {
  return std::unique_ptr<lite_api::Tensor>(
249
      new lite_api::Tensor(raw_predictor_->GetInputByName(name)));
250 251
}

Y
Yan Chunwei 已提交
252
void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
253 254
                                          lite_api::LiteModelType model_type,
                                          bool record_info) {
255
  raw_predictor_->SaveModel(model_dir, model_type, record_info);
Y
Yan Chunwei 已提交
256 257
}

J
jiweibo 已提交
258
// In CUDA builds, destroys the input event and every output event held by
// this predictor. Does nothing in other builds.
CxxPaddleApiImpl::~CxxPaddleApiImpl() {
#ifdef LITE_WITH_CUDA
  TargetWrapperCuda::DestroyEvent(cuda_input_event_);
  for (auto &out_event : cuda_output_events_) {
    TargetWrapperCuda::DestroyEvent(out_event);
  }
#endif
}

Y
Yan Chunwei 已提交
267 268 269 270 271 272 273 274 275 276 277 278 279 280
}  // namespace lite

namespace lite_api {

// CxxConfig specialization of the predictor factory: default-constructs a
// CxxPaddleApiImpl, initializes it with `config` and returns it as a
// PaddlePredictor.
template <>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
    const CxxConfig &config) {
  std::shared_ptr<lite::CxxPaddleApiImpl> predictor =
      std::make_shared<lite::CxxPaddleApiImpl>();
  predictor->Init(config);
  return predictor;
}

}  // namespace lite_api
}  // namespace paddle