Commit 84bdddb2 by jiweibo

reorganize stream. test=develop

Parent: 07ae2599
@@ -349,6 +349,11 @@ void Predictor::GenRuntimeProgram() {
   program_ = optimizer_.GenRuntimeProgram();
   CHECK_EQ(exec_scope_, program_->exec_scope());
   program_generated_ = true;
+#ifdef LITE_WITH_CUDA
+  if (!multi_stream_) {
+    program_->UpdateContext(exec_stream_, io_stream_);
+  }
+#endif
 }
 
 const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
......
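Note: Predictor::GenRuntimeProgram() only pushes the single exec/io stream pair into the kernels when multi-stream mode is off; in multi-stream mode the per-kernel streams are presumably assigned by the multi_stream_analysis_pass registered in CudaEnvInit() below, and must not be overwritten here.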
@@ -20,12 +20,17 @@
 #include <utility>
 #include <vector>
 #include "lite/api/paddle_api.h"
+#include "lite/core/device_info.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/optimizer.h"
 #include "lite/core/program.h"
 #include "lite/core/types.h"
 #include "lite/model_parser/model_parser.h"
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
 
 namespace paddle {
 namespace lite {
@@ -56,7 +61,9 @@ class LITE_API Predictor {
       const std::vector<std::string>& var_names = {})
       : program_desc_(desc), scope_(root) {
     Program program(*desc.get(), scope_, valid_places, var_names);
-    optimizer_ = Optimizer(std::move(program), valid_places);
+    std::vector<std::string> passes{};
+    // TODO(wilber): rethink a new way to associate config and passes.
+    optimizer_ = Optimizer(std::move(program), valid_places, passes);
     exec_scope_ = optimizer_.exec_scope();
     valid_places_ = valid_places;
   }
@@ -146,14 +153,23 @@ class LITE_API Predictor {
                  bool record_info = false);
   void SaveOpKernelInfo(const std::string& model_dir);
 
   // #ifdef LITE_WITH_TRAIN
   //  void Run(const std::vector<framework::Tensor>& tensors) {
   //    FeedVars(tensors);
   //    program_->Run();
   //  }
   //  void FeedVars(const std::vector<framework::Tensor>& tensors);
   // #endif
 
+#ifdef LITE_WITH_CUDA
+  void SetMultiStream(bool multi_stream) { multi_stream_ = multi_stream; }
+  bool multi_stream() { return multi_stream_; }
+  void SetExecStream(cudaStream_t* stream) { exec_stream_ = stream; }
+  void SetIoStream(cudaStream_t* stream) { io_stream_ = stream; }
+  const cudaStream_t& exec_stream() { return *exec_stream_; }
+  const cudaStream_t& io_stream() { return *io_stream_; }
+#endif
+
  private:
   Optimizer optimizer_;
@@ -165,6 +181,11 @@ class LITE_API Predictor {
   std::vector<std::string> input_names_;
   std::vector<std::string> output_names_;
   std::vector<Place> valid_places_;
+#ifdef LITE_WITH_CUDA
+  bool multi_stream_{false};
+  cudaStream_t* io_stream_;
+  cudaStream_t* exec_stream_;
+#endif
 };
 
 class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
@@ -178,6 +199,8 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
     status_is_cloned_ = true;
   }
 
+  ~CxxPaddleApiImpl();
+
   /// Create a new predictor from a config.
   void Init(const lite_api::CxxConfig& config);
@@ -216,11 +239,31 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
       lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
       bool record_info = false) override;
 
+ private:
+#ifdef LITE_WITH_CUDA
+  // CUDA-related environment initialization, including setting stream
+  // pointers, initializing synchronization events, setting predictor_id, etc.
+  void CudaEnvInit(std::vector<std::string>* passes);
+  // Due to the asynchronous nature of CUDA kernel execution, synchronization
+  // is required after setting the inputs and before reading the outputs.
+  void InputSync();
+  void OutputSync();
+#endif
+
  private:
   std::shared_ptr<Predictor> raw_predictor_;
   lite_api::CxxConfig config_;
   std::mutex mutex_;
   bool status_is_cloned_;
+#ifdef LITE_WITH_CUDA
+  bool multi_stream_{false};
+  cudaStream_t* io_stream_;
+  cudaStream_t* exec_stream_;
+  cudaEvent_t input_event_;
+  std::vector<cudaEvent_t> output_events_;
+  // Only used in multi exec stream mode.
+  std::vector<cudaStream_t*> exec_streams_;
+#endif
 };
 
/*
......
@@ -23,7 +23,9 @@
 #ifndef LITE_ON_TINY_PUBLISH
 #include "lite/api/paddle_use_passes.h"
 #endif
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
 #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
     !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
 #include <omp.h>
@@ -34,7 +36,6 @@ namespace lite {
 void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   config_ = config;
-  if (!status_is_cloned_) {
   auto places = config.valid_places();
   std::vector<std::string> passes = config.get_passes_internal();
 #ifdef LITE_WITH_CUDA
@@ -42,15 +43,13 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   // otherwise skip this step.
   for (auto &p : places) {
     if (p.target == TARGET(kCUDA)) {
-      Env<TARGET(kCUDA)>::Init();
-      if (config_.multi_stream()) {
-        passes = {"multi_stream_analysis_pass"};
-        VLOG(3) << "add pass: " << passes[0];
-      }
+      CudaEnvInit(&passes);
       break;
     }
   }
 #endif
+  if (!status_is_cloned_) {
 #ifdef LITE_WITH_MLU
     Env<TARGET(kMLU)>::Init();
     lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
@@ -73,6 +72,7 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
     raw_predictor_->PrepareFeedFetch();
     CHECK(raw_predictor_) << "The Predictor can not be nullptr in Clone mode.";
   }
+
   mode_ = config.power_mode();
   threads_ = config.threads();
 #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
@@ -87,15 +87,83 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #endif
 }
 
+#ifdef LITE_WITH_CUDA
+void CxxPaddleApiImpl::CudaEnvInit(std::vector<std::string> *passes) {
+  Env<TARGET(kCUDA)>::Init();
+  // init two streams for each predictor.
+  if (config_.exec_stream()) {
+    exec_stream_ = config_.exec_stream();
+  } else {
+    exec_stream_ = new cudaStream_t();
+    TargetWrapperCuda::CreateStream(exec_stream_);
+  }
+  if (config_.io_stream()) {
+    io_stream_ = config_.io_stream();
+  } else {
+    io_stream_ = new cudaStream_t();
+    TargetWrapperCuda::CreateStream(io_stream_);
+  }
+  raw_predictor_->SetExecStream(exec_stream_);
+  raw_predictor_->SetIoStream(io_stream_);
+
+  // init sync events.
+  if (config_.multi_stream()) {
+    multi_stream_ = true;
+    raw_predictor_->SetMultiStream(multi_stream_);
+    passes->push_back("multi_stream_analysis_pass");
+    VLOG(3) << "add pass: " << (*passes)[0];
+    Env<TargetType::kCUDA>::Devs &devs = Env<TargetType::kCUDA>::Global();
+    int dev_id = TargetWrapperCuda::GetCurDevice();
+    for (size_t i = 0; i < lite::kMaxStream; ++i) {
+      exec_streams_.push_back(
+          const_cast<cudaStream_t *>(&devs[dev_id].exec_streams()[i]));
+      cudaEvent_t out_event;
+      TargetWrapperCuda::CreateEventWithFlags(&out_event);
+      output_events_.push_back(out_event);
+    }
+  } else {
+    cudaEvent_t out_event;
+    TargetWrapperCuda::CreateEventWithFlags(&out_event);
+    output_events_.push_back(out_event);
+  }
+  TargetWrapperCuda::CreateEventWithFlags(&input_event_);
+}
+
+void CxxPaddleApiImpl::InputSync() {
+  TargetWrapperCuda::RecordEvent(input_event_, *io_stream_);
+  if (multi_stream_) {
+    for (int i = 0; i < lite::kMaxStream; ++i) {
+      TargetWrapperCuda::StreamSync(*exec_streams_[i], input_event_);
+    }
+  } else {
+    TargetWrapperCuda::StreamSync(*exec_stream_, input_event_);
+  }
+}
+
+void CxxPaddleApiImpl::OutputSync() {
+  if (multi_stream_) {
+    for (size_t i = 0; i < output_events_.size(); ++i) {
+      TargetWrapperCuda::RecordEvent(output_events_[i], *exec_streams_[i]);
+      TargetWrapperCuda::StreamSync(*io_stream_, output_events_[i]);
+    }
+  } else {
+    TargetWrapperCuda::RecordEvent(output_events_[0], *exec_stream_);
+    TargetWrapperCuda::StreamSync(*io_stream_, output_events_[0]);
+  }
+}
+#endif
+
 std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
   auto *x = raw_predictor_->GetInput(i);
-  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
+  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x, io_stream_));
 }
 
 std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
     int i) const {
   const auto *x = raw_predictor_->GetOutput(i);
-  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
+  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x, io_stream_));
 }
 
 std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
@@ -114,7 +182,15 @@ void CxxPaddleApiImpl::Run() {
 #ifdef LITE_WITH_ARM
   lite::DeviceInfo::Global().SetRunMode(mode_, threads_);
 #endif
+#ifdef LITE_WITH_CUDA
+  InputSync();
+#endif
+
   raw_predictor_->Run();
+
+#ifdef LITE_WITH_CUDA
+  OutputSync();
+#endif
 }
 
 std::shared_ptr<lite_api::PaddlePredictor> CxxPaddleApiImpl::Clone() {
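For context, the block below is a minimal standalone sketch (not part of this commit) of the event-based ordering that InputSync() and OutputSync() establish around Run() in the single exec stream case. It uses the raw CUDA runtime API rather than TargetWrapperCuda, and the function name is illustrative only.

#include <cuda_runtime.h>

// Sketch: order io-stream copies around exec-stream kernels with one event
// on each side, mirroring input_event_ / output_events_[0] above.
void OrderedRun(cudaStream_t io_stream, cudaStream_t exec_stream) {
  cudaEvent_t input_event, output_event;
  cudaEventCreateWithFlags(&input_event, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&output_event, cudaEventDisableTiming);

  // ... H2D input copies are issued on io_stream (Tensor::CopyFromCpu) ...

  // InputSync(): exec_stream may not start until the input copies finish.
  cudaEventRecord(input_event, io_stream);
  cudaStreamWaitEvent(exec_stream, input_event, 0);

  // ... kernels are launched on exec_stream (raw_predictor_->Run()) ...

  // OutputSync(): io_stream may not read the outputs until the kernels
  // finish, so the later D2H copies (Tensor::CopyToCpu) see final results.
  cudaEventRecord(output_event, exec_stream);
  cudaStreamWaitEvent(io_stream, output_event, 0);

  cudaEventDestroy(input_event);
  cudaEventDestroy(output_event);
}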
@@ -160,6 +236,17 @@ void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
   raw_predictor_->SaveModel(model_dir, model_type, record_info);
 }
 
+CxxPaddleApiImpl::~CxxPaddleApiImpl() {
+#ifdef LITE_WITH_CUDA
+  TargetWrapperCuda::DestroyEvent(input_event_);
+  for (size_t i = 0; i < output_events_.size(); ++i) {
+    TargetWrapperCuda::DestroyEvent(output_events_[i]);
+  }
+  if (multi_stream_) {
+    TargetWrapperCuda::DestroyStream(*io_stream_);
+    TargetWrapperCuda::DestroyStream(*exec_stream_);
+  }
+#endif
+}
+
 }  // namespace lite
 
 namespace lite_api {
......
@@ -84,7 +84,7 @@ TEST(CXXApi, clone_predictor) {
   auto* cloned_output_tensor = cloned_predictor->GetOutput(0);
 
   int step = 50;
-  for (int i = 0; i < output_tensor->data_size(); i += step) {
+  for (size_t i = 0; i < output_tensor->data_size(); i += step) {
     EXPECT_NEAR(output_tensor->data<float>()[i],
                 cloned_output_tensor->data<float>()[i],
                 1e-6);
......
@@ -30,6 +30,15 @@ Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
 // TODO(Superjomn) refine this by using another `const void* const_raw`;
 Tensor::Tensor(const void *raw) { raw_tensor_ = const_cast<void *>(raw); }
 
+#ifdef LITE_WITH_CUDA
+Tensor::Tensor(void *raw, cudaStream_t *stream)
+    : raw_tensor_(raw), io_stream_(stream) {}
+Tensor::Tensor(const void *raw, cudaStream_t *stream) : io_stream_(stream) {
+  raw_tensor_ = const_cast<void *>(raw);
+}
+#endif
+
 lite::Tensor *tensor(void *x) { return static_cast<lite::Tensor *>(x); }
 const lite::Tensor *ctensor(void *x) {
   return static_cast<const lite::Tensor *>(x);
@@ -93,8 +102,8 @@ void Tensor::CopyFromCpu(const T *src_data) {
         data, src_data, num * sizeof(T), lite::IoDirection::HtoH);
   } else if (type == TargetType::kCUDA) {
 #ifdef LITE_WITH_CUDA
-    lite::TargetWrapperCuda::MemcpySync(
-        data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
+    lite::TargetWrapperCuda::MemcpyAsync(
+        data, src_data, num * sizeof(T), lite::IoDirection::HtoD, *io_stream_);
 #else
     LOG(FATAL) << "Please compile the lib with CUDA.";
 #endif
@@ -113,8 +122,9 @@ void Tensor::CopyToCpu(T *data) const {
         data, src_data, num * sizeof(T), lite::IoDirection::HtoH);
   } else if (type == TargetType::kCUDA) {
 #ifdef LITE_WITH_CUDA
-    lite::TargetWrapperCuda::MemcpySync(
-        data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
+    lite::TargetWrapperCuda::MemcpyAsync(
+        data, src_data, num * sizeof(T), lite::IoDirection::DtoH, *io_stream_);
+    lite::TargetWrapperCuda::StreamSync(*io_stream_);
 #else
     LOG(FATAL) << "Please compile the lib with CUDA.";
 #endif
......
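Note on the two copy paths above: CopyToCpu() now issues an async D2H copy and then synchronizes the io stream, so the host buffer is guaranteed to be filled when the call returns. CopyFromCpu() stays fully asynchronous, which is safe because InputSync() later makes the exec stream(s) wait on the io stream before any kernel consumes the input.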
@@ -24,6 +24,10 @@
 #include <vector>
 #include "paddle_place.h"  // NOLINT
 
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
+
 namespace paddle {
 namespace lite_api {
@@ -61,8 +65,16 @@ struct LITE_API Tensor {
   // Set LoD of the tensor
   void SetLoD(const lod_t& lod);
 
+#ifdef LITE_WITH_CUDA
+  explicit Tensor(void* raw, cudaStream_t* stream);
+  explicit Tensor(const void* raw, cudaStream_t* stream);
+#endif
+
  private:
   void* raw_tensor_;
+#ifdef LITE_WITH_CUDA
+  cudaStream_t* io_stream_{nullptr};
+#endif
 };
 
 /// The PaddlePredictor defines the basic interfaces for different kinds of
@@ -155,6 +167,8 @@ class LITE_API CxxConfig : public ConfigBase {
 #endif
 #ifdef LITE_WITH_CUDA
   bool multi_stream_{false};
+  cudaStream_t* exec_stream_{nullptr};
+  cudaStream_t* io_stream_{nullptr};
 #endif
 #ifdef LITE_WITH_MLU
   lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
@@ -203,6 +217,12 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_CUDA
   void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
   bool multi_stream() const { return multi_stream_; }
+  void set_exec_stream(cudaStream_t* exec_stream) {
+    exec_stream_ = exec_stream;
+  }
+  void set_io_stream(cudaStream_t* io_stream) { io_stream_ = io_stream; }
+  cudaStream_t* exec_stream() { return exec_stream_; }
+  cudaStream_t* io_stream() { return io_stream_; }
 #endif
 #ifdef LITE_WITH_MLU
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>

#include <functional>  // for std::multiplies used below
#include <numeric>     // for std::accumulate used below
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
void RunModel(lite_api::CxxConfig config) {
auto predictor = lite_api::CreatePaddlePredictor(config);
const int batch_size = 4;
const int channels = 3;
const int height = 224;
const int width = 224;
auto input_tensor = predictor->GetInput(0);
std::vector<int64_t> input_shape{batch_size, channels, height, width};
input_tensor->Resize(input_shape);
std::vector<float> in_data(batch_size * channels * height * width);
for (size_t i = 0; i < in_data.size(); i++) {
in_data[i] = 1;
}
input_tensor->CopyFromCpu<float, lite_api::TargetType::kCUDA>(in_data.data());
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<float> results{
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598,
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598,
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598,
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598};
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], batch_size);
ASSERT_EQ(out->shape()[1], 1000);
std::vector<int64_t> shape = out->shape();
int out_num =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
std::vector<float> out_cpu(out_num);
out->CopyToCpu(out_cpu.data());
int step = 100;
for (size_t i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out_cpu[i * step], results[i], 1e-6);
}
}
TEST(Resnet50, config_no_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
RunModel(config);
}
TEST(Resnet50, config_exec_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
cudaStream_t exec_stream;
lite::TargetWrapperCuda::CreateStream(&exec_stream);
config.set_exec_stream(&exec_stream);
RunModel(config);
}
TEST(Resnet50, config_io_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
cudaStream_t io_stream;
lite::TargetWrapperCuda::CreateStream(&io_stream);
config.set_io_stream(&io_stream);
RunModel(config);
}
TEST(Resnet50, config_all_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
cudaStream_t exec_stream;
lite::TargetWrapperCuda::CreateStream(&exec_stream);
config.set_exec_stream(&exec_stream);
cudaStream_t io_stream;
lite::TargetWrapperCuda::CreateStream(&io_stream);
config.set_io_stream(&io_stream);
RunModel(config);
}
TEST(Resnet50, config_multi_exec_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
config.set_multi_stream(true);
RunModel(config);
}
} // namespace lite
} // namespace paddle
@@ -15,6 +15,7 @@
 #pragma once
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include "lite/backends/cuda/cuda_utils.h"
 #include "lite/core/target_wrapper.h"
 
 namespace paddle {
@@ -31,34 +32,40 @@ class TargetWrapper<TARGET(kCUDA)> {
   static size_t num_devices();
   static size_t maximum_stream() { return 0; }
 
-  static size_t GetCurDevice() {
+  static int GetCurDevice() {
     int dev_id;
-    cudaGetDevice(&dev_id);
+    CUDA_CALL(cudaGetDevice(&dev_id));
     return dev_id;
   }
-  static void CreateStream(stream_t* stream) {}
-  static void DestroyStream(const stream_t& stream) {}
+  static void CreateStream(stream_t* stream) {
+    CUDA_CALL(cudaStreamCreate(stream));
+  }
+  static void DestroyStream(const stream_t& stream) {
+    CUDA_CALL(cudaStreamDestroy(stream));
+  }
 
-  static void CreateEvent(event_t* event) { cudaEventCreate(event); }
+  static void CreateEvent(event_t* event) { CUDA_CALL(cudaEventCreate(event)); }
   static void CreateEventWithFlags(
       event_t* event, unsigned int flags = cudaEventDisableTiming) {
-    cudaEventCreateWithFlags(event, flags);
+    CUDA_CALL(cudaEventCreateWithFlags(event, flags));
   }
-  static void DestroyEvent(const event_t& event) { cudaEventDestroy(event); }
+  static void DestroyEvent(const event_t& event) {
+    CUDA_CALL(cudaEventDestroy(event));
+  }
 
   static void RecordEvent(const event_t& event) {}
   static void RecordEvent(const event_t& event, const stream_t& stream) {
-    cudaEventRecord(event, stream);
+    CUDA_CALL(cudaEventRecord(event, stream));
   }
   static void SyncEvent(const event_t& event) {}
 
   static void StreamSync(const stream_t& stream) {
-    cudaStreamSynchronize(stream);
+    CUDA_CALL(cudaStreamSynchronize(stream));
   }
   static void StreamSync(const stream_t& stream, const event_t& event) {
-    cudaStreamWaitEvent(stream, event, 0);
+    CUDA_CALL(cudaStreamWaitEvent(stream, event, 0));
  }
-  static void DeviceSync() { cudaDeviceSynchronize(); }
+  static void DeviceSync() { CUDA_CALL(cudaDeviceSynchronize()); }
 
   static void* Malloc(size_t size);
   static void Free(void* ptr);
......
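The CUDA_CALL macro used above comes from lite/backends/cuda/cuda_utils.h, which is not shown in this diff. As a rough sketch only (the real macro may differ), such a wrapper typically checks the returned cudaError_t and aborts with the decoded message:

#include <cuda_runtime.h>
#include <glog/logging.h>  // for CHECK; Lite actually uses lite/utils/cp_logging.h

// Sketch of a CUDA_CALL-style error check; the actual definition lives in
// lite/backends/cuda/cuda_utils.h and may differ from this.
#define CUDA_CALL(func)                                  \
  {                                                      \
    cudaError_t e = (func);                              \
    CHECK(e == cudaSuccess) << "CUDA runtime error: "    \
                            << cudaGetErrorString(e);    \
  }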
@@ -26,6 +26,10 @@
 namespace paddle {
 namespace lite {
 
+// kMaxStream was chosen based on multi-stream performance testing; it may
+// change if the default multi-stream algorithm changes.
+constexpr int kMaxStream = 6;
+
 #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
 typedef enum {
@@ -159,7 +163,7 @@ class Env {
     static Devs* devs = new Devs();
     return *devs;
   }
-  static void Init(int max_stream = 6) {
+  static void Init(int max_stream = lite::kMaxStream) {
 #ifdef LITE_WITH_MLU
     CNRT_CALL(cnrtInit(0));
 #endif
@@ -305,6 +309,7 @@ class Device<TARGET(kCUDA)> {
   bool has_hmma_;
   bool has_imma_;
   int runtime_version_;
+  // Currently only used in exec multi-stream mode.
   std::vector<cudaStream_t> exec_stream_;
   std::vector<cudaStream_t> io_stream_;
 };
......
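kMaxStream only fixes how many exec streams are pre-created per device; which of them a given kernel actually runs on is decided by multi_stream_analysis_pass, which is outside this diff. A purely hypothetical fallback policy would be a simple round-robin pick:

// Hypothetical round-robin choice among the pre-created exec streams; the
// real per-kernel assignment is computed by multi_stream_analysis_pass.
inline int PickExecStream(int kernel_index) {
  return kernel_index % paddle::lite::kMaxStream;
}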
@@ -40,7 +40,9 @@ class Optimizer {
  public:
   Optimizer() {}
 
-  Optimizer(Program&& program, const std::vector<Place>& valid_places) {
+  Optimizer(Program&& program,
+            const std::vector<Place>& valid_places,
+            const std::vector<std::string>& passes) {
     program_ = &program;
     valid_places_ = valid_places;
     CHECK(!valid_places.empty()) << "At least one valid_place should be set";
@@ -50,7 +52,7 @@ class Optimizer {
     factor.ConsiderPrecision();
     factor.ConsiderDataLayout();
 
-    Run(std::move(program), valid_places, factor, {});
+    Run(std::move(program), valid_places, factor, passes);
   }
 
   void Run(Program&& program,
......
@@ -73,7 +73,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
   std::map<std::string, cpp::VarDesc> origin_var_maps;
   auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
   auto var_size = main_block.VarsSize();
-  for (int i = 0; i < var_size; i++) {
+  for (int i = 0; i < static_cast<int>(var_size); i++) {
     auto v = main_block.GetVar<cpp::VarDesc>(i);
     auto name = v->Name();
     origin_var_maps.emplace(name, *v);
@@ -144,6 +144,15 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
     }
   }
 }
 
+#ifdef LITE_WITH_CUDA
+void RuntimeProgram::UpdateContext(cudaStream_t* exec, cudaStream_t* io) {
+  for (auto& inst : instructions_) {
+    inst.UpdateContext(exec, io);
+  }
+}
+#endif
+
 void RuntimeProgram::Run() {
 #ifdef LITE_WITH_PRECISION_PROFILE
   auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
@@ -210,7 +219,8 @@ void Program::Build(const cpp::ProgramDesc& prog) {
     if (op_type == "while" || op_type == "conditional_block" ||
         op_type == "subgraph") {
       auto sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
-      CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize())
+      CHECK(sub_block_idx >= 0 &&
+            sub_block_idx < static_cast<int>(program.BlocksSize()))
           << "Invalid attribute sub_block(" << sub_block_idx << ") for "
           << op_type;
       auto sub_block_desc =
......
@@ -128,6 +128,12 @@ struct Instruction {
     }
   }
   void Sync() const { kernel_->mutable_context()->As<CUDAContext>().Sync(); }
+  void UpdateContext(cudaStream_t* exec, cudaStream_t* io) {
+    if (kernel_->target() == TargetType::kCUDA) {
+      kernel_->mutable_context()->As<CUDAContext>().SetExecStream(*exec);
+      kernel_->mutable_context()->As<CUDAContext>().SetIoStream(*io);
+    }
+  }
 #endif
 
 #ifdef LITE_WITH_PROFILE
@@ -215,6 +221,12 @@ class LITE_API RuntimeProgram {
   // be added in vars_.
   void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
 
+#ifdef LITE_WITH_CUDA
+  // UpdateContext updates the exec stream and io stream of all kernels in
+  // the program.
+  void UpdateContext(cudaStream_t* exec, cudaStream_t* io);
+#endif
+
  private:
   RuntimeProgram(const RuntimeProgram&) = delete;
   std::vector<Instruction> instructions_;
......