Unverified Commit f6461e39 authored by W Wilber, committed by GitHub

add cuda cxx demo (#3205)

- Add a CUDA C++ demo.
- Detection models usually end with multiclass_nms, which is a host kernel; if the fetch kernel ran on CUDA, a useless io_copy (host->cuda) would be inserted at that point. For this reason, the CUDA fetch kernel is commented out and the host fetch kernel is used by default. Implicit behavior introduced here: after every predictor run, the output data is copied from CUDA to the CPU by default (see the caller-side sketch below).
Parent 7bf64f67
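What the host fetch default means for callers, shown as a minimal sketch (not part of this diff; it only reuses the public paddle_api.h names that appear in the demo below, and the helper name FetchOutputToHost is hypothetical): after predictor->Run(), the fetched output already lives on the CPU, so reading results needs no CUDA-specific code.

#include <cstdint>
#include <memory>
#include <vector>
#include "paddle_api.h"  // NOLINT

// Sketch: fetch one output into a caller-owned host buffer. Because the fetch
// kernel defaults to host, no device pointers or explicit syncs are involved.
std::vector<float> FetchOutputToHost(
    const std::shared_ptr<paddle::lite_api::PaddlePredictor>& predictor,
    int index) {
  auto output = predictor->GetOutput(index);  // host-side tensor
  int64_t numel = 1;
  for (auto d : output->shape()) numel *= d;
  std::vector<float> buf(numel, 0.f);
  output->CopyToCpu(buf.data());
  return buf;
}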
......@@ -135,19 +135,18 @@ if(LITE_WITH_CUDA)
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
#COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
)
add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared)
#add_dependencies(publish_inference_cuda_cxx_lib test_model_bin)
add_dependencies(publish_inference publish_inference_cuda_cxx_lib)
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos)
add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared)
......
......@@ -149,9 +149,6 @@ void RuntimeProgram::Run() {
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
#ifdef LITE_WITH_CUDA
TargetWrapperCuda::DeviceSync();
#endif
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#endif // LITE_WITH_PROFILE
......
cmake_minimum_required(VERSION 2.8)
project(demo CXX C)
set(TARGET demo)
set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx")
set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
include_directories("${LITE_LIB}/include")
link_directories("${LITE_LIB}/lib")
link_directories("${PROTOBUF_LIB}/lib")
add_executable(${TARGET} ${TARGET}.cc)
set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so)
set(DEPS ${DEPS} protobuf-lite)
set(DEPS ${DEPS} "-lrt -lpthread -ldl")
target_link_libraries(${TARGET} ${DEPS})
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
void RunModel(std::string model_dir) {
// 1. Create CxxConfig
CxxConfig config;
config.set_model_file(model_dir + "/__model__");
config.set_param_file(model_dir + "/__params__");
config.set_valid_places({
Place{TARGET(kCUDA), PRECISION(kFloat)},
});
// 2. Create PaddlePredictor by CxxConfig
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<CxxConfig>(config);
// 3. Prepare input data
int num = 1;
int channels = 3;
int height = 608;
int width = 608;
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize({num, channels, height, width});
// fake input data
std::vector<float> data(num * channels * height * width, 0);
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = i % 10 * 0.1;
}
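// Copy the prepared host buffer into the CUDA input tensor (host -> device).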
input_tensor->CopyFromCpu<float, TargetType::kCUDA>(data.data());
std::unique_ptr<Tensor> size_tensor(std::move(predictor->GetInput(1)));
size_tensor->Resize({1, 2});
std::vector<int> size_data{608, 608};
size_tensor->CopyFromCpu<int, TargetType::kCUDA>(size_data.data());
// 4. Run predictor
predictor->Run();
// 5. Get output
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
std::vector<float> out_cpu(ShapeProduction(output_tensor->shape()), 0);
std::cout << "output size is " << ShapeProduction(output_tensor->shape())
<< std::endl;
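// Copy the output into a caller-owned host buffer; with the default host fetch
// kernel, the data has already been brought back to the CPU after Run().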
output_tensor->CopyToCpu(out_cpu.data());
for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
std::cout << "Output[" << i << "]: " << out_cpu[i] << std::endl;
}
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "[ERROR] usage: " << argv[0] << " model_dir\n";
exit(1);
}
std::string model_dir = argv[1];
RunModel(model_dir);
return 0;
}
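Because the commit message notes that these detection models typically end with multiclass_nms, the flattened output printed above is commonly a sequence of six floats per detection. The following is a hedged sketch of how it might be interpreted; the [label, score, xmin, ymin, xmax, ymax] row layout is an assumption about the model, not something this commit guarantees, and ParseNmsOutput is a hypothetical helper.

#include <cstddef>
#include <vector>

// Hypothetical helper (not part of this commit): interpret the fetched buffer
// as multiclass_nms rows, assuming 6 floats per detection:
// [class_label, confidence, xmin, ymin, xmax, ymax].
struct Detection {
  int label;
  float score;
  float xmin, ymin, xmax, ymax;
};

std::vector<Detection> ParseNmsOutput(const std::vector<float>& out_cpu) {
  std::vector<Detection> dets;
  for (std::size_t i = 0; i + 6 <= out_cpu.size(); i += 6) {
    dets.push_back({static_cast<int>(out_cpu[i]), out_cpu[i + 1],
                    out_cpu[i + 2], out_cpu[i + 3], out_cpu[i + 4],
                    out_cpu[i + 5]});
  }
  return dets;
}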
......@@ -50,24 +50,26 @@ void FetchCompute<T, Ptype>::Run() {
typedef paddle::lite::kernels::cuda::FetchCompute<float, PRECISION(kFloat)>
FetchFp32;
REGISTER_LITE_KERNEL(fetch, kCUDA, kFloat, kNCHW, FetchFp32, nchw)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(fetch, kCUDA, kFloat, kNHWC, FetchFp32, nhwc)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
// When the model ends with a CPU kernel, adding CUDA's fetch kernel would
// insert a useless io_copy.
// REGISTER_LITE_KERNEL(fetch, kCUDA, kFloat, kNCHW, FetchFp32, nchw)
// .BindInput("X",
// {LiteType::GetTensorTy(TARGET(kCUDA),
// PRECISION(kFloat),
// DATALAYOUT(kNCHW))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kFloat),
// DATALAYOUT(kNCHW))})
// .Finalize();
//
// REGISTER_LITE_KERNEL(fetch, kCUDA, kFloat, kNHWC, FetchFp32, nhwc)
// .BindInput("X",
// {LiteType::GetTensorTy(TARGET(kCUDA),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();