diff --git a/demo/CMakeLists.txt b/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9bb0910053db5f2fc4f2a425bbc3804fd94e0b0
--- /dev/null
+++ b/demo/CMakeLists.txt
@@ -0,0 +1,17 @@
+# CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(LibTorchDemo)
+
+# compile options
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_STANDARD 14)
+
+# packages
+find_package(OpenCV REQUIRED)
+find_package(Torch REQUIRED PATHS "/usr/local/lib/libtorch") # assumes libtorch is unpacked under /usr/local/lib
+
+add_executable(digit digit.cpp)
+# link libtorch and OpenCV
+target_link_libraries(digit ${TORCH_LIBRARIES})
+target_link_libraries(digit ${OpenCV_LIBS})
\ No newline at end of file
diff --git a/demo/convert2jit.py b/demo/convert2jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9821fec1e6bb1d81e1336eed225e7cb51e37b403
--- /dev/null
+++ b/demo/convert2jit.py
@@ -0,0 +1,10 @@
+import torch
+from digit import Digit
+
+model = Digit()
+model.load_state_dict(torch.load("model/digit.pth", map_location="cpu"))
+
+sample = torch.randn(1, 1, 8, 8)
+
+trace_model = torch.jit.trace(model, sample)
+trace_model.save("model/digit.jit")
\ No newline at end of file
diff --git a/demo/digit.cpp b/demo/digit.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdd08a70928ea063c29402380bfb809692ba7512
--- /dev/null
+++ b/demo/digit.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <fstream>
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+
+void checkPath(const char* path) {
+    std::ifstream in;
+    in.open(path);
+    bool flag = (bool)in;
+    in.close();
+    if (flag) return;
+    else {
+        std::cout << "file " << path << " doesn't exist!" << std::endl;
+        exit(-1);
+    }
+}
+
+int main(int argc, char const *argv[])
+{
+    if (argc != 3) {
+        std::cout << "usage : digit <model> <image>" << std::endl;
+        return -1;
+    }
+
+    checkPath(argv[1]);
+    checkPath(argv[2]);
+    cv::Mat img = cv::imread(argv[2]), gimg, fimg, rimg;
+    cv::cvtColor(img, gimg, cv::COLOR_BGR2GRAY);
+
+    gimg.convertTo(fimg, CV_32F, -1. / 255., 1.);
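+    // convertTo applies out = alpha * in + beta, i.e. 1 - in/255 here --
+    // the same "1 - image / 255." inversion that test_jit.py performs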
+    cv::resize(fimg, rimg, {8, 8});
+
+    // convert Mat to tensor
+    at::Tensor img_tensor = torch::from_blob(
+        rimg.data,
+        {1, 1, 8, 8},
+        torch::kFloat32
+    );
+
+    // load model
+    torch::jit::Module model = torch::jit::load(argv[1]);
+
+    // torch.no_grad()
+    torch::NoGradGuard no_grad; // be sure to keep this line: without it autograd state accumulates and memory blows up
+
+    // forward
+    torch::Tensor out = model.forward({img_tensor}).toTensor();
+    int pre_lab = torch::argmax(out, 1).item().toInt();
+
+    std::cout << "predict number is " << pre_lab << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/demo/digit.py b/demo/digit.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a969aad5e379bf0d642163aafec8799480c9686
--- /dev/null
+++ b/demo/digit.py
@@ -0,0 +1,95 @@
+from sklearn.datasets import load_digits
+import torch
+from torch import nn
+import torch.utils.data as Data
+import numpy as np
+from sklearn.metrics import accuracy_score
+import matplotlib.pyplot as plt
+import os
+
+class Digit(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(1, 16, 3, 1, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 32, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(32, 16, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 8, 3, 1, 1)
+        )
+
+        self.output = nn.Linear(32, 10)
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.output(out.flatten(1))
+        return out
+
+RATIO = 0.8
+BATCH_SIZE = 128
+EPOCH = 10
+
+if __name__ == "__main__":
+    X, y = load_digits(return_X_y=True)
+    X = X / 16.
+    sample_num = len(y)
+    X = [x.reshape(1, 8, 8).tolist() for x in X]
+
+    indice = np.arange(sample_num)
+    np.random.shuffle(indice)
+
+    X = torch.FloatTensor(X)
+    y = torch.LongTensor(y)
+    offline = int(sample_num * RATIO)
+
+    train = Data.TensorDataset(X[indice[:offline]], y[indice[:offline]])
+    test = Data.TensorDataset(X[indice[offline:]], y[indice[offline:]])
+
+    train_loader = Data.DataLoader(train, BATCH_SIZE, True)
+    test_loader = Data.DataLoader(test, BATCH_SIZE, False)
+
+    model = Digit()
+    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
+    criterion = nn.CrossEntropyLoss(reduction="mean")
+
+    test_losses = []
+    test_accs = []
+
+    for epoch in range(EPOCH):
+        model.train()
+        for bx, by in train_loader:
+            out = model(bx)
+            loss = criterion(out, by)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        model.eval()
+        test_loss = []
+        test_acc = []
+        for bx, by in test_loader:
+            with torch.no_grad():
+                out = model(bx)
+                pre_lab = out.argmax(1)
+                loss = criterion(out, by)
+
+            test_loss.append(loss.item())
+            test_acc.append(accuracy_score(by, pre_lab))
+
+        test_losses.append(np.mean(test_loss))
+        test_accs.append(np.mean(test_acc))
+
+    plt.figure(dpi=120)
+    plt.plot(test_losses, 'o-', label="loss")
+    plt.plot(test_accs, 'o-', label="accuracy")
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+    if not os.path.exists("model"):
+        os.makedirs("model")
+    torch.save(model.state_dict(), "model/digit.pth")
\ No newline at end of file
diff --git a/demo/digit_test.cpp b/demo/digit_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e77efbc5dacff654a1254de522f57ad584aa8e1
--- /dev/null
+++ b/demo/digit_test.cpp
@@ -0,0 +1,9 @@
+#include <iostream>
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+
+int main(int argc, char const *argv[])
+{
+    std::cout << "hello world!" << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/demo/image/sample.png b/demo/image/sample.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a253b2910672a5a7ba314a1d09348047f94a1f6
Binary files /dev/null and b/demo/image/sample.png differ
diff --git a/demo/readme.md b/demo/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c3bf0a6b109693063c4edb1ea78aa9756d9541e
--- /dev/null
+++ b/demo/readme.md
@@ -0,0 +1,18 @@
+# Usage
+
+```py
+# smoke test
+1. replace digit.cpp with the contents of digit_test.cpp
+2. mkdir build && cd build
+3. cmake .. && make -j4
+4. ./digit
+
+# loading a model with libtorch
+1. python digit.py        # train a native PyTorch model
+2. python convert2jit.py  # convert the PyTorch model to a JIT model
+3. python test_jit.py     # benchmark the JIT model
+4. mkdir build && cd build
+5. cmake .. && make -j4
+6. ./digit model/digit.jit image/sample.png
+```
+
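+If everything builds, step 6 prints the predicted class; illustrative output (the actual digit depends on the sample image):
+
+```
+predict number is 3
+```
+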
diff --git a/demo/test_jit.py b/demo/test_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..4316a58bd2465dc18791e5cea927925776f7d83e
--- /dev/null
+++ b/demo/test_jit.py
@@ -0,0 +1,45 @@
+import time
+import torch
+import cv2 as cv
+from digit import Digit
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+
+def run_model(model, image):
+    s = time.time()
+    out = model(image)
+    pre_lab = torch.argmax(out, dim=1)
+    cost_time = round(time.time() - s, 5)
+    return cost_time
+
+image = cv.imread("image/sample.png")
+image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
+image = 1 - image / 255.
+image = cv.resize(image, (8, 8))
+
+
+image = torch.FloatTensor(image).unsqueeze(0).unsqueeze(0).contiguous()
+origin_model = Digit()
+origin_model.load_state_dict(torch.load("model/digit.pth"))
+jit_model = torch.jit.load("model/digit.jit")
+
+# warm up: the JIT model optimizes itself over the first few runs
+for _ in range(3):
+    run_model(origin_model, image)
+    run_model(jit_model, image)
+
+test_times = 10
+
+# begin testing
+results = pd.DataFrame({
+    "type" : ["origin"] * test_times + ["jit"] * test_times,
+    "cost_time" : [run_model(origin_model, image) for _ in range(test_times)] + [run_model(jit_model, image) for _ in range(test_times)]
+})
+
+plt.figure(dpi=120)
+sns.boxplot(
+    x=results["type"],
+    y=results["cost_time"]
+)
+plt.show()
\ No newline at end of file
diff --git a/doc/libtorch.md b/doc/libtorch.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0c3d0c115e3b4c17a3e0c0f66c9f4b2d4c73ada
--- /dev/null
+++ b/doc/libtorch.md
@@ -0,0 +1,297 @@
+- https://pytorch.org/tutorials/advanced/cpp_export.html
+- [LibTorch的安装、配置与使用](https://blog.csdn.net/weixin_45632168/article/details/114679263)
+- [libtorch c++调用 (五)Linux下的调用](https://blog.csdn.net/juluwangriyue/article/details/108463026)
+- https://pytorch.org/cppdocs/
+- [libtorch教程](https://www.zhihu.com/column/c_1373368181138972672)
+- [VS2019 配置 LibTorch 和 OpenCV](https://zhuanlan.zhihu.com/p/375084412)
+
+[toc]
+
+# Environment
+
+```python
+Ubuntu 20.04 (gcc 9.4.0-1ubuntu1~20.04.1)
+python3.7
+pytorch-1.11.0
+cuda-11.0
+libtorch-1.12-cuda113
+gcc version 9.4.0
+cmake version 3.22.5
+GNU Make 4.2.1
+
+# ---------------------------------------
+centos7
+python3.7
+cuda-10.2
+cudnn-10.2-linux-x64-v8.1.0.77
+torch-1.11.0+cu102-cp37-cp37m-linux_x86_64
+torchvision-0.12.0+cu102-cp37-cp37m-linux_x86_64
+libtorch-shared-with-deps-1.12.1+cu102
+onnxruntime-linux-x64-1.12.1
+cmake version 3.14.5
+GNU Make 3.82
+gcc version 8.3.1 20190311
+```
+
+
+
+# Downloading libtorch
+
+- https://pytorch.org/
+
+```python
+# cuda113-linux (requires cuda-11.3 plus a matching cudnn 8.x)
+https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.1%2Bcu113.zip
+# cpu-linux
+https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.12.1%2Bcpu.zip
+```
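+
+After downloading, unpack the archive somewhere stable; this doc (and the demo's CMakeLists.txt) assumes it lives under /usr/local/lib:
+
+```python
+unzip libtorch-shared-with-deps-1.12.1%2Bcpu.zip -d /usr/local/lib
+```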
+
+# Converting a PyTorch model to Torch Script
+
+```python
+import torch
+import torchvision
+# An instance of your model.
+model = torchvision.models.resnet18()
+# An example input you would normally provide to your model's forward() method.
+example = torch.rand(1, 3, 224, 224)
+# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
+traced_script_module = torch.jit.trace(model, example)
+traced_script_module.save("traced_resnet_model.pt")
+```
+
+# Loading the model in C++
+
+```c
+#include <torch/script.h>
+#include <torch/cuda.h> // CUDA-related helpers (torch::cuda::is_available)
+#include <iostream>
+#include <memory>
+
+int main(int argc, const char* argv[]) {
+  if (argc != 2) {
+    std::cerr << "usage: example-app <path-to-exported-script-module>\n";
+    return -1;
+  }
+
+  torch::DeviceType device_type = at::kCPU; // pick the device type
+  if (torch::cuda::is_available())
+      device_type = at::kCUDA;
+
+
+  torch::jit::script::Module model;
+  try {
+    // Deserialize the ScriptModule from a file using torch::jit::load().
+    model = torch::jit::load(argv[1]);
+  }
+  catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n"; return -1;
+  }
+  std::cout << "ok\n";
+
+  // Create a vector of inputs.
+
+  // std::vector<torch::jit::IValue> inputs;
+  // inputs.push_back(torch::ones({1, 3, 224, 224}));
+
+  model.to(device_type);
+  std::vector<torch::jit::IValue> inputs;
+  inputs.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+
+  // Execute the model and turn its output into a tensor.
+
+  at::Tensor output = model.forward(inputs).toTensor();
+  std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';
+}
+
+```
+
+## Using OpenCV together with libtorch
+
+- https://blog.csdn.net/mmmkl1/article/details/118522533
+- https://github.com/qubvel/segmentation_models.pytorch
+
+```c++
+#include "DemoPytorch.h"
+// headers below reconstructed -- the original include list was lost in formatting
+#include <torch/script.h>
+#include <torch/torch.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <iostream>
+#include <vector>
+#include <memory>
+
+int main() {
+    // load model
+    torch::jit::script::Module module;
+    try {
+        module = torch::jit::load("./torch_script_eval.pt");
+        module.to(torch::kCPU); // set model to cpu mode
+        /*module.to(torch::kCUDA);*/ // set model to cuda mode
+        module.eval();
+        std::cout << "MODEL LOADED";
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+    }
+
+    // load img
+    cv::Mat img_original = cv::imread("./00011584_002.png",0);
+    cv::Mat img = cv::Mat(img_original);
+    // normalize
+    cv::resize(img, img, cv::Size(512, 512));
+    img.convertTo(img, CV_32FC1);
+    // img to tensor
+    torch::Tensor mean = torch::tensor({ 0.485,0.456,0.406 });
+    torch::Tensor std = torch::tensor({ 0.229, 0.224, 0.225 });
+    auto input_tensor = torch::from_blob(img.data, { 512,512,1 });
+    input_tensor = input_tensor / 255.0f;
+    input_tensor = input_tensor - mean;
+    input_tensor = input_tensor / std;
+    input_tensor = input_tensor.permute({ 2,0,1 });
+    input_tensor = input_tensor.to(torch::kCPU);
+    /*input_tensor = input_tensor.to(torch::kCUDA);*/
+    input_tensor = input_tensor.unsqueeze(0);
+    std::vector<torch::jit::IValue> input;
+    input.push_back(input_tensor);
+    // pred begin
+    auto pred = module.forward(input).toTensor();
+    // pred tensor to mat
+    pred = pred.squeeze().detach();
+    pred = pred * 255;
+    pred = pred.to(torch::kU8);
+    pred = pred.to(torch::kCPU);
+    cv::Mat output_mat(cv::Size{ 512,512 }, CV_8UC1, pred.data_ptr());
+    // show result
+    cv::imshow("original img", img_original);
+    cv::imshow("mask", output_mat);
+    cv::waitKey(0);
+    cv::destroyWindow("original img");
+    cv::destroyWindow("mask");
+
+    return 0;
+}
+
+```
+
+
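+One caveat with the pattern above: `torch::from_blob` wraps the existing `cv::Mat` buffer without copying or owning it, so the tensor is only valid while the Mat is alive and unmodified. A minimal sketch (file and variable names illustrative) of making the tensor own its data:
+
+```c++
+cv::Mat img = cv::imread("input.png", 0);
+img.convertTo(img, CV_32FC1, 1.0 / 255.0);
+// clone() copies the wrapped buffer into memory owned by the tensor
+torch::Tensor t = torch::from_blob(img.data, {1, 1, img.rows, img.cols}, torch::kFloat32).clone();
+```
+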
+# test
+
+```cpp
+//%%file main.cpp
+
+#include <torch/torch.h>
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+    // torch::Tensor tensor = torch::eye(3);
+    torch::Tensor tensor = torch::eye(3).to(at::kCUDA); // move the data onto the GPU
+    std::cout << tensor << std::endl;
+    cout << "Hello World!" << endl;
+    return 0;
+}
+
+
+// second test: check whether the GPU is usable
+
+#include <torch/script.h>
+#include <torch/cuda.h> // CUDA-related helpers
+#include <iostream>
+#include <memory>
+
+int main()
+{
+    std::cout <<"cuda::is_available():" << torch::cuda::is_available() << std::endl;
+    // system("pause");
+    return 0;
+}
+```
+
+```makefile
+#%%file CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.5)
+
+project(libtorch_demo LANGUAGES CXX)
+
+# packages
+#find_package(CUDA)
+# nvcc flags
+#set(CUDA_NVCC_FLAGS -gencode arch=compute_20,code=sm_20;-G;-g)
+#set(CUDA_NVCC_FLAGS -gencode;arch=compute_60,code=sm_60;-G;-g)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(Torch_DIR /kaggle/working/libtorch/share/cmake/Torch)
+find_package(Torch REQUIRED)
+
+add_executable(libtorch_demo main.cpp)
+target_link_libraries(libtorch_demo "${TORCH_LIBRARIES}")
+set_property(TARGET libtorch_demo PROPERTY CXX_STANDARD 17)
+```
+
+
+
+```python
+mkdir build
+cd build
+cmake ..
+# cmake .. -DCUDNN_INCLUDE_DIR=/usr/include -DCUDNN_LIBRARY=/usr/lib/x86_64-linux-gnu
+make
+```
+
+```makefile
+cmake_minimum_required(VERSION 3.5)
+
+project(dtp)
+
+#find_package(OpenCV REQUIRED)
+
+#message(STATUS "OpenCV library status:")
+#message(STATUS "    version: ${OpenCV_VERSION}")
+#message(STATUS "    libraries: ${OpenCV_LIBS}")
+#message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+set(Torch_DIR /kaggle/working/libtorch/share/cmake/Torch)
+find_package(Torch REQUIRED)
+#include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(${TORCH_INCLUDE_DIRS})
+add_executable(dtp main.cpp)
+target_link_libraries(dtp
+        "${TORCH_LIBRARIES}"
+        # ${OpenCV_LIBS}
+        )
+
+set_property(TARGET dtp PROPERTY CXX_STANDARD 14)
+```
+
+
+
+# Common errors
+
+```python
+# 1.
+CMake Error: CMake was unable to find a build program corresponding to "Unix Makefiles". CMAKE_MAKE_PROGRAM is not set. You probably need to select a different build tool.
+CMake Error: CMAKE_CXX_COMPILER not set, after EnableLanguage
+# Fix: yum install make -y  (if the C++ compiler itself is missing: yum install gcc-c++ -y)
+
+# 2.
+-- Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_NVCC_EXECUTABLE CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY)
+# Fix: install CUDA (copying over an existing /usr/local/cuda also works)
+
+# 3.
+OSError: libcudnn.so.8: cannot open shared object file: No such file or directory
+
+# Fix: download cudnn >= 8.0 and copy its files into /usr/local/cuda
+
+# 4.
+cannot find -lCUDA_cublas_LIBRARY-NOTFOUND
+# Fix: find all libcublas.so files and copy them into /usr/local/cuda/lib64
+```
+
diff --git a/doc/onnxruntime.md b/doc/onnxruntime.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d495631c420d551cb2923f33a9c818f94151b87
--- /dev/null
+++ b/doc/onnxruntime.md
@@ -0,0 +1,337 @@
+- https://onnxruntime.ai/
+- https://onnxruntime.ai/docs/tutorials/traditional-ml.html
+- https://github.com/microsoft/onnxruntime
+- https://github.com/microsoft/onnxruntime-inference-examples
+- `pip install onnxruntime`
+
+[toc]
+
+```python
+from torchvision.models import resnet18
+import torch
+
+model = resnet18()
+torch.onnx.export(model, torch.randn(1, 3, 224, 224),
+                  'model.onnx', verbose=True, opset_version=11,
+                  input_names=['input'],   # the model's input names
+                  output_names=['output']
+                  )
+
+```
+
+
+
+##### Converting an ONNX model to ORT format
+
+```python
+python -m onnxruntime.tools.convert_onnx_models_to_ort model.onnx  # generates a .ort file
+```
+
+
+
+```python
+import onnxruntime
+import torch
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+x = torch.randn(1,3,224,224)
+model_path = "model.onnx" # or 'model.ort'
+# ort_session = onnxruntime.InferenceSession(model_path) # defaults to CPU
+ort_session = onnxruntime.InferenceSession(model_path,providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
+ort_outs = ort_session.run(None, ort_inputs)[0]
+ort_outs = torch.softmax(torch.from_numpy(ort_outs), -1)
+print(ort_outs.argmax(-1))
+```
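+
+The providers list is a priority order: if the TensorRT or CUDA provider is unavailable in the installed build, ORT silently falls back to the next one. A quick way to check what the installed package actually supports:
+
+```python
+import onnxruntime
+# execution providers compiled into this onnxruntime build
+print(onnxruntime.get_available_providers())
+# a GPU build typically lists TensorRT/CUDA ahead of CPUExecutionProvider
+```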
+
+
+
+----
+
+
+
+# Installation
+
+```python
+# in any one environment, only one of these two packages can be installed at a time
+pip install onnxruntime
+pip install onnxruntime-gpu
+
+# install ONNX exporters
+## ONNX is built into PyTorch
+pip install torch
+## tensorflow
+pip install tf2onnx
+## sklearn
+pip install skl2onnx
+```
+
+## PyTorch CV
+
+- Export the model with `torch.onnx.export`
+
+```python
+torch.onnx.export(model,                              # model being run
+                  torch.randn(1, 28, 28).to(device),  # model input (or a tuple for multiple inputs)
+                  "fashion_mnist_model.onnx",         # where to save the model (can be a file or file-like object)
+                  input_names = ['input'],            # the model's input names
+                  output_names = ['output'])          # the model's output names
+
+
+# Export the model
+torch.onnx.export(model,                     # model being run
+                  (text, offsets),           # model input (or a tuple for multiple inputs)
+                  "ag_news_model.onnx",      # where to save the model (can be a file or file-like object)
+                  export_params=True,        # store the trained parameter weights inside the model file
+                  opset_version=10,          # the ONNX version to export the model to
+                  do_constant_folding=True,  # whether to execute constant folding for optimization
+                  input_names = ['input', 'offsets'],  # the model's input names
+                  output_names = ['output'],           # the model's output names
+                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
+                                'output' : {0 : 'batch_size'}})
+```
+
+- Load the ONNX model with `onnx.load`
+
+```python
+import onnx
+onnx_model = onnx.load("fashion_mnist_model.onnx")
+onnx.checker.check_model(onnx_model)
+```
+
+- Create an inference session with `ort.InferenceSession`
+
+```python
+import onnxruntime as ort
+import numpy as np
+x, y = test_data[0][0], test_data[0][1]
+ort_sess = ort.InferenceSession('fashion_mnist_model.onnx')
+outputs = ort_sess.run(None, {'input': x.numpy()})
+
+# Print Result
+predicted, actual = classes[outputs[0][0].argmax(0)], classes[y]
+print(f'Predicted: "{predicted}", Actual: "{actual}"')
+```
+
+## SciKit Learn CV
+
+```python
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+iris = load_iris()
+X, y = iris.data, iris.target
+X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+from sklearn.linear_model import LogisticRegression
+clr = LogisticRegression()
+clr.fit(X_train, y_train)
+print(clr)
+
+# LogisticRegression()
+```
+
+- Convert/export the model to ONNX format
+
+```python
+from skl2onnx import convert_sklearn
+from skl2onnx.common.data_types import FloatTensorType
+
+initial_type = [('float_input', FloatTensorType([None, 4]))]
+onx = convert_sklearn(clr, initial_types=initial_type)
+with open("logreg_iris.onnx", "wb") as f:
+    f.write(onx.SerializeToString())
+```
+
+- Load and run the model with ONNX Runtime to compute this machine-learning model's predictions
+
+```python
+import numpy
+import onnxruntime as rt
+
+# sess = rt.InferenceSession("logreg_iris.onnx") # defaults to CPU
+sess = rt.InferenceSession("logreg_iris.onnx",providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+input_name = sess.get_inputs()[0].name
+pred_onx = sess.run(None, {input_name: X_test.astype(numpy.float32)})[0]
+print(pred_onx)
+```
+
+- Get the predicted class
+
+```python
+import numpy
+import onnxruntime as rt
+
+sess = rt.InferenceSession("logreg_iris.onnx")
+input_name = sess.get_inputs()[0].name
+label_name = sess.get_outputs()[0].name
+pred_onx = sess.run(
+    [label_name], {input_name: X_test.astype(numpy.float32)})[0]
+print(pred_onx)
+```
+
+
+
+## C++ version
+
+- Download a release package from https://github.com/microsoft/onnxruntime/releases
+
+
+
+```c++
+//main.cpp
+//https://blog.csdn.net/baidu_34595620/article/details/112176278
+//https://github.com/microsoft/onnxruntime-inference-examples/blob/main/c_cxx/squeezenet/main.cpp
+#include <assert.h>
+#include <vector>
+#include <iostream>
+// #include "onnxruntime_c_api.h"
+#include "onnxruntime_cxx_api.h"
+// #include "cuda_provider_factory.h"
+#include <stdio.h>
+#include <ctime> // for clock()
+clock_t t_start,t_end;
+
+int main(int argc, const char* argv[]) {
+  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  Ort::SessionOptions session_options;
+  session_options.SetIntraOpNumThreads(1);
+
+  session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+  #ifdef _WIN32
+  const wchar_t* model_path = L"model.onnx";
+  #else
+  const char* model_path = "model.onnx";
+  #endif
+
+  Ort::Session session(env, model_path, session_options);
+  // print model input layer (node names, types, shape etc.)
+  Ort::AllocatorWithDefaultOptions allocator;
+
+  // print number of model input nodes
+  size_t num_input_nodes = session.GetInputCount();
+  std::vector<const char*> input_node_names(num_input_nodes);
+  std::vector<int64_t> input_node_dims;  // simplify... this model has only 1 input node {1, 3, 224, 224}.
+                                         // Otherwise need vector<vector<int64_t>>
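+  // Note: GetInputName/GetOutputName used below are the pre-1.13 C++ API matching the
+  // onnxruntime 1.12.1 pinned in this doc; onnxruntime >= 1.13 renames them to
+  // GetInputNameAllocated/GetOutputNameAllocated.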
+
+  printf("Number of inputs = %zu\n", num_input_nodes);
+
+  // iterate over all input nodes
+  for (int i = 0; i < num_input_nodes; i++) {
+    // print input node names
+    char* input_name = session.GetInputName(i, allocator);
+    printf("Input %d : name=%s\n", i, input_name);
+    input_node_names[i] = input_name;
+
+    // print input node types
+    Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
+    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
+
+    ONNXTensorElementDataType type = tensor_info.GetElementType();
+    printf("Input %d : type=%d\n", i, type);
+
+    // print input shapes/dims
+    input_node_dims = tensor_info.GetShape();
+    printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
+    for (size_t j = 0; j < input_node_dims.size(); j++)
+      printf("Input %d : dim %zu=%jd\n", i, j, input_node_dims[j]);
+  }
+
+  size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
+                                             // use OrtGetTensorShapeElementCount() to get official size!
+
+  std::vector<float> input_tensor_values(input_tensor_size);
+  std::vector<const char*> output_node_names = {"output"};//{"softmaxout_1"};
+
+  // initialize input data with values in [0.0, 1.0]
+  for (unsigned int i = 0; i < input_tensor_size; i++)
+    input_tensor_values[i] = (float)i / (input_tensor_size + 1);
+
+  // create input tensor object from data values
+  auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
+  assert(input_tensor.IsTensor());
+
+  int nums=100;
+  t_start=clock(); // start timing
+  // timing loop (reconstructed from the squeezenet sample this file is based on)
+  for(int i=0;i<nums;i++){
+    auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+  }
+  t_end=clock();
+  printf("mean cost time: %f s\n", (double)(t_end - t_start) / CLOCKS_PER_SEC / nums);
+
+  /*
+  // score model with the input tensor, get back an output tensor
+  auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+  assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
+
+  // get pointer to output tensor float values
+  float* floatarr = output_tensors.front().GetTensorMutableData<float>();
+  // assert(abs(floatarr[0] - 0.000045) < 1e-6);
+
+  // score the model, and print scores for first 5 classes
+  for (int i = 0; i < 5; i++)
+    printf("Score for class [%d] = %f\n", i, floatarr[i]);
+  */
+  // Results should be as below...
+  // Score for class[0] = 0.000045
+  // Score for class[1] = 0.003846
+  // Score for class[2] = 0.000125
+  // Score for class[3] = 0.001180
+  // Score for class[4] = 0.001317
+
+
+  // release buffers allocated by ORT allocator
+  for(const char* node_name : input_node_names)
+    allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
+
+  printf("Done!\n");
+
+}
+
+```
+
+```makefile
+# CMakeLists.txt
+project(capi_test)
+
+set(CMAKE_BUILD_TYPE Debug)
+
+cmake_minimum_required(VERSION 3.13)
+
+#option(ONNXRUNTIME_ROOTDIR "onnxruntime root dir")
+
+# tensorrt_provider_factory.h contains old APIs of the tensorrt execution provider
+#include(CheckIncludeFileCXX)
+#CHECK_INCLUDE_FILE_CXX(tensorrt_provider_factory.h HAVE_TENSORRT_PROVIDER_FACTORY_H)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+#include_directories(
+#  ${ONNXRUNTIME_ROOTDIR}/include/onnxruntime/core/session/
+#  ${ONNXRUNTIME_ROOTDIR}/include/onnxruntime/core/providers/tensorrt/
+#)
+include_directories("/opt/onnxruntime/include")
+link_directories("/opt/onnxruntime/lib")
+
+
+ADD_EXECUTABLE(capi_test main.cpp)
+if(HAVE_TENSORRT_PROVIDER_FACTORY_H)
+  target_compile_definitions(capi_test PRIVATE -DHAVE_TENSORRT_PROVIDER_FACTORY_H)
+endif()
+target_link_libraries(capi_test onnxruntime)
+```
+
+## [ORT training with PyTorch](https://onnxruntime.ai/docs/get-started/training-pytorch.html)
+
+```python
+pip install torch-ort
+python -m torch_ort.configure
+
+from torch_ort import ORTModule
+# ... build the model, optimizer and data loaders as usual ...
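+# ORTModule wraps an existing nn.Module in place; the training loop that follows stays unchanged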
+model = ORTModule(model)
+```
+
diff --git a/doc/readme.md b/doc/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..fa38b498141c464144eac420018b118f0102a468
--- /dev/null
+++ b/doc/readme.md
@@ -0,0 +1,133 @@
+- [NCNN、OpenVino、 TensorRT、MediaPipe、ONNX,各种推理部署架构,到底哪家强?](https://www.bilibili.com/read/cv13656068)
+
+
+
+[toc]
+
+# 1. Pure Python
+
+- mean time per image: 0.02707 s [cpu], 0.00655 s [gpu]
+
+```python
+from torchvision.models import resnet18
+import torch
+import time
+
+device = "cpu"  # cpu: 0.02707, gpu: 0.00655
+nums = 100
+model = resnet18().to(device)
+inputs = torch.randn(nums, 3, 224, 224).to(device)
+start = time.perf_counter()
+for i in range(nums):
+    preds = model(inputs[[i]])
+end = time.perf_counter()
+print(f"mean_time:{((end - start) / nums):.5f}")
+
+torch.onnx.export(model, torch.randn(1, 3, 224, 224),
+                  'model.onnx', verbose=True, opset_version=11,
+                  input_names=['input'],  # the model's input names
+                  output_names=['output']
+                  )
+
+example = torch.rand(1, 3, 224, 224)
+# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
+traced_script_module = torch.jit.trace(model, example)
+traced_script_module.save("traced_resnet_model.pt")
+
+# python -m onnxruntime.tools.convert_onnx_models_to_ort model.onnx  # generates model.ort
+```
+
+
+
+# 2. libtorch
+
+- mean time per image: 0.1934 s [cpu], 0.0077 s [gpu]
+
+```c++
+#include <torch/script.h>
+#include <torch/cuda.h> // CUDA-related helpers
+#include <iostream>
+#include <memory>
+#include <ctime> // for clock()
+clock_t t_start,t_end;
+int main(int argc, const char* argv[]) {
+  if (argc != 2) {
+    std::cerr << "usage: example-app <path-to-exported-script-module>\n";
+    return -1;
+  }
+
+  torch::DeviceType device_type = at::kCPU; // pick the device type
+  if (torch::cuda::is_available())
+      device_type = at::kCUDA;
+
+
+  torch::jit::script::Module model;
+  try {
+    model = torch::jit::load(argv[1]);
+  }
+  catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n"; return -1;
+  }
+  std::cout << "ok\n";
+
+  // Create a vector of inputs.
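+  // (note) GPU kernels launch asynchronously; for a fair GPU timing, synchronize
+  // (e.g. torch::cuda::synchronize()) before reading the clock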
+
+  // std::vector<torch::jit::IValue> inputs;
+  // inputs.push_back(torch::ones({1, 3, 224, 224}));
+
+  int nums = 100;
+  model.to(device_type);
+  // std::vector<torch::jit::IValue> inputs;
+  // inputs.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+  std::vector<std::vector<torch::jit::IValue>> inputs;
+  std::vector<torch::jit::IValue> inputs2;
+  for(int i=0;i<nums;i++){
+    inputs2.clear();
+    inputs2.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+    inputs.push_back(inputs2);
+  }
+
+  // time the forward passes (loop reconstructed -- the original lines were lost in formatting)
+  t_start = clock();
+  for(int i=0;i<nums;i++){
+    at::Tensor output = model.forward(inputs[i]).toTensor();
+  }
+  t_end = clock();
+  std::cout << "mean_time:" << (double)(t_end - t_start) / CLOCKS_PER_SEC / nums << std::endl;
+  return 0;
+}
+```
+
+
+
+# 3. torch-tensorrt
+
+> - [https://www.pytorch.org](https://www.pytorch.org/)
+> - https://developer.nvidia.com/cuda
+> - https://developer.nvidia.com/cudnn
+> - https://developer.nvidia.com/tensorrt
+
+
+
+```python
+from torchvision.models import resnet18
+import torch
+import torch_tensorrt
+
+model = resnet18().eval()  # torch module needs to be in eval (not training) mode
+
+inputs = [
+    torch_tensorrt.Input(
+        min_shape=[1, 3, 224, 224],
+        opt_shape=[1, 3, 256, 256],
+        max_shape=[1, 3, 320, 320],
+        dtype=torch.half,
+    )
+]
+enabled_precisions = {torch.float, torch.half}  # Run with fp16
+
+trt_ts_module = torch_tensorrt.compile(
+    model, inputs=inputs, enabled_precisions=enabled_precisions
+)
+
+input_data = torch.randn(1, 3, 224, 224)
+input_data = input_data.to("cuda").half()
+result = trt_ts_module(input_data)
+torch.jit.save(trt_ts_module, "trt_ts_module.ts")
+```
+
+```python
+# Deployment application
+import torch
+import torch_tensorrt
+
+trt_ts_module = torch.jit.load("trt_ts_module.ts")
+input_data = input_data.to("cuda").half()
+result = trt_ts_module(input_data)
+```
+
+
+
+---
+## Kaggle environment setup
+```python
+cuda-11.0 / !ls /usr/include/cudnn*
+gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
+cmake version 3.22.5
+GNU Make 4.2.1
+python3.8
+pytorch-1.11.0
+torchvision-0.12.0
+
+# torch-tensorrt==1.1.0
+pip3 install torch-tensorrt -f https://github.com/pytorch/TensorRT/releases
+
+# importing torch_tensorrt fails with the error below until the GPU build of libtorch is downloaded (https://pytorch.org/)
+# ImportError: libtorch_cuda_cu.so: cannot open shared object file: No such file or directory
+# https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.1%2Bcu113.zip
+```
+
+## [Example](https://developer.nvidia.com/blog/accelerating-inference-up-to-6x-faster-in-pytorch-with-torch-tensorrt/)
+
+To follow these steps you need the following resources:
+
+- a Linux machine with an NVIDIA GPU, compute architecture 7 or earlier
+- Docker installed, 19.03 or later
+- a Docker container with PyTorch, Torch-TensorRT and all dependencies, pulled from the [NGC catalog](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+
+Follow the instructions to run the Docker container tagged [nvcr.io/nvidia/pytorch:21.11-py3](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
+
+```python
+docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:21.11-py3
+# or
+nvidia-docker run -it --rm nvcr.io/nvidia/pytorch:21.11-py3
+```
+
+With a live bash terminal inside the Docker container, launch a JupyterLab instance to run the Python code:
+```python
+jupyter-notebook --ip 0.0.0.0 --port 8000
+jupyter-lab --allow-root --ip=0.0.0.0 --NotebookApp.token='TensorRT' --port 8888
+```
+
+```python
+# pip install timm
+
+import torch
+import torch_tensorrt
+import timm
+import time
+import numpy as np
+import torch.backends.cudnn as cudnn
+
+torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
+
+efficientnet_b0 = timm.create_model('efficientnet_b0',pretrained=True)
+
+model =efficientnet_b0.eval().to("cuda")
+detections_batch = model(torch.randn(128, 3, 224, 224).to("cuda"))
+detections_batch.shape
+
+# To benchmark this model through both PyTorch JIT and Torch-TensorRT AOT compilation, write a simple benchmark utility function:
+cudnn.benchmark = True
+
+def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000):
+    input_data = torch.randn(input_shape)
+    input_data = input_data.to("cuda")
+    if dtype=='fp16':
+        input_data = input_data.half()
+
+    print("Warm up ...")
+    with torch.no_grad():
+        for _ in \
range(nwarmup): + features = model(input_data) + torch.cuda.synchronize() + print("Start timing ...") + timings = [] + with torch.no_grad(): + for i in range(1, nruns+1): + start_time = time.time() + pred_loc = model(input_data) + torch.cuda.synchronize() + end_time = time.time() + timings.append(end_time - start_time) + if i%10==0: + print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000)) + + print("Input shape:", input_data.size()) + print('Average throughput: %.2f images/second'%(input_shape[0]/np.mean(timings))) + +# Inference using PyTorch and TorchScript +model = efficientnet_b0.eval().to("cuda") +benchmark(model, input_shape=(1, 3, 224, 224), nruns=100) + +""" +Start timing ... +Iteration 10/100, avg batch time 16.47 ms +Iteration 20/100, avg batch time 16.51 ms +Iteration 30/100, avg batch time 17.21 ms +Iteration 40/100, avg batch time 17.53 ms +Iteration 50/100, avg batch time 17.67 ms +Iteration 60/100, avg batch time 17.84 ms +Iteration 70/100, avg batch time 17.98 ms +Iteration 80/100, avg batch time 17.99 ms +Iteration 90/100, avg batch time 17.82 ms +Iteration 100/100, avg batch time 17.68 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 56.55 images/second + + +""" + +# 可以使用 TorchScript JIT 模块重复相同的步骤 +traced_model = torch.jit.trace(model, torch.randn((1,3,224,224)).to("cuda")) +torch.jit.save(traced_model, "efficientnet_b0_traced.jit.pt") +benchmark(traced_model, input_shape=(1, 3, 224, 224), nruns=100) + +""" +Start timing ... +Iteration 10/100, avg batch time 11.88 ms +Iteration 20/100, avg batch time 12.04 ms +Iteration 30/100, avg batch time 12.30 ms +Iteration 40/100, avg batch time 12.43 ms +Iteration 50/100, avg batch time 12.49 ms +Iteration 60/100, avg batch time 12.37 ms +Iteration 70/100, avg batch time 12.36 ms +Iteration 80/100, avg batch time 12.43 ms +Iteration 90/100, avg batch time 12.16 ms +Iteration 100/100, avg batch time 11.84 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 84.43 images/second + + +""" + + +# Inference using Torch-TensorRT +# 要使用 Torch-TensorRT 以混合精度编译模型,请运行以下命令 +trt_model = torch_tensorrt.compile(model, + inputs= [torch_tensorrt.Input((1, 3, 224, 224),dtype=torch.half)], + enabled_precisions= { torch.half} # Run with FP16 +) + +benchmark(trt_model, input_shape=(1, 3, 224, 224), nruns=100, dtype="fp16") +""" +dtype="fp32" +Iteration 10/100, avg batch time 9.29 ms +Iteration 20/100, avg batch time 9.24 ms +Iteration 30/100, avg batch time 9.26 ms +Iteration 40/100, avg batch time 9.28 ms +Iteration 50/100, avg batch time 9.27 ms +Iteration 60/100, avg batch time 9.28 ms +Iteration 70/100, avg batch time 9.14 ms +Iteration 80/100, avg batch time 9.03 ms +Iteration 90/100, avg batch time 9.01 ms +Iteration 100/100, avg batch time 9.03 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 110.70 images/second + +dtype="fp16" +Iteration 10/100, avg batch time 8.09 ms +Iteration 20/100, avg batch time 5.35 ms +Iteration 30/100, avg batch time 4.18 ms +Iteration 40/100, avg batch time 3.57 ms +Iteration 50/100, avg batch time 3.21 ms +Iteration 60/100, avg batch time 2.96 ms +Iteration 70/100, avg batch time 2.78 ms +Iteration 80/100, avg batch time 2.64 ms +Iteration 90/100, avg batch time 2.53 ms +Iteration 100/100, avg batch time 2.45 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 408.98 images/second +""" +``` + +### 基准测试结果 + +这是我在批量大小为 1 的 NVIDIA A100 GPU 上取得的结果。 + +![Torch 和 TensorRT 之间的吞吐量比较吞吐量是 4 
倍。](https://developer-blogs.nvidia.com/wp-content/uploads/2021/12/native-throughput-comparison-torch-tensorrt-625x433.png)
+
+*Figure 6. Throughput comparison of native PyTorch vs Torch-TensorRT on an NVIDIA A100 GPU, batch size 1*
+
+
+
+# TensorRT
+
+- https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#onnx-export
+
+For installation from the release package, see [here](https://note.youdao.com/old-web/#/file/WEB8c4a998486a217e1084f117df705f315/markdown/WEBa2da28f7fa10323b590866cd1f9a35e0/)
+
+pip install: `!pip install -U nvidia-tensorrt --index-url https://pypi.ngc.nvidia.com # install`
+
+## pytorch to onnx
+
+```python
+import torchvision.models as models
+
+resnext50_32x4d = models.resnext50_32x4d(pretrained=True)
+import torch
+
+BATCH_SIZE = 64
+dummy_input=torch.randn(BATCH_SIZE, 3, 224, 224)
+
+import torch.onnx
+torch.onnx.export(resnext50_32x4d, dummy_input, "resnet50_onnx_model.onnx", verbose=False,opset_version=11)
+```
+
+## [Converting ONNX into a TensorRT engine](https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#convert-onnx-engine)
+
+- Using trtexec
+
+```python
+/opt/TensorRT-7.2.3.4/bin/trtexec --onnx=resnet50_onnx_model.onnx --saveEngine=resnet_engine.trt  --fp16
+/opt/TensorRT-7.2.3.4/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --fp16 --workspace=64 --buildOnly
+/opt/TensorRT-7.2.3.4/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --best --workspace=64 --buildOnly # --minTiming=5 --avgTiming=10
+
+# --fp16       enable FP16 precision, in addition to FP32, for layers that support it
+# --int8       enable INT8 precision, in addition to FP32, for layers that support it
+# --best       enable all supported precisions to achieve the best performance for every layer
+# --workspace  maximum amount of persistent scratch memory (in MB) available to the builder's algorithms
+# --minShapes / --maxShapes  specify the dimension range of each network input; --optShapes specifies the dimensions the auto-tuner should optimize for
+# --buildOnly  skip the inference performance measurement
+# --tacticSources  add or remove tactics from the default tactic sources (cuDNN, cuBLAS and cuBLASLt)
+# --minTiming / --avgTiming  set the minimum and average number of iterations used in tactic selection
+```
+
+- Using the TensorRT API
+
+```python
+import torch
+import torchvision.models as models
+def torch2onnx(model:torch.nn.Module,x:torch.Tensor,save_path:str="./model.onnx"):
+    """Save a PyTorch model in .onnx format."""
+    # x = torch.rand([32,3,224,224])
+    model.eval()
+    # Export the model
+    torch.onnx.export(model,  # model being run
+                      x,  # model input (or a tuple for multiple inputs)
+                      save_path,  # where to save the model (can be a file or file-like object)
+                      verbose=True,
+                      export_params=True,  # store the trained parameter weights inside the model file
+                      opset_version=11,  # the ONNX version to export the model to
+                      do_constant_folding=True,  # whether to execute constant folding for optimization
+                      input_names=['input'],  # the model's input names
+                      output_names=['output'],  # the model's output names
+                      # dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
+                      #               'output': {0: 'batch_size'}}  # does not work in TensorRT -- keep this commented out
+                      )
+
+    # input_names = ["input"]
+    # output_names = ["output"]
+    #
+    # torch.onnx.export(model, x, save_path, verbose=True, opset_version=8, input_names=input_names,
+    #                   output_names=output_names)
+
+torch2onnx(models.resnet18(),torch.randn(64,3,224,224))
+```
+
+```python
+from __future__ import print_function
+
+import numpy as np
+import tensorrt as trt
+# import pycuda.driver as cuda
+# import pycuda.autoinit
+
+import os
+# import sys
+# sys.path.insert(1, os.path.join(sys.path[0], "."))
+# import common
+# import layers_trt as lytrt
+
+import time
+from functools import wraps, partial
+
+TRT_LOGGER = trt.Logger()
+
+EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+
+def GiB(val):
+    return val * 1 << 30
+
+
+def timeit(func):
+    @wraps(func)
+    def inner(*args, **kwargs):
+        start = time.time()
+        r = func(*args, **kwargs)
+        end = time.time()
+        
print("%s cost time: %s" % (func.__name__, end - start)) + return r + + return inner + + +class DefModelData: + PLUGIN_LIBRARY = None + BATCH_SIZE = 1 + MEM_SIZE = 1 << 28 # 256MiB ; 1 << 28/1024/1024=256 + # MEM_SIZE = GiB(2) # 1G + + DTYPE = trt.float16 + NP_DTYPE = np.float16 + INPUT_SHAPE = [1, 3, 32, 32] + OUTPUT_SIZE = [-1, 10] # [-1,10] + + onnx_file_path = "model.onnx" + engine_file_path = "model.trt" + + model_file_path = "model.npz" + INPUT_NAME = 'input' + OUTPUT_NAME = 'output' + + +def onnx2engine(ModelData=None): + """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" + if ModelData is None: ModelData = DefModelData + + onnx_file_path = ModelData.onnx_file_path + engine_file_path = ModelData.engine_file_path + + def build_engine(): + """Takes an ONNX file and creates a TensorRT engine to run inference with""" + with trt.Builder(TRT_LOGGER) as builder, builder.create_network( # common.EXPLICIT_BATCH + EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser: + builder.max_workspace_size = ModelData.MEM_SIZE + builder.max_batch_size = ModelData.BATCH_SIZE + + if ModelData.DTYPE == trt.float16: + builder.fp16_mode = True + + # Parse model file + if not os.path.exists(onnx_file_path): + print( + 'ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path)) + exit(0) + print('Loading ONNX file from path {}...'.format(onnx_file_path)) + with open(onnx_file_path, 'rb') as model: + print('Beginning ONNX file parsing') + if not parser.parse(model.read()): + print('ERROR: Failed to parse the ONNX file.') + for error in range(parser.num_errors): + print(parser.get_error(error)) + return None + # The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1 + # network.get_input(0).shape = [1, 3, 608, 608] + if 'INPUT_SHAPE' in ModelData.__dict__.keys(): + network.get_input(0).shape = ModelData.INPUT_SHAPE + + print('Completed parsing of ONNX file') + print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) + engine = builder.build_cuda_engine(network) + print("Completed creating Engine") + with open(engine_file_path, "wb") as f: + f.write(engine.serialize()) + return engine + + if os.path.exists(engine_file_path): + # If a serialized engine exists, use it instead of building an engine. 
+ print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + else: + return build_engine() + + +def onnx2trt(ModelData=None): + # from toolsmall.tools.speed.modelTansform import onnx2engine + + if ModelData is None: ModelData = DefModelData + onnx2engine(ModelData) + + +if __name__ == "__main__": + ModelData = DefModelData + ModelData.BATCH_SIZE = 1 + ModelData.INPUT_SHAPE = [64, 3, 224, 224] + ModelData.OUTPUT_SIZE = [-1, 1000] + onnx2trt(ModelData) + +``` + +## 运行engine + +`/opt/TensorRT-7.2.3.4/samples/python/common.py` + +```python +from __future__ import print_function + +import numpy as np +import tensorrt as trt +# import pycuda.driver as cuda +# import pycuda.autoinit + +import os +# import sys +# sys.path.insert(1, os.path.join(sys.path[0], ".")) +import common +# import layers_trt as lytrt + +import time +from functools import wraps, partial + +TRT_LOGGER = trt.Logger() + + +def timeit(func): + @wraps(func) + def inner(*args, **kwargs): + start = time.time() + r = func(*args, **kwargs) + end = time.time() + print("%s cost time: %s" % (func.__name__, end - start)) + return r + + return inner + + +class DefModelData: + PLUGIN_LIBRARY = None + BATCH_SIZE = 1 + MEM_SIZE = 1 << 28 # 256MiB ; 1 << 28/1024/1024=256 + # MEM_SIZE = common.GiB(2) # 1G + + DTYPE = trt.float16 + NP_DTYPE = np.float16 + INPUT_SHAPE = [1, 3, 32, 32] + OUTPUT_SIZE = [-1, 10] # [-1,10] + + onnx_file_path = "model.onnx" + engine_file_path = "model.trt" + + model_file_path = "model.npz" + INPUT_NAME = 'input' + OUTPUT_NAME = 'output' + + +def loadEngine(engine_file_path: str = "./model.engine"): + # If a serialized engine exists, use it instead of building an engine. 
+ print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + +@timeit +def runEngineInfer(data=np.ones([1, 3, 32, 32]), ModelData=None): + if ModelData is None: ModelData = DefModelData + engine_file_path = ModelData.engine_file_path + with loadEngine(engine_file_path) as engine, engine.create_execution_context() as context: + inputs, outputs, bindings, stream = common.allocate_buffers(engine) + + len_data = len(data) + data = data.ravel().astype(ModelData.NP_DTYPE) # 展成一行 + np.copyto(inputs[0].host, data) + + # [output] = common.do_inference(context, bindings=bindings, \ + # inputs=inputs, outputs=outputs, stream=stream, \ + # batch_size=ModelData.BATCH_SIZE) + + [output] = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, + stream=stream) + + output = np.reshape(output, ModelData.OUTPUT_SIZE)[:len_data] # 转成[-1,10] + pred = np.argmax(output, -1) + print("Prediction: " + str(pred)) + + +if __name__ == "__main__": + ModelData = DefModelData + ModelData.BATCH_SIZE = 64 + ModelData.INPUT_SHAPE = [64, 3, 224, 224] + ModelData.OUTPUT_SIZE = [-1, 1000] + ModelData.engine_file_path = "model.trt" + runEngineInfer(data=np.random.randn(64, 3, 224, 224), ModelData=ModelData) + +``` + +## test + +- 0.8920[cpu];0.0841[cuda] + +```python +import torch +import torchvision.models as models +import torch.backends.cudnn as cudnn +import time + +cudnn.benchmark = True +device = "cpu" # "cuda" # 0.8920321515493561;0.08411713584093378 +nums = 100 +nwarmup = 5 +model = models.resnet18().eval().to(device) +datas = torch.randn(64, 3, 224, 224).to(device) +print("Warm up ...") +with torch.no_grad(): + for _ in range(nwarmup): + features = model(datas) + +start = time.perf_counter() +with torch.no_grad(): + for _ in range(100): + datas = torch.randn(64, 3, 224, 224).to(device) + output = model(datas) + pred = output.argmax(-1) + +end = time.perf_counter() +print(f"mean_time:{((end - start) / nums):.5f}") + +``` + +- mean_time:0.04328 + +```python +from __future__ import print_function + +import time +import numpy as np +import tensorrt as trt +# import pycuda.driver as cuda +# import pycuda.autoinit + +import os +# import sys +# sys.path.insert(1, os.path.join(sys.path[0], ".")) +import common +# import layers_trt as lytrt + +import time +from functools import wraps, partial + +TRT_LOGGER = trt.Logger() + + +def timeit(func): + @wraps(func) + def inner(*args, **kwargs): + start = time.time() + r = func(*args, **kwargs) + end = time.time() + print("%s cost time: %s" % (func.__name__, end - start)) + return r + + return inner + + +class DefModelData: + PLUGIN_LIBRARY = None + BATCH_SIZE = 1 + MEM_SIZE = 1 << 28 # 256MiB ; 1 << 28/1024/1024=256 + # MEM_SIZE = common.GiB(2) # 1G + + DTYPE = trt.float16 + NP_DTYPE = np.float16 + INPUT_SHAPE = [1, 3, 32, 32] + OUTPUT_SIZE = [-1, 10] # [-1,10] + + onnx_file_path = "model.onnx" + engine_file_path = "model.trt" + + model_file_path = "model.npz" + INPUT_NAME = 'input' + OUTPUT_NAME = 'output' + + +def loadEngine(engine_file_path: str = "./model.engine"): + # If a serialized engine exists, use it instead of building an engine. 
+ print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + +@timeit +def runEngineInfer(data=np.ones([1, 3, 32, 32]), ModelData=None): + if ModelData is None: ModelData = DefModelData + engine_file_path = ModelData.engine_file_path + with loadEngine(engine_file_path) as engine, engine.create_execution_context() as context: + inputs, outputs, bindings, stream = common.allocate_buffers(engine) + len_data = len(data) + nums = 100 + nwarmup = 5 + print("Warm up ...") + for _ in range(nwarmup): + data = data.ravel().astype(ModelData.NP_DTYPE) # 展成一行 + np.copyto(inputs[0].host, data) + [output] = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, + stream=stream) + output = np.reshape(output, ModelData.OUTPUT_SIZE)[:len_data] # 转成[-1,10] + pred = np.argmax(output, -1) + # print("Prediction: " + str(pred)) + + start = time.perf_counter() + for _ in range(nums): + data = data.ravel().astype(ModelData.NP_DTYPE) # 展成一行 + np.copyto(inputs[0].host, data) + [output] = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, + stream=stream) + output = np.reshape(output, ModelData.OUTPUT_SIZE)[:len_data] # 转成[-1,10] + pred = np.argmax(output, -1) + # print("Prediction: " + str(pred)) + end = time.perf_counter() + print(f"mean_time:{((end - start) / nums):.5f}") + + +if __name__ == "__main__": + ModelData = DefModelData + ModelData.BATCH_SIZE = 64 + ModelData.INPUT_SHAPE = [64, 3, 224, 224] + ModelData.OUTPUT_SIZE = [-1, 1000] + ModelData.engine_file_path = "model.trt" + runEngineInfer(data=np.random.randn(64, 3, 224, 224), ModelData=ModelData) + +``` + diff --git a/readme.md b/readme.md index c08534f00bdab05854942b59092c94700e6e8c69..5e2090fd94ed6d23d9b88a341802e7e90daf99b8 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,6 @@ - https://www.cvmart.net/community/detail/7040 - https://www.cvmart.net/community/detail/5609 +- https://github.com/pytorch/TensorRT # 1、训练模型 python训练 (略过) # 2、模型推理部署 diff --git a/readme_CN.md b/readme_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..a3f0f002f2eb646a09b8ae6204bc8928428d83e1 --- /dev/null +++ b/readme_CN.md @@ -0,0 +1,330 @@ +- https://pytorch.org/cppdocs/ +- https://pytorch.org/get-started/locally/ # 下载 libtorch库 + +- [LibTorch的安装与基本使用](https://zhuanlan.zhihu.com/p/513571175) + +- https://docs.openvino.ai/2023.3/openvino_docs_install_guides_installing_openvino_apt.html # openvino c++安装 + +- https://github.com/openvinotoolkit/openvino + +- https://github.com/openvinotoolkit/openvino_notebooks + +- https://github.com/microsoft/onnxruntime + +- https://github.com/microsoft/onnxruntime-inference-examples + +- https://github.com/pytorch/TensorRT + +- https://github.com/NVIDIA/TensorRT + +- https://github.com/onnx/onnx-tensorrt + +- https://github.com/wang-xinyu/tensorrtx + + + +# 1、安装 opencv(c++) + +- [Linux安装Opencv(C++)](https://blog.csdn.net/weixin_44384491/article/details/121142093) + +# 2、安装libtorch + +下载的libtorch的版本最好和你的pytorch的版本一致。Linux下各个libtorch的release版本的下载链接可以在下面这篇文章中找到: + +## 2.2.1-cu118 + +>Download here (Pre-cxx11 ABI): +>https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.2.1%2Bcu118.zip +>Download here (cxx11 ABI): +>https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu118.zip + +## 1.11.0-cu115 + +>https://download.pytorch.org/libtorch/cu115 + +## 2.2.1-cpu + +>Download here 
(Pre-cxx11 ABI): +>https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-2.2.1%2Bcpu.zip +>Download here (cxx11 ABI): +>https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcpu.zip + + + +下载完成后,随便丢到一个地方去解压,完成。比如我是习惯性放在/usr/local/lib下的。我也建议linux小白将libtorch放在 /usr/local/lib下,并保证libtorch文件夹下存在include这个文件夹。 + +## 配置CMakeLists.txt + + + +```py +## 目录结构 +xxxx + - CMakeLists.txt + - digit.cpp + - digit.py +``` + + + +```cmake +# CMakeLists.txt + +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +project(LibTorchDemo) + +# compile options +set(CMAKE_CXX_FLAGS_RELEASE "-O3") +set(CMAKE_CXX_STANDARD 14) + +# package +find_package(OpenCV REQUIRED) +find_package(Torch REQUIRED PATHS "/usr/local/lib/libtorch") + +add_executable(digit digit.cpp) +# libtorch +target_link_libraries(digit ${TORCH_LIBRARIES}) +target_link_libraries(digit ${OpenCV_LIBS}) +``` + +> "/usr/local/lib/libtorch" :Torch的package路径根据你的安装目录指定 + +然后写个文件include一下库,文件名为digit.cpp: + +```c++ +#include "iostream" +#include "opencv2/opencv.hpp" +#include "torch/script.h" + +int main(int argc, char const *argv[]) +{ + std::cout << "hello world!" << std::endl; + return 0; +} +``` + +编译一把: + +```bash +$mkdir build +$cd build +$cmake .. && make -j8 install +``` + +> 如果出现错误,基本都是找不到头文件或者静态库,如果找不到头文件,在CMakeLists.txt中include_directories()中添加能够搜索到你在cpp中写的相对路径的根目录路径。如果静态库找不到,请检查安装包是否损坏,或者静态库目录是否在gcc的搜索路径中。 + +## 第一步:先用PyTorch训练一个网络 + +既然我们需要将PyTorch模型使用C++部署,那么首先肯定需要一个Torch的模型。我们先使用PyTorch简单训练一个手写数字识别,相信看这篇文章的靓仔都是torch老手了,我直接上代码: + +> 如果你已经有一个模型文件了,请直接跳转到第二步 + +```python +from sklearn.datasets import load_digits +import torch +from torch import nn +import torch.utils.data as Data +import numpy as np +from sklearn.metrics import accuracy_score +import matplotlib.pyplot as plt +import os + +class Digit(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(1, 16, 3, 1, 1), + nn.Tanh(), + nn.Conv2d(16, 32, 3, 2, 1), + nn.Tanh(), + nn.Conv2d(32, 16, 3, 2, 1), + nn.Tanh(), + nn.Conv2d(16, 8, 3, 1, 1) + ) + + self.output = nn.Linear(32, 10) + + def forward(self, x): + out = self.conv(x) + out = self.output(out.flatten(1)) + return out + +RATIO = 0.8 +BATCH_SIZE = 128 +EPOCH = 10 + +if __name__ == "__main__": + X, y = load_digits(return_X_y=True) + X = X / 16. 
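+    # load_digits pixels are integers in [0, 16], so dividing by 16 rescales them to [0, 1]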
+ sample_num = len(y) + X = [x.reshape(1, 8, 8).tolist() for x in X] + + indice = np.arange(sample_num) + np.random.shuffle(indice) + + X = torch.FloatTensor(X) + y = torch.LongTensor(y) + offline = int(sample_num * RATIO) + + train = Data.TensorDataset(X[indice[:offline]], y[indice[:offline]]) + test = Data.TensorDataset(X[indice[offline:]], y[indice[offline:]]) + + train_loader = Data.DataLoader(train, BATCH_SIZE, True) + test_loader = Data.DataLoader(test, BATCH_SIZE, False) + + model = Digit() + optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3) + criterion = nn.CrossEntropyLoss(reduction="mean") + + test_losses = [] + test_accs = [] + + for epoch in range(EPOCH): + model.train() + for bx, by in train_loader: + out = model(bx) + loss = criterion(out, by) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + model.eval() + correct = 0 + total = 0 + test_loss = [] + test_acc = [] + for bx, by in test_loader: + with torch.no_grad(): + out = model(bx) + pre_lab = out.argmax(1) + loss = criterion(out, by) + + test_loss.append(loss.item()) + test_acc.append(accuracy_score(pre_lab, by)) + + test_losses.append(np.mean(test_loss)) + test_accs.append(np.mean(test_acc)) + + plt.figure(dpi=120) + plt.plot(test_losses, 'o-', label="loss") + plt.plot(test_accs, 'o-', label="accuracy") + plt.legend() + plt.grid() + plt.show() + + if not os.path.exists("model"): + os.makedirs("model") + torch.save(model.state_dict(), "model/digit.pth") +``` + +## 第二步:使用tracing将模型文件转化成TorchScript + +PyTorch导出的模型文件是不能直接被libtorch读取的,因为PyTorch默认导出的后端的序列化是joblib。PyTorch通过JIT搭建了Python和C++的桥梁,我们可以将模型转成TorchScript Module,将Python运行时的部分运行时包裹进去。 + +转换方法非常简单: + +```py +import torch +from digit import Digit + +model = Digit() +model.load_state_dict(torch.load("model/digit.pth", map_location="cpu")) + +sample = torch.randn(1, 1, 8, 8) + +trace_model = torch.jit.trace(model, sample) +trace_model.save("model/digit.jit") +``` + +运行下述测试代码,由于Python本身的特性和JIT的即时编译的特性,模型在同一进程生命周期内运行时前几次会比较慢,所以在测试前,需要空跑几次: + +## 第三步:使用libtorch重写推理程序 + +由于TorchScript可以被C++直接调用,所以我们只需要使用libtorch重写推理代码,并将模型读入就完成了。 + +libtorch的语法和PyTorch基本一致,学起来很快,于此锦恢就不再赘述了。相应的,在C++中,我们用cv::Mat来取代Python中的numpy.ndarray对象,如何将cv::Mat转成libtorch可以读入的数据结构也会在demo中涉及。 + +下面的例子会完成一个C++命令行程序,它的第一个参数为模型,第二个参数为需要读入的手写数字图像的路径,预测结果会打印到控制台上。期待已久的C++代码如下: + +```c +#include "iostream" +#include "opencv2/opencv.hpp" +#include "torch/script.h" +#include "fstream" + +void checkPath(const char* path) { + std::ifstream in; + in.open(path); + bool flag = (bool)in; + in.close(); + if (flag) return; + else { + std::cout << "file " << path << " doesn't exist!" << std::endl; + exit(-1); + } +} + +int main(int argc, char const *argv[]) +{ + if (argc != 3) { + std::cout << "usage : digit " << std::endl; + return -1; + } + + checkPath(argv[1]); + checkPath(argv[2]); + cv::Mat img = cv::imread(argv[2]), gimg, fimg, rimg; + cv::cvtColor(img, gimg, CV_BGR2GRAY); + + gimg.convertTo(fimg, CV_32F, - 1. 
/ 255., 1.);
+    cv::resize(fimg, rimg, {8, 8});
+
+    // convert Mat to tensor
+    at::Tensor img_tensor = torch::from_blob(
+        rimg.data,
+        {1, 1, 8, 8},
+        torch::kFloat32
+    );
+
+    // load model
+    torch::jit::Module model = torch::jit::load(argv[1]);
+
+    // torch.no_grad()
+    torch::NoGradGuard no_grad; // 请一定加入torch::NoGradGuard no_grad; 这句话,否则内存会炸。
+
+    // forward
+    torch::Tensor out = model.forward({img_tensor}).toTensor();
+    int pre_lab = torch::argmax(out, 1).item().toInt();
+
+    std::cout << "predict number is " << pre_lab << std::endl;
+    return 0;
+}
+```
+
+
+
+# 实践
+
+```py
+# 安装环境依赖
+sudo apt-get update -y
+sudo apt-get install cmake -y
+sudo apt-get install build-essential libgtk2.0-dev libavcodec-dev libavformat-dev libjpeg-dev libswscale-dev libtiff5-dev -y
+sudo apt-get install libgtk2.0-dev -y
+sudo apt-get install pkg-config -y
+
+# 安装opencv
+wget https://github.com/opencv/opencv/archive/4.9.0.zip
+unzip 4.9.0.zip
+cd opencv-4.9.0
+mkdir build
+cd build
+cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local ..
+make -j8
+make install
+
+# 安装libtorch
+wget https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip
+unzip libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip
+cp -r libtorch /usr/local/lib
+```
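+
+With the dependencies installed, the demo can be configured against the unpacked libtorch either through the `PATHS` hint already present in demo/CMakeLists.txt, or explicitly via `CMAKE_PREFIX_PATH` (a minimal sketch):
+
+```bash
+cd demo
+mkdir build && cd build
+# point find_package(Torch) at the unpacked libtorch
+cmake -DCMAKE_PREFIX_PATH=/usr/local/lib/libtorch ..
+make -j4
+```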