update

b1b3c9d9 · 风吴痕 · 37d936fc · b1b3c9d9 · b1b3c9d9 · b1b3c9d9
14 changed file
--- a/demo/CMakeLists.txt
+++ b/demo/CMakeLists.txt
+# CMakeLists.txt
+# Builds the `digit` demo executable and links it against LibTorch and OpenCV.
+
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(LibTorchDemo)
+
+# compile options
+# CMAKE_CXX_FLAGS_RELEASE only takes effect for Release builds, so default to
+# Release when the user runs a plain `cmake ..` with a single-config generator.
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_STANDARD 14)
+# Fail at configure time instead of silently falling back to an older standard.
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# package
+find_package(OpenCV REQUIRED)
+find_package(Torch REQUIRED PATHS "/usr/local/lib/libtorch") # expects libtorch unpacked under /usr/local/lib
+
+add_executable(digit digit.cpp)
+# libtorch
+target_link_libraries(digit ${TORCH_LIBRARIES})
+target_link_libraries(digit ${OpenCV_LIBS})
\ No newline at end of file
--- a/demo/convert2jit.py
+++ b/demo/convert2jit.py
+import torch
+from digit import Digit
+
+model = Digit()
+model.load_state_dict(torch.load("model/digit.pth", map_location="cpu"))
+
+sample = torch.randn(1, 1, 8, 8)
+
+trace_model = torch.jit.trace(model, sample)
+trace_model.save("model/digit.jit")
\ No newline at end of file
--- a/demo/digit.cpp
+++ b/demo/digit.cpp
+#include "iostream"
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+#include "fstream"
+
+/// Verifies that `path` can be opened for reading.
+/// Returns normally when the open succeeds; otherwise prints a message to
+/// stdout and terminates the process with exit code -1.
+void checkPath(const char* path) {
+    std::ifstream probe(path);
+    const bool readable = static_cast<bool>(probe);
+    probe.close();
+    if (!readable) {
+        std::cout << "file " << path << " doesn't exist!" << std::endl;
+        exit(-1);
+    }
+}
+
+/// Classifies a digit image with a TorchScript model.
+///
+/// Usage: digit <model path> <image path>
+/// Returns 0 on success, -1 on bad usage, an undecodable image, or a model
+/// that cannot be loaded.
+int main(int argc, char const *argv[])
+{
+    if (argc != 3) {
+        std::cout << "usage : digit <model path> <image path>" << std::endl;
+        return -1;
+    }
+
+    checkPath(argv[1]);
+    checkPath(argv[2]);
+
+    // imread reports failure by returning an empty Mat, e.g. when the file
+    // exists but is not a decodable image; cvtColor would then throw.
+    cv::Mat img = cv::imread(argv[2]);
+    if (img.empty()) {
+        std::cerr << "failed to decode image " << argv[2] << std::endl;
+        return -1;
+    }
+
+    cv::Mat gimg, fimg, rimg;
+    // Use the C++ enum; the legacy CV_BGR2GRAY macro needs the old C headers.
+    cv::cvtColor(img, gimg, cv::COLOR_BGR2GRAY);
+
+    // out = 1 - in / 255 : scale to [0, 1] and invert.
+    gimg.convertTo(fimg, CV_32F, -1. / 255., 1.);
+    cv::resize(fimg, rimg, {8, 8});
+
+    // convert Mat to tensor (the tensor views rimg's buffer, so rimg must stay alive)
+    at::Tensor img_tensor = torch::from_blob(
+        rimg.data,
+        {1, 1, 8, 8},
+        torch::kFloat32
+    );
+
+    // load model
+    torch::jit::Module model;
+    try {
+        model = torch::jit::load(argv[1]);
+    } catch (const c10::Error& e) {
+        std::cerr << "failed to load model " << argv[1] << ": " << e.what() << std::endl;
+        return -1;
+    }
+    model.eval();
+
+    // Equivalent of torch.no_grad(): without this guard autograd state is
+    // recorded during inference and memory usage grows.
+    torch::NoGradGuard no_grad;
+
+    // forward
+    torch::Tensor out = model({img_tensor}).toTensor();
+    int pre_lab = torch::argmax(out, 1).item().toInt();
+
+    std::cout << "predict number is " << pre_lab << std::endl;
+    return 0;
+}
\ No newline at end of file
--- a/demo/digit.py
+++ b/demo/digit.py
+from sklearn.datasets import load_digits
+import torch
+from torch import nn
+import torch.utils.data as Data
+import numpy as np
+from sklearn.metrics import accuracy_score
+import matplotlib.pyplot as plt
+import os
+
+class Digit(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(1, 16, 3, 1, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 32, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(32, 16, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 8, 3, 1, 1)
+        )
+
+        self.output = nn.Linear(32, 10)
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.output(out.flatten(1))
+        return out
+
+RATIO = 0.8
+BATCH_SIZE = 128
+EPOCH = 10
+
+if __name__ == "__main__":
+    X, y = load_digits(return_X_y=True)
+    X = X / 16.
+    sample_num = len(y)
+    X = [x.reshape(1, 8, 8).tolist() for x in X]
+
+    indice = np.arange(sample_num)
+    np.random.shuffle(indice)
+
+    X = torch.FloatTensor(X)
+    y = torch.LongTensor(y)
+    offline = int(sample_num * RATIO)
+
+    train = Data.TensorDataset(X[indice[:offline]], y[indice[:offline]])
+    test  = Data.TensorDataset(X[indice[offline:]], y[indice[offline:]])
+
+    train_loader = Data.DataLoader(train, BATCH_SIZE, True)
+    test_loader  = Data.DataLoader(test,  BATCH_SIZE, False)
+    
+    model = Digit()
+    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
+    criterion = nn.CrossEntropyLoss(reduction="mean")
+
+    test_losses = []
+    test_accs = []
+
+    for epoch in range(EPOCH):
+        model.train()
+        for bx, by in train_loader:
+            out = model(bx)
+            loss = criterion(out, by)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        model.eval()
+        correct = 0
+        total = 0
+        test_loss = []
+        test_acc = []
+        for bx, by in test_loader:
+            with torch.no_grad():
+                out = model(bx)
+                pre_lab = out.argmax(1)
+                loss = criterion(out, by)
+
+            test_loss.append(loss.item())
+            test_acc.append(accuracy_score(pre_lab, by))
+
+        test_losses.append(np.mean(test_loss))
+        test_accs.append(np.mean(test_acc))
+    
+    plt.figure(dpi=120)
+    plt.plot(test_losses, 'o-', label="loss")
+    plt.plot(test_accs, 'o-', label="accuracy")
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+    if not os.path.exists("model"):
+        os.makedirs("model")
+    torch.save(model.state_dict(), "model/digit.pth")
\ No newline at end of file
--- a/demo/digit_test.cpp
+++ b/demo/digit_test.cpp
+#include "iostream"
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+
+/// Smoke test: prints a greeting so a successful build and link can be confirmed.
+int main(int argc, char const *argv[])
+{
+    const char* greeting = "hello world!";
+    std::cout << greeting << std::endl;
+    return 0;
+}
\ No newline at end of file
--- a/demo/image/sample.png
+++ b/demo/image/sample.png
--- a/demo/readme.md
+++ b/demo/readme.md
+# 使用
+
+```py
+# 测试
+1、将 digit_test.cpp 改成 digit.cpp
+2、mkdir build && cd  build
+3、cmake .. && make -j4
+4、./digit
+
+# libtorch加载模型
+1、python digit.py # 训练一个原生pytorch模型
+2、python convert2jit.py # pytorch模型 转成 jit模型
+3、python test_jit.py # 测试 jit模型
+4、mkdir build && cd  build
+5、cmake .. && make -j4
+6、./digit ../model/digit.jit ../image/sample.png # 此时位于 build 目录，模型和图片在上一级目录
+```
+
--- a/demo/test_jit.py
+++ b/demo/test_jit.py
+import time
+import torch
+import cv2 as cv
+from digit import Digit
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+
+def run_model(model, image):
+    s = time.time()
+    out = model(image)
+    pre_lab = torch.argmax(out, dim=1)
+    cost_time = round(time.time() - s, 5)
+    return cost_time
+
+image = cv.imread("image/sample.png")
+image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
+image = 1 - image / 255.
+image = cv.resize(image, (8, 8))
+
+
+image = torch.FloatTensor(image).unsqueeze(0).unsqueeze(0).contiguous()
+origin_model = Digit()
+origin_model.load_state_dict(torch.load("model/digit.pth"))
+jit_model = torch.jit.load("model/digit.jit")
+
+# init jit
+for _ in range(3):
+    run_model(origin_model, image)
+    run_model(jit_model, image)
+
+test_times = 10
+
+# begin testing
+results = pd.DataFrame({
+    "type" : ["orgin"] * test_times + ["jit"] * test_times,
+    "cost_time" : [run_model(origin_model, image) for _ in range(test_times)] + [run_model(jit_model, image) for _ in range(test_times)]
+})
+
+plt.figure(dpi=120)
+sns.boxplot(
+    x=results["type"],
+    y=results["cost_time"]
+)
+plt.show()
\ No newline at end of file
--- a/doc/libtorch.md
+++ b/doc/libtorch.md
+- https://pytorch.org/tutorials/advanced/cpp_export.html
+- [LibTorch的安装、配置与使用](https://blog.csdn.net/weixin_45632168/article/details/114679263)
+- [libtorch c++调用 （五）Linux下的调用](https://blog.csdn.net/juluwangriyue/article/details/108463026)
+- https://pytorch.org/cppdocs/
+- [libtorch教程](https://www.zhihu.com/column/c_1373368181138972672)
+- [VS2019 配置 LibTorch 和 OpenCV](https://zhuanlan.zhihu.com/p/375084412)
+
+[toc]
+
+# 环境
+
+```python
+Ubuntu 9.4.0-1ubuntu1~20.04.1
+python3.7
+pytorch-1.11.0
+cuda-11.0
+libtorch-1.12-cuda113
+gcc version 9.4.0
+cmake version 3.22.5
+GNU Make 4.2.1
+
+# ---------------------------------------
+centos7
+python3.7
+cuda-10.2
+cudnn-10.2-linux-x64-v8.1.0.77
+torch-1.11.0+cu102-cp37-cp37m-linux_x86_64
+torchvision-0.12.0+cu102-cp37-cp37m-linux_x86_64
+libtorch-shared-with-deps-1.12.1+cu102
+onnxruntime-linux-x64-1.12.1
+cmake version 3.14.5
+GNU Make 3.82
+gcc version 8.3.1 20190311
+```
+
+
+
+# 下载 libtorch
+
+- https://pytorch.org/
+
+```python
+# cuda113-linux (需要安装 cuda-11.3 以及对应版本的 cudnn-8.0)
+https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.1%2Bcu113.zip
+# cpu-linux
+https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.12.1%2Bcpu.zip
+```
+
+# 将Pytorch模型转化为Torch Script
+
+```python
+import torch 
+import torchvision 
+# An instance of your model.
+model = torchvision.models.resnet18() 
+# An example input you would normally provide to your model's forward() method.
+example = torch.rand(1, 3, 224, 224) 
+# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing. 
+traced_script_module = torch.jit.trace(model, example) 
+traced_script_module.save("traced_resnet_model.pt") 
+```
+
+# 在C++中加载Model
+
+```c
+#include<iostream>
+#include<torch/script.h>
+#include <torch/torch.h> // cuda相关函数头文件
+#include<memory>
+
+/// Loads a TorchScript module from argv[1], runs it once on an all-ones
+/// 1x3x224x224 input and prints the first five output values.
+/// Returns 0 on success, -1 on bad usage or when the module cannot be loaded.
+int main(int argc, const char* argv[]) {
+    if (argc != 2) {
+        std::cerr << "usage: example-app <path-to-exported-script-module>\n";
+        return -1;
+    }
+
+    // Prefer the GPU when one is available, otherwise stay on the CPU.
+    const torch::DeviceType device_type =
+        torch::cuda::is_available() ? at::kCUDA : at::kCPU;
+
+    torch::jit::script::Module model;
+    try {
+        // Deserialize the ScriptModule from a file using torch::jit::load().
+        model = torch::jit::load(argv[1]);
+    }
+    catch (const c10::Error& e) {
+        // Include the library's message so the failure can be diagnosed.
+        std::cerr << "error loading the model\n" << e.what() << '\n';
+        return -1;
+    }
+    std::cout << "ok\n";
+
+    model.to(device_type);
+    model.eval();
+    // Inference only: do not record autograd state.
+    torch::NoGradGuard no_grad;
+
+    // Create a vector of inputs on the same device as the model.
+    std::vector<torch::jit::IValue> inputs;
+    inputs.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+
+    // Execute the model and turn its output into a tensor.
+    at::Tensor output = model.forward(inputs).toTensor();
+    std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';
+    return 0;
+}
+
+```
+
+## 结合opencv
+
+- https://blog.csdn.net/mmmkl1/article/details/118522533
+- https://github.com/qubvel/segmentation_models.pytorch
+
+```c++
+#include "DemoPytorch.h"
+#include <iostream>
+#include <memory>
+#include <algorithm>
+#include <stdio.h>
+#include <opencv2/core.hpp>
+#include <opencv2/opencv.hpp>
+#include <torch/torch.h>
+#include <torch/script.h>
+
+/// Runs a TorchScript segmentation model on one grayscale image and shows the
+/// input and the predicted mask in OpenCV windows.
+/// Returns 0 on success, -1 when the model or the image cannot be loaded.
+int main() {
+	// load model
+	torch::jit::script::Module module;
+	try {
+		module = torch::jit::load("./torch_script_eval.pt");
+		module.to(torch::kCPU);  // set model to cpu mode
+		/*module.to(torch::kCUDA);*/  // set model to cuda mode
+		module.eval();
+		std::cout << "MODEL LOADED";
+	}
+	catch (const c10::Error& e) {
+		std::cerr << "error loading the model\n";
+		return -1;  // there is nothing to run without a model
+	}
+
+	// load img (flag 0 = read as single-channel grayscale)
+	cv::Mat img_original = cv::imread("./00011584_002.png",0);
+	if (img_original.empty()) {
+		// imread returns an empty Mat on failure instead of throwing
+		std::cerr << "error loading the image\n";
+		return -1;
+	}
+	// normalize
+	cv::Mat img;
+	cv::resize(img_original, img, cv::Size(512, 512));
+	img.convertTo(img, CV_32FC1);
+	// img to tensor (the tensor views img's buffer, so img must stay alive)
+	torch::Tensor mean = torch::tensor({ 0.485,0.456,0.406 });
+	torch::Tensor std = torch::tensor({ 0.229, 0.224, 0.225 });
+	auto input_tensor = torch::from_blob(img.data, { 512,512,1 });
+	input_tensor = input_tensor / 255.0f;
+	// NOTE(review): mean/std hold 3 values while the blob has 1 channel, so
+	// broadcasting expands the last dimension to 3 — confirm this is intended.
+	input_tensor = input_tensor - mean;
+	input_tensor = input_tensor / std;
+	input_tensor = input_tensor.permute({ 2,0,1 });
+	input_tensor = input_tensor.to(torch::kCPU);
+	/*input_tensor = input_tensor.to(torch::kCUDA);*/
+	input_tensor = input_tensor.unsqueeze(0);
+	std::vector<torch::jit::IValue> input;
+	input.push_back(input_tensor);
+	// pred begin (inference only, so autograd recording is disabled)
+	torch::NoGradGuard no_grad;
+	auto pred = module.forward(input).toTensor();
+	// pred tensor to mat
+	pred = pred.squeeze().detach();
+	pred = pred * 255;
+	pred = pred.to(torch::kU8);
+	// The Mat below reads the raw buffer row by row, so it must be dense.
+	pred = pred.to(torch::kCPU).contiguous();
+	// assumes the model output squeezes to 512x512 — TODO confirm for the model in use
+	cv::Mat output_mat(cv::Size{ 512,512 }, CV_8UC1, pred.data_ptr());
+	// show result
+	cv::imshow("original img", img_original);
+	cv::imshow("mask", output_mat);
+	cv::waitKey(0);
+	cv::destroyWindow("original img");
+	cv::destroyWindow("mask");
+
+	return 0;
+}
+
+```
+
+
+
+# test
+
+```cpp
+//%%file main.cpp
+
+#include <iostream>
+#include <torch/torch.h>
+ 
+using namespace std;
+ 
+/// Creates a 3x3 identity tensor on the CUDA device and prints it.
+int main()
+{
+    // torch::Tensor tensor = torch::eye(3);
+    const auto device = at::kCUDA;
+    torch::Tensor tensor = torch::eye(3).to(device);  // move the data onto the GPU
+    std::cout << tensor << std::endl;
+    std::cout << "Hello World!" << std::endl;
+    return 0;
+}
+
+
+// 测试gpu是否可以使用
+
+#include<iostream>
+#include<torch/script.h>
+#include <torch/torch.h> // cuda相关函数头文件
+#include<memory>
+ 
+/// Prints whether LibTorch reports CUDA as available.
+int main()
+{
+    const bool has_cuda = torch::cuda::is_available();
+    std::cout << "cuda::is_available():" << has_cuda << std::endl;
+    // system("pause");
+    return 0;
+}
+```
+
+```makefile
+#%%file CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.5)
+ 
+project(libtorch_demo LANGUAGES CXX)
+
+# packages
+#find_package(CUDA)
+# nvcc flags
+#set(CUDA_NVCC_FLAGS -gencode arch=compute_20,code=sm_20;-G;-g)
+#set(CUDA_NVCC_FLAGS -gencode;arch=compute_60,code=sm_60;-G;-g)
+
+# A single language standard for the whole project: a global value of 11
+# combined with a per-target override to 17 is contradictory, and only the
+# target value (17) would ever be used for libtorch_demo.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+ 
+# Location of TorchConfig.cmake inside the unpacked libtorch archive.
+set(Torch_DIR /kaggle/working/libtorch/share/cmake/Torch)
+find_package(Torch REQUIRED)
+ 
+add_executable(libtorch_demo main.cpp)
+target_link_libraries(libtorch_demo "${TORCH_LIBRARIES}")
+
+
+
+```python
+mkdir build
+cd build
+cmake ..
+# cmake .. -DCUDNN_INCLUDE_DIR=/usr/include -DCUDNN_LIBRARY=/usr/lib/x86_64-linux-gnu
+make 
+```
+
+```python
+# Builds the `dtp` executable from main.cpp and links it against LibTorch.
+# The OpenCV lines are kept commented out for when OpenCV is needed as well.
+cmake_minimum_required(VERSION 3.5)
+
+project(dtp)
+
+#find_package(OpenCV REQUIRED)
+
+#message(STATUS "OpenCV library status:")
+#message(STATUS "    version: ${OpenCV_VERSION}")
+#message(STATUS "    libraries: ${OpenCV_LIBS}")
+#message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Directory containing TorchConfig.cmake inside the unpacked libtorch archive.
+set(Torch_DIR /kaggle/working/libtorch/share/cmake/Torch)
+find_package(Torch REQUIRED)
+#include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(${TORCH_INCLUDE_DIRS})
+add_executable(dtp main.cpp)
+target_link_libraries(dtp 
+    "${TORCH_LIBRARIES}"
+    # ${OpenCV_LIBS}
+    )
+
+# Compile the target as C++14.
+set_property(TARGET dtp PROPERTY CXX_STANDARD 14)
+
+
+
+# 几个常见错误
+
+```python
+# 1、
+CMake Error: CMake was unable to find a build program corresponding to "Unix Makefiles".  CMAKE_MAKE_PROGRAM is not set.  You probably need to select a different build tool.
+CMake Error: CMAKE_CXX_COMPILER not set, after EnableLanguage
+# 解决方法：yum install make -y
+
+# 2
+-- Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_NVCC_EXECUTABLE CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY)
+# 解决方法：需要安装 cuda （也可以将现有的/usr/local/cuda 复制过来）
+
+# 3、
+OSError: libcudnn.so.8: cannot open shared object file: No such file or directory
+
+# 下载 cudnn8.0以上 将文件复制到 /usr/local/cuda
+
+# 4、
+cannot find -lCUDA_cublas_LIBRARY-NOTFOUND
+# 找到所有 libcublas.so 复制到 /usr/local/cuda/lib64
+```
+
--- a/doc/onnxruntime.md
+++ b/doc/onnxruntime.md
+- https://onnxruntime.ai/
+- https://onnxruntime.ai/docs/tutorials/traditional-ml.html
+- https://github.com/microsoft/onnxruntime
+- https://github.com/microsoft/onnxruntime-inference-examples
+- `pip install onnxruntime`
+
+[toc]
+
+```python
+from torchvision.models import resnet18
+import torch
+
+model = resnet18()
+torch.onnx.export(model, torch.randn(1, 3, 224, 224),
+                  'model.onnx', verbose=True, opset_version=11,
+                  input_names=['input'],  # the model's input names
+                  output_names=['output']
+                  )
+
+```
+
+
+
+##### 将 ONNX 模型转换为 ORT 格式脚本使用
+
+```python
+python -m onnxruntime.tools.convert_onnx_models_to_ort <onnx model file or dir> # 会生成 .ort文件
+```
+
+
+
+```python
+import onnxruntime
+import torch
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+x = torch.randn(1,3,224,224)
+model_path = "model.onnx" # or 'model.ort'
+# ort_session = onnxruntime.InferenceSession(model_path) # 默认cpu
+ort_session = onnxruntime.InferenceSession(model_path,providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
+ort_outs = ort_session.run(None, ort_inputs)[0]
+ort_outs = torch.softmax(torch.from_numpy(ort_outs), -1)
+print(ort_outs.argmax(-1))
+```
+
+
+
+----
+
+
+
+# 安装
+
+```python
+# 在任何一种环境中，一次只能安装其中一个软件包
+pip install onnxruntime
+pip install onnxruntime-gpu
+
+# 安装 ONNX 以导出模型
+## ONNX is built into PyTorch
+pip install torch
+## tensorflow
+pip install tf2onnx
+## sklearn
+pip install skl2onnx
+```
+
+## PyTorch CV
+
+- 使用 `torch.onnx.export` 导出模型
+
+```python
+torch.onnx.export(model,                                # model being run
+                  torch.randn(1, 28, 28).to(device),    # model input (or a tuple for multiple inputs)
+                  "fashion_mnist_model.onnx",           # where to save the model (can be a file or file-like object)
+                  input_names = ['input'],              # the model's input names
+                  output_names = ['output'])            # the model's output names
+
+
+# Export the model
+torch.onnx.export(model,                     # model being run
+                (text, offsets),           # model input (or a tuple for multiple inputs)
+                "ag_news_model.onnx",      # where to save the model (can be a file or file-like object)
+                export_params=True,        # store the trained parameter weights inside the model file
+                opset_version=10,          # the ONNX version to export the model to
+                do_constant_folding=True,  # whether to execute constant folding for optimization
+                input_names = ['input', 'offsets'],   # the model's input names
+                output_names = ['output'], # the model's output names
+                dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
+                              'output' : {0 : 'batch_size'}})
+```
+
+- 加载 onnx 模型`onnx.load`
+
+```python
+import onnx
+onnx_model = onnx.load("fashion_mnist_model.onnx")
+onnx.checker.check_model(onnx_model)
+```
+
+- 使用 `ort.InferenceSession` 创建推理会话
+
+```python
+import onnxruntime as ort
+import numpy as np
+x, y = test_data[0][0], test_data[0][1]
+ort_sess = ort.InferenceSession('fashion_mnist_model.onnx')
+outputs = ort_sess.run(None, {'input': x.numpy()})
+
+# Print Result 
+predicted, actual = classes[outputs[0][0].argmax(0)], classes[y]
+print(f'Predicted: "{predicted}", Actual: "{actual}"')
+```
+
+## SciKit Learn CV
+
+```python
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+iris = load_iris()
+X, y = iris.data, iris.target
+X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+from sklearn.linear_model import LogisticRegression
+clr = LogisticRegression()
+clr.fit(X_train, y_train)
+print(clr)
+
+# LogisticRegression()
+```
+
+- 将模型转换或导出为 ONNX 格式
+
+```python
+from skl2onnx import convert_sklearn
+from skl2onnx.common.data_types import FloatTensorType
+
+initial_type = [('float_input', FloatTensorType([None, 4]))]
+onx = convert_sklearn(clr, initial_types=initial_type)
+with open("logreg_iris.onnx", "wb") as f:
+    f.write(onx.SerializeToString())
+```
+
+- 使用 ONNX Runtime 加载和运行模型：我们将使用 ONNX Runtime 来计算此机器学习模型的预测。
+
+```python
+import numpy
+import onnxruntime as rt
+
+# sess = rt.InferenceSession("logreg_iris.onnx") # 默认cpu
+sess = rt.InferenceSession("logreg_iris.onnx",providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+input_name = sess.get_inputs()[0].name
+pred_onx = sess.run(None, {input_name: X_test.astype(numpy.float32)})[0]
+print(pred_onx)
+```
+
+- 获取预测类
+
+```python
+import numpy
+import onnxruntime as rt
+
+sess = rt.InferenceSession("logreg_iris.onnx")
+input_name = sess.get_inputs()[0].name
+label_name = sess.get_outputs()[0].name
+pred_onx = sess.run(
+    [label_name], {input_name: X_test.astype(numpy.float32)})[0]
+print(pred_onx)
+```
+
+
+
+## C++ 版本
+
+- 下载安装包 https://github.com/microsoft/onnxruntime/releases
+
+
+
+```c++
+//main.cpp
+//https://blog.csdn.net/baidu_34595620/article/details/112176278
+//https://github.com/microsoft/onnxruntime-inference-examples/blob/main/c_cxx/squeezenet/main.cpp
+#include <cassert>
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include <iostream>
+#include <memory>
+#include <vector>
+// #include "onnxruntime_c_api.h"
+#include "onnxruntime_cxx_api.h"
+// #include "cuda_provider_factory.h"
+
+/// Loads model.onnx, prints the description of its inputs, then runs the
+/// model 100 times on a synthetic input and prints the mean wall-clock time
+/// per run.
+int main(int argc, const char* argv[]) {
+  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  Ort::SessionOptions session_options;
+  session_options.SetIntraOpNumThreads(1);
+  session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+#ifdef _WIN32
+  const wchar_t* model_path = L"model.onnx";
+#else
+  const char* model_path = "model.onnx";
+#endif
+
+  Ort::Session session(env, model_path, session_options);
+  // print model input layer (node names, types, shape etc.)
+  Ort::AllocatorWithDefaultOptions allocator;
+
+  // print number of model input nodes
+  const size_t num_input_nodes = session.GetInputCount();
+  std::vector<const char*> input_node_names(num_input_nodes);
+  std::vector<int64_t> input_node_dims;  // simplify... this model has only 1 input node {1, 3, 224, 224}.
+                                         // Otherwise need vector<vector<>>
+
+  printf("Number of inputs = %zu\n", num_input_nodes);
+
+  // iterate over all input nodes (size_t index matches the unsigned count)
+  for (size_t i = 0; i < num_input_nodes; i++) {
+    // print input node names
+    // NOTE(review): later ONNX Runtime releases replace GetInputName with
+    // GetInputNameAllocated — confirm against the installed version.
+    char* input_name = session.GetInputName(i, allocator);
+    printf("Input %zu : name=%s\n", i, input_name);
+    input_node_names[i] = input_name;
+
+    // print input node types
+    Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
+    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
+
+    ONNXTensorElementDataType type = tensor_info.GetElementType();
+    printf("Input %zu : type=%d\n", i, static_cast<int>(type));
+
+    // print input shapes/dims
+    input_node_dims = tensor_info.GetShape();
+    printf("Input %zu : num_dims=%zu\n", i, input_node_dims.size());
+    for (size_t j = 0; j < input_node_dims.size(); j++)
+      printf("Input %zu : dim %zu=%" PRId64 "\n", i, j, input_node_dims[j]);
+  }
+
+  size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
+                                             // use OrtGetTensorShapeElementCount() to get official size!
+
+  std::vector<float> input_tensor_values(input_tensor_size);
+  std::vector<const char*> output_node_names = {"output"};//{"softmaxout_1"};
+
+  // initialize input data with values in [0.0, 1.0]
+  for (size_t i = 0; i < input_tensor_size; i++)
+    input_tensor_values[i] = (float)i / (input_tensor_size + 1);
+
+  // create input tensor object from data values; the rank comes from the
+  // shape reported by the model instead of a hard-coded 4
+  auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());
+  assert(input_tensor.IsTensor());
+
+  const int nums = 100;
+  // steady_clock measures elapsed wall time; clock() reports CPU time instead
+  const auto t_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < nums; ++i) {
+    // score model & input tensor, get back output tensor
+    auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+    // assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
+  }
+  const auto t_end = std::chrono::steady_clock::now();
+  const double elapsed = std::chrono::duration<double>(t_end - t_start).count();
+  // the printed value is the average over all runs, in seconds
+  std::cout << "Mean time:" << elapsed / nums << "s" << std::endl;
+  /*
+  // Get pointer to output tensor float values
+  float* floatarr = output_tensors.front().GetTensorMutableData<float>();
+  // assert(abs(floatarr[0] - 0.000045) < 1e-6);
+
+  // score the model, and print scores for first 5 classes
+  for (int i = 0; i < 5; i++)
+    printf("Score for class [%d] =  %f\n", i, floatarr[i]);
+  */
+  // Results should be as below...
+  // Score for class[0] = 0.000045
+  // Score for class[1] = 0.003846
+  // Score for class[2] = 0.000125
+  // Score for class[3] = 0.001180
+  // Score for class[4] = 0.001317
+
+
+  // release buffers allocated by ORT allocator
+  for(const char* node_name : input_node_names)
+    allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
+
+  printf("Done!\n");
+  return 0;
+}
+
+```
+
+```makefile
+# CMakeLists.txt
+# Builds the `capi_test` executable and links it against ONNX Runtime
+# installed under /opt/onnxruntime.
+
+# cmake_minimum_required must come before project() so that the policies it
+# selects are already in effect when project() runs.
+cmake_minimum_required(VERSION 3.13)
+
+project(capi_test)
+
+set(CMAKE_BUILD_TYPE Debug)
+
+#option(ONNXRUNTIME_ROOTDIR "onnxruntime root dir")
+
+# tensorrt_provider_factory.h contains old APIs of the tensorrt execution provider
+#include(CheckIncludeFileCXX)
+#CHECK_INCLUDE_FILE_CXX(tensorrt_provider_factory.h HAVE_TENSORRT_PROVIDER_FACTORY_H)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+#include_directories( 
+#    ${ONNXRUNTIME_ROOTDIR}/include/onnxruntime/core/session/
+#    ${ONNXRUNTIME_ROOTDIR}/include/onnxruntime/core/providers/tensorrt/
+#)
+include_directories("/opt/onnxruntime/include")
+link_directories("/opt/onnxruntime/lib")
+
+
+ADD_EXECUTABLE(capi_test  main.cpp)
+# Only defined when the (commented-out) header check above is enabled.
+if(HAVE_TENSORRT_PROVIDER_FACTORY_H)
+  target_compile_definitions(capi_test PRIVATE -DHAVE_TENSORRT_PROVIDER_FACTORY_H)
+endif()
+target_link_libraries(capi_test onnxruntime)
+
+## [使用 PyTorch 进行 ORT 训练](https://onnxruntime.ai/docs/get-started/training-pytorch.html)
+
+```python
+pip install torch-ort
+python -m torch_ort.configure
+
+from torch_ort import ORTModule
+.
+.
+.
+model = ORTModule(model)
+```
+
--- a/doc/readme.md
+++ b/doc/readme.md
+- [NCNN、OpenVino、 TensorRT、MediaPipe、ONNX，各种推理部署架构，到底哪家强？](https://www.bilibili.com/read/cv13656068)
+
+
+
+[toc]
+
+# 1、纯python 
+
+- 0.02707[cpu], 0.00655[gpu]
+
+```python
+from torchvision.models import resnet18
+import torch
+import time
+
+device = "cpu" # 0.02707 0.00655
+nums = 100
+model = resnet18().to(device)
+inputs = torch.randn(nums, 3, 224, 224).to(device)
+start = time.perf_counter()
+for i in range(nums):
+    preds = model(inputs[[i]])
+end = time.perf_counter()
+print(f"mean_time:{((end - start) / nums):.5f}")
+
+torch.onnx.export(model, torch.randn(1, 3, 224, 224),
+                  'model.onnx', verbose=True, opset_version=11,
+                  input_names=['input'],  # the model's input names
+                  output_names=['output']
+                  )
+
+example = torch.rand(1, 3, 224, 224) 
+# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing. 
+traced_script_module = torch.jit.trace(model, example) 
+traced_script_module.save("traced_resnet_model.pt") 
+
+# python -m onnxruntime.tools.convert_onnx_models_to_ort model.onnx  # 会生成 model.ort
+```
+
+
+
+# 2、libtorch
+
+- 0.1934[cpu],0.0077[gpu]
+
+```c++
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <utility>
+#include <vector>
+#include <torch/script.h>
+#include <torch/torch.h> // CUDA helper functions
+
+/// Loads the TorchScript module given as argv[1], runs it on 100 random
+/// 1x3x224x224 inputs and prints the mean wall-clock time per forward pass.
+/// Returns 0 on success, -1 on bad usage or when the module cannot be loaded.
+int main(int argc, const char* argv[]) {
+    if (argc != 2) {
+        std::cerr << "usage: example-app <path-to-exported-script-module>\n";
+        return -1;
+    }
+
+    // Select the device: CUDA when available, otherwise CPU.
+    const bool use_cuda = torch::cuda::is_available();
+    const torch::DeviceType device_type = use_cuda ? at::kCUDA : at::kCPU;
+
+    torch::jit::script::Module model;
+    try {
+        // Deserialize the ScriptModule from a file using torch::jit::load().
+        model = torch::jit::load(argv[1]);
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+        return -1;
+    }
+    std::cout << "ok\n";
+
+    const int nums = 100;
+    model.to(device_type);
+    model.eval();
+    // Inference only: do not record autograd state.
+    torch::NoGradGuard no_grad;
+
+    // One input vector per run, created up front so that data generation is
+    // not part of the measurement.
+    std::vector<std::vector<torch::jit::IValue>> inputs;
+    inputs.reserve(nums);
+    for (int i = 0; i < nums; ++i) {
+        std::vector<torch::jit::IValue> sample;
+        sample.push_back(torch::randn({ 1, 3, 224, 224 }).to(device_type));
+        inputs.push_back(std::move(sample));
+    }
+
+    // Execute the model and turn its output into a tensor.
+    at::Tensor output;
+    // steady_clock measures elapsed wall time. clock() reports CPU time summed
+    // over all threads, which overstates multi-threaded CPU inference.
+    const auto t_start = std::chrono::steady_clock::now();
+    for (int i = 0; i < nums; ++i) {
+        output = model.forward(inputs[i]).toTensor();
+    }
+    if (use_cuda) {
+        // CUDA kernels run asynchronously; wait for them before stopping the clock.
+        torch::cuda::synchronize();
+    }
+    const auto t_end = std::chrono::steady_clock::now();
+    const double elapsed = std::chrono::duration<double>(t_end - t_start).count();
+    // the printed value is the average over all runs, in seconds
+    std::cout << "Mean time:" << elapsed / nums << "s" << std::endl;
+    std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';
+    return 0;
+}
+
+```
+
+
+
+
+
+# 3、onnxruntime
+
+- 0.01482[cpu py]，0.0679[cpu c++]
+
+```python
+import onnxruntime
+import numpy as np
+import time
+
+nums = 100
+model_path = "model.onnx"  # or 'model.ort'
+ort_session = onnxruntime.InferenceSession(
+    model_path,
+    providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+inputs = np.random.randn(nums, 3, 224, 224).astype(np.float32)
+start = time.perf_counter()
+for i in range(nums):
+    ort_inputs = {ort_session.get_inputs()[0].name: inputs[[i]]}
+    ort_outs = ort_session.run(None, ort_inputs)[0]
+end = time.perf_counter()
+print(f"mean_time:{((end - start) / nums):.5f}")
+# ort_outs = torch.softmax(torch.from_numpy(ort_outs), -1)
+# print(ort_outs.argmax(-1))
+```
--- a/doc/tensorrt.md
+++ b/doc/tensorrt.md
--- a/readme.md
+++ b/readme.md
 - https://www.cvmart.net/community/detail/7040
 - https://www.cvmart.net/community/detail/5609
+- https://github.com/pytorch/TensorRT

 # 1、训练模型 python训练 (略过)
 # 2、模型推理部署

--- a/readme_CN.md
+++ b/readme_CN.md
+- https://pytorch.org/cppdocs/
+- https://pytorch.org/get-started/locally/ # 下载 libtorch库
+
+- [LibTorch的安装与基本使用](https://zhuanlan.zhihu.com/p/513571175)
+
+- https://docs.openvino.ai/2023.3/openvino_docs_install_guides_installing_openvino_apt.html # openvino c++安装
+
+- https://github.com/openvinotoolkit/openvino
+
+- https://github.com/openvinotoolkit/openvino_notebooks
+
+- https://github.com/microsoft/onnxruntime
+
+- https://github.com/microsoft/onnxruntime-inference-examples
+
+- https://github.com/pytorch/TensorRT
+
+- https://github.com/NVIDIA/TensorRT
+
+- https://github.com/onnx/onnx-tensorrt
+
+- https://github.com/wang-xinyu/tensorrtx
+
+
+
+# 1、安装 opencv（c++）
+
+- [Linux安装Opencv（C++）](https://blog.csdn.net/weixin_44384491/article/details/121142093)
+
+# 2、安装libtorch
+
+下载的libtorch的版本最好和你的pytorch的版本一致。Linux下各个libtorch的release版本的下载链接可以在下面这篇文章中找到：
+
+## 2.2.1-cu118
+
+>Download here (Pre-cxx11 ABI):
+>https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.2.1%2Bcu118.zip
+>Download here (cxx11 ABI):
+>https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu118.zip
+
+## 1.11.0-cu115
+
+>https://download.pytorch.org/libtorch/cu115
+
+## 2.2.1-cpu
+
+>Download here (Pre-cxx11 ABI):
+>https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-2.2.1%2Bcpu.zip
+>Download here (cxx11 ABI):
+>https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcpu.zip
+
+
+
+下载完成后，随便丢到一个地方去解压，完成。比如我是习惯性放在/usr/local/lib下的。我也建议linux小白将libtorch放在 /usr/local/lib下，并保证libtorch文件夹下存在include这个文件夹。
+
+## 配置CMakeLists.txt
+
+
+
+```py
+## 目录结构
+xxxx
+ - CMakeLists.txt
+ - digit.cpp
+ - digit.py
+```
+
+
+
+```cmake
+# CMakeLists.txt
+# Builds the `digit` executable and links it against LibTorch and OpenCV.
+
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(LibTorchDemo)
+
+# compile options
+# CMAKE_CXX_FLAGS_RELEASE only takes effect for Release builds, so default to
+# Release when the user runs a plain `cmake ..` with a single-config generator.
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+# NOTE(review): recent LibTorch 2.x releases appear to require C++17 — confirm
+# and raise this value if the build rejects C++14.
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# package
+find_package(OpenCV REQUIRED)
+find_package(Torch REQUIRED PATHS "/usr/local/lib/libtorch")
+
+add_executable(digit digit.cpp)
+# libtorch
+target_link_libraries(digit ${TORCH_LIBRARIES})
+target_link_libraries(digit ${OpenCV_LIBS})
+
+> "/usr/local/lib/libtorch" ：Torch的package路径根据你的安装目录指定
+
+然后写个文件include一下库，文件名为digit.cpp：
+
+```c++
+#include "iostream"
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+
+/// Smoke test: prints a greeting so a successful build and link can be confirmed.
+int main(int argc, char const *argv[])
+{
+    const char* greeting = "hello world!";
+    std::cout << greeting << std::endl;
+    return 0;
+}
+```
+
+编译一把：
+
+```bash
+# Out-of-source build; the CMakeLists.txt defines no install rules, so a plain
+# `make` is enough and the binary is left in build/.
+mkdir -p build
+cd build
+cmake .. && make -j8
+```
+
+> 如果出现错误，基本都是找不到头文件或者静态库，如果找不到头文件，在CMakeLists.txt中include_directories()中添加能够搜索到你在cpp中写的相对路径的根目录路径。如果静态库找不到，请检查安装包是否损坏，或者静态库目录是否在gcc的搜索路径中。
+
+## 第一步：先用PyTorch训练一个网络
+
+既然我们需要将PyTorch模型使用C++部署，那么首先肯定需要一个Torch的模型。我们先使用PyTorch简单训练一个手写数字识别，相信看这篇文章的靓仔都是torch老手了，我直接上代码：
+
+> 如果你已经有一个模型文件了，请直接跳转到第二步
+
+```python
+from sklearn.datasets import load_digits
+import torch
+from torch import nn
+import torch.utils.data as Data
+import numpy as np
+from sklearn.metrics import accuracy_score
+import matplotlib.pyplot as plt
+import os
+
+class Digit(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(1, 16, 3, 1, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 32, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(32, 16, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 8, 3, 1, 1)
+        )
+
+        self.output = nn.Linear(32, 10)
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.output(out.flatten(1))
+        return out
+
+RATIO = 0.8
+BATCH_SIZE = 128
+EPOCH = 10
+
+if __name__ == "__main__":
+    X, y = load_digits(return_X_y=True)
+    X = X / 16.
+    sample_num = len(y)
+    X = [x.reshape(1, 8, 8).tolist() for x in X]
+
+    indice = np.arange(sample_num)
+    np.random.shuffle(indice)
+
+    X = torch.FloatTensor(X)
+    y = torch.LongTensor(y)
+    offline = int(sample_num * RATIO)
+
+    train = Data.TensorDataset(X[indice[:offline]], y[indice[:offline]])
+    test  = Data.TensorDataset(X[indice[offline:]], y[indice[offline:]])
+
+    train_loader = Data.DataLoader(train, BATCH_SIZE, True)
+    test_loader  = Data.DataLoader(test,  BATCH_SIZE, False)
+    
+    model = Digit()
+    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
+    criterion = nn.CrossEntropyLoss(reduction="mean")
+
+    test_losses = []
+    test_accs = []
+
+    for epoch in range(EPOCH):
+        model.train()
+        for bx, by in train_loader:
+            out = model(bx)
+            loss = criterion(out, by)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        model.eval()
+        correct = 0
+        total = 0
+        test_loss = []
+        test_acc = []
+        for bx, by in test_loader:
+            with torch.no_grad():
+                out = model(bx)
+                pre_lab = out.argmax(1)
+                loss = criterion(out, by)
+
+            test_loss.append(loss.item())
+            test_acc.append(accuracy_score(pre_lab, by))
+
+        test_losses.append(np.mean(test_loss))
+        test_accs.append(np.mean(test_acc))
+    
+    plt.figure(dpi=120)
+    plt.plot(test_losses, 'o-', label="loss")
+    plt.plot(test_accs, 'o-', label="accuracy")
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+    if not os.path.exists("model"):
+        os.makedirs("model")
+    torch.save(model.state_dict(), "model/digit.pth")
+```
+
+## 第二步：使用tracing将模型文件转化成TorchScript
+
+PyTorch导出的模型文件是不能直接被libtorch读取的，因为PyTorch默认使用Python的pickle进行序列化。PyTorch通过JIT搭建了Python和C++的桥梁，我们可以将模型转成TorchScript Module，将模型的结构和参数一并序列化，使其可以脱离Python运行时被加载。
+
+转换方法非常简单：
+
+```py
+import torch
+from digit import Digit
+
+model = Digit()
+model.load_state_dict(torch.load("model/digit.pth", map_location="cpu"))
+
+sample = torch.randn(1, 1, 8, 8)
+
+trace_model = torch.jit.trace(model, sample)
+trace_model.save("model/digit.jit")
+```
+
+运行下述测试代码，由于Python本身的特性和JIT的即时编译的特性，模型在同一进程生命周期内运行时前几次会比较慢，所以在测试前，需要空跑几次：
+
+## 第三步：使用libtorch重写推理程序
+
+由于TorchScript可以被C++直接调用，所以我们只需要使用libtorch重写推理代码，并将模型读入就完成了。
+
+libtorch的语法和PyTorch基本一致，学起来很快，于此锦恢就不再赘述了。相应的，在C++中，我们用cv::Mat来取代Python中的numpy.ndarray对象，如何将cv::Mat转成libtorch可以读入的数据结构也会在demo中涉及。
+
+下面的例子会完成一个C++命令行程序，它的第一个参数为模型，第二个参数为需要读入的手写数字图像的路径，预测结果会打印到控制台上。期待已久的C++代码如下：
+
+```c++
+#include "iostream"
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+#include "fstream"
+
+// Returns normally when `path` can be opened for reading; otherwise prints a
+// message to stdout and terminates the process with exit code -1.
+void checkPath(const char* path) {
+    std::ifstream probe(path);
+    const bool opened = static_cast<bool>(probe);
+    probe.close();
+    if (!opened) {
+        std::cout << "file " << path << " doesn't exist!" << std::endl;
+        exit(-1);
+    }
+}
+
+// Command-line entry point: digit <model path> <image path>.
+// Loads a TorchScript model, preprocesses the image into a 1x1x8x8 float
+// tensor and prints the predicted digit. Returns 0 on success, -1 on bad
+// usage, an undecodable image or a model that fails to load.
+int main(int argc, char const *argv[])
+{
+    if (argc != 3) {
+        std::cout << "usage : digit <model path> <image path>" << std::endl;
+        return -1;
+    }
+
+    checkPath(argv[1]);
+    checkPath(argv[2]);
+
+    // checkPath only proves the file can be opened; imread still returns an
+    // empty Mat when the content cannot be decoded as an image.
+    cv::Mat img = cv::imread(argv[2]), gimg, fimg, rimg;
+    if (img.empty()) {
+        std::cout << "file " << argv[2] << " is not a readable image!" << std::endl;
+        return -1;
+    }
+
+    // cv::COLOR_BGR2GRAY replaces the legacy CV_BGR2GRAY macro, which the
+    // default OpenCV 4 headers no longer define.
+    cv::cvtColor(img, gimg, cv::COLOR_BGR2GRAY);
+
+    // Map each pixel p to 1 - p / 255 (inverted, float), then shrink to 8x8.
+    // NOTE(review): the inversion presumably matches dark-ink-on-white input
+    // to the training data's polarity -- confirm against the images used.
+    gimg.convertTo(fimg, CV_32F, - 1. / 255., 1.);
+    cv::resize(fimg, rimg, {8, 8});
+
+    // convert Mat to tensor; from_blob wraps rimg's buffer without copying,
+    // so rimg has to stay alive for as long as img_tensor is used.
+    at::Tensor img_tensor = torch::from_blob(
+        rimg.data,
+        {1, 1, 8, 8},
+        torch::kFloat32
+    );
+
+    // load model; torch::jit::load throws c10::Error on an invalid file
+    torch::jit::Module model;
+    try {
+        model = torch::jit::load(argv[1]);
+    } catch (const c10::Error& e) {
+        std::cout << "failed to load model " << argv[1] << " : " << e.what() << std::endl;
+        return -1;
+    }
+    model.eval();
+
+    // Equivalent of torch.no_grad(): keep this guard, the original author
+    // warns that memory usage blows up without it.
+    torch::NoGradGuard no_grad;
+
+    // forward
+    torch::Tensor out = model.forward({img_tensor}).toTensor();
+    int pre_lab = torch::argmax(out, 1).item().toInt();
+
+    std::cout << "predict number is " << pre_lab << std::endl;
+    return 0;
+}
+```
+
+
+
+# 实践
+
+```bash
+# 安装环境依赖
+sudo apt-get update -y
+sudo apt-get install cmake -y
+sudo apt-get install build-essential libgtk2.0-dev libavcodec-dev libavformat-dev libjpeg-dev libswscale-dev libtiff5-dev -y
+sudo apt-get install libgtk2.0-dev -y
+sudo apt-get install pkg-config -y
+
+# 安装opencv
+!wget https://github.com/opencv/opencv/archive/4.9.0.zip
+cd opencv-4.9.0
+mkdir build
+cd build
+cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local ..
+make -j8
+make install
+
+# 安装libtorch
+!wget https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip
+unzip libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip
+cp -r libtorch /usr/local/lib
+```
+