diff --git a/demo/CMakeLists.txt b/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9bb0910053db5f2fc4f2a425bbc3804fd94e0b0
--- /dev/null
+++ b/demo/CMakeLists.txt
@@ -0,0 +1,17 @@
+# CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(LibTorchDemo)
+
+# compile options
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_STANDARD 14)
+
+# packages
+find_package(OpenCV REQUIRED)
+find_package(Torch REQUIRED PATHS "/usr/local/lib/libtorch") # assumes libtorch is unpacked under /usr/local/lib
+
+add_executable(digit digit.cpp)
+# link libtorch and OpenCV
+target_link_libraries(digit ${TORCH_LIBRARIES})
+target_link_libraries(digit ${OpenCV_LIBS})
\ No newline at end of file
diff --git a/demo/convert2jit.py b/demo/convert2jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9821fec1e6bb1d81e1336eed225e7cb51e37b403
--- /dev/null
+++ b/demo/convert2jit.py
@@ -0,0 +1,10 @@
+import torch
+from digit import Digit
+
+model = Digit()
+model.load_state_dict(torch.load("model/digit.pth", map_location="cpu"))
+
+sample = torch.randn(1, 1, 8, 8)
+
+trace_model = torch.jit.trace(model, sample)
+trace_model.save("model/digit.jit")
\ No newline at end of file
diff --git a/demo/digit.cpp b/demo/digit.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdd08a70928ea063c29402380bfb809692ba7512
--- /dev/null
+++ b/demo/digit.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <fstream>
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+
+void checkPath(const char* path) {
+    std::ifstream in;
+    in.open(path);
+    bool flag = (bool)in;
+    in.close();
+    if (flag) return;
+    else {
+        std::cout << "file " << path << " doesn't exist!" << std::endl;
+        exit(-1);
+    }
+}
+
+int main(int argc, char const *argv[])
+{
+    if (argc != 3) {
+        std::cout << "usage : digit <model> <image>" << std::endl;
+        return -1;
+    }
+
+    checkPath(argv[1]);
+    checkPath(argv[2]);
+    cv::Mat img = cv::imread(argv[2]), gimg, fimg, rimg;
+    cv::cvtColor(img, gimg, cv::COLOR_BGR2GRAY);
+
+    gimg.convertTo(fimg, CV_32F, -1. / 255., 1.);
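+    // convertTo applies out = alpha * in + beta, i.e. 1 - in/255 here --
+    // the same "1 - image / 255." inversion that test_jit.py performs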
+    cv::resize(fimg, rimg, {8, 8});
+
+    // convert Mat to tensor
+    at::Tensor img_tensor = torch::from_blob(
+        rimg.data,
+        {1, 1, 8, 8},
+        torch::kFloat32
+    );
+
+    // load model
+    torch::jit::Module model = torch::jit::load(argv[1]);
+
+    // torch.no_grad()
+    torch::NoGradGuard no_grad; // be sure to keep this line: without it autograd state accumulates and memory blows up
+
+    // forward
+    torch::Tensor out = model.forward({img_tensor}).toTensor();
+    int pre_lab = torch::argmax(out, 1).item().toInt();
+
+    std::cout << "predict number is " << pre_lab << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/demo/digit.py b/demo/digit.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a969aad5e379bf0d642163aafec8799480c9686
--- /dev/null
+++ b/demo/digit.py
@@ -0,0 +1,95 @@
+from sklearn.datasets import load_digits
+import torch
+from torch import nn
+import torch.utils.data as Data
+import numpy as np
+from sklearn.metrics import accuracy_score
+import matplotlib.pyplot as plt
+import os
+
+class Digit(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(1, 16, 3, 1, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 32, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(32, 16, 3, 2, 1),
+            nn.Tanh(),
+            nn.Conv2d(16, 8, 3, 1, 1)
+        )
+
+        self.output = nn.Linear(32, 10)
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.output(out.flatten(1))
+        return out
+
+RATIO = 0.8
+BATCH_SIZE = 128
+EPOCH = 10
+
+if __name__ == "__main__":
+    X, y = load_digits(return_X_y=True)
+    X = X / 16.
+    sample_num = len(y)
+    X = [x.reshape(1, 8, 8).tolist() for x in X]
+
+    indice = np.arange(sample_num)
+    np.random.shuffle(indice)
+
+    X = torch.FloatTensor(X)
+    y = torch.LongTensor(y)
+    offline = int(sample_num * RATIO)
+
+    train = Data.TensorDataset(X[indice[:offline]], y[indice[:offline]])
+    test = Data.TensorDataset(X[indice[offline:]], y[indice[offline:]])
+
+    train_loader = Data.DataLoader(train, BATCH_SIZE, True)
+    test_loader = Data.DataLoader(test, BATCH_SIZE, False)
+
+    model = Digit()
+    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
+    criterion = nn.CrossEntropyLoss(reduction="mean")
+
+    test_losses = []
+    test_accs = []
+
+    for epoch in range(EPOCH):
+        model.train()
+        for bx, by in train_loader:
+            out = model(bx)
+            loss = criterion(out, by)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        model.eval()
+        test_loss = []
+        test_acc = []
+        for bx, by in test_loader:
+            with torch.no_grad():
+                out = model(bx)
+                pre_lab = out.argmax(1)
+                loss = criterion(out, by)
+
+            test_loss.append(loss.item())
+            test_acc.append(accuracy_score(by, pre_lab))
+
+        test_losses.append(np.mean(test_loss))
+        test_accs.append(np.mean(test_acc))
+
+    plt.figure(dpi=120)
+    plt.plot(test_losses, 'o-', label="loss")
+    plt.plot(test_accs, 'o-', label="accuracy")
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+    if not os.path.exists("model"):
+        os.makedirs("model")
+    torch.save(model.state_dict(), "model/digit.pth")
\ No newline at end of file
diff --git a/demo/digit_test.cpp b/demo/digit_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e77efbc5dacff654a1254de522f57ad584aa8e1
--- /dev/null
+++ b/demo/digit_test.cpp
@@ -0,0 +1,9 @@
+#include <iostream>
+#include "opencv2/opencv.hpp"
+#include "torch/script.h"
+
+int main(int argc, char const *argv[])
+{
+    std::cout << "hello world!" << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/demo/image/sample.png b/demo/image/sample.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a253b2910672a5a7ba314a1d09348047f94a1f6
Binary files /dev/null and b/demo/image/sample.png differ
diff --git a/demo/readme.md b/demo/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c3bf0a6b109693063c4edb1ea78aa9756d9541e
--- /dev/null
+++ b/demo/readme.md
@@ -0,0 +1,18 @@
+# Usage
+
+```py
+# smoke test
+1. replace digit.cpp with the contents of digit_test.cpp
+2. mkdir build && cd build
+3. cmake .. && make -j4
+4. ./digit
+
+# loading a model with libtorch
+1. python digit.py        # train a native PyTorch model
+2. python convert2jit.py  # convert the PyTorch model to a JIT model
+3. python test_jit.py     # benchmark the JIT model
+4. mkdir build && cd build
+5. cmake .. && make -j4
+6. ./digit model/digit.jit image/sample.png
+```
+
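+If everything builds, step 6 prints the predicted class; illustrative output (the actual digit depends on the sample image):
+
+```
+predict number is 3
+```
+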
diff --git a/demo/test_jit.py b/demo/test_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..4316a58bd2465dc18791e5cea927925776f7d83e
--- /dev/null
+++ b/demo/test_jit.py
@@ -0,0 +1,45 @@
+import time
+import torch
+import cv2 as cv
+from digit import Digit
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+
+def run_model(model, image):
+    s = time.time()
+    out = model(image)
+    pre_lab = torch.argmax(out, dim=1)
+    cost_time = round(time.time() - s, 5)
+    return cost_time
+
+image = cv.imread("image/sample.png")
+image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
+image = 1 - image / 255.
+image = cv.resize(image, (8, 8))
+
+
+image = torch.FloatTensor(image).unsqueeze(0).unsqueeze(0).contiguous()
+origin_model = Digit()
+origin_model.load_state_dict(torch.load("model/digit.pth"))
+jit_model = torch.jit.load("model/digit.jit")
+
+# warm up: the JIT model optimizes itself over the first few runs
+for _ in range(3):
+    run_model(origin_model, image)
+    run_model(jit_model, image)
+
+test_times = 10
+
+# begin testing
+results = pd.DataFrame({
+    "type" : ["origin"] * test_times + ["jit"] * test_times,
+    "cost_time" : [run_model(origin_model, image) for _ in range(test_times)] + [run_model(jit_model, image) for _ in range(test_times)]
+})
+
+plt.figure(dpi=120)
+sns.boxplot(
+    x=results["type"],
+    y=results["cost_time"]
+)
+plt.show()
\ No newline at end of file
diff --git a/doc/libtorch.md b/doc/libtorch.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0c3d0c115e3b4c17a3e0c0f66c9f4b2d4c73ada
--- /dev/null
+++ b/doc/libtorch.md
@@ -0,0 +1,297 @@
+- https://pytorch.org/tutorials/advanced/cpp_export.html
+- [LibTorch的安装、配置与使用](https://blog.csdn.net/weixin_45632168/article/details/114679263)
+- [libtorch c++调用 (五)Linux下的调用](https://blog.csdn.net/juluwangriyue/article/details/108463026)
+- https://pytorch.org/cppdocs/
+- [libtorch教程](https://www.zhihu.com/column/c_1373368181138972672)
+- [VS2019 配置 LibTorch 和 OpenCV](https://zhuanlan.zhihu.com/p/375084412)
+
+[toc]
+
+# Environment
+
+```python
+Ubuntu 20.04 (gcc 9.4.0-1ubuntu1~20.04.1)
+python3.7
+pytorch-1.11.0
+cuda-11.0
+libtorch-1.12-cuda113
+gcc version 9.4.0
+cmake version 3.22.5
+GNU Make 4.2.1
+
+# ---------------------------------------
+centos7
+python3.7
+cuda-10.2
+cudnn-10.2-linux-x64-v8.1.0.77
+torch-1.11.0+cu102-cp37-cp37m-linux_x86_64
+torchvision-0.12.0+cu102-cp37-cp37m-linux_x86_64
+libtorch-shared-with-deps-1.12.1+cu102
+onnxruntime-linux-x64-1.12.1
+cmake version 3.14.5
+GNU Make 3.82
+gcc version 8.3.1 20190311
+```
+
+
+
+# Downloading libtorch
+
+- https://pytorch.org/
+
+```python
+# cuda113-linux (requires cuda-11.3 plus a matching cudnn 8.x)
+https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.1%2Bcu113.zip
+# cpu-linux
+https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.12.1%2Bcpu.zip
+```
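+
+After downloading, unpack the archive somewhere stable; this doc (and the demo's CMakeLists.txt) assumes it lives under /usr/local/lib:
+
+```python
+unzip libtorch-shared-with-deps-1.12.1%2Bcpu.zip -d /usr/local/lib
+```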
+
+# Converting a PyTorch model to Torch Script
+
+```python
+import torch
+import torchvision
+# An instance of your model.
+model = torchvision.models.resnet18()
+# An example input you would normally provide to your model's forward() method.
+example = torch.rand(1, 3, 224, 224)
+# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
+traced_script_module = torch.jit.trace(model, example)
+traced_script_module.save("traced_resnet_model.pt")
+```
+
+# Loading the model in C++
+
+```c
+#include <torch/script.h>
+#include <torch/cuda.h> // CUDA-related helpers (torch::cuda::is_available)
+#include <iostream>
+#include <memory>
+
+int main(int argc, const char* argv[]) {
+  if (argc != 2) {
+    std::cerr << "usage: example-app <path-to-exported-script-module>\n";
+    return -1;
+  }
+
+  torch::DeviceType device_type = at::kCPU; // pick the device type
+  if (torch::cuda::is_available())
+      device_type = at::kCUDA;
+
+
+  torch::jit::script::Module model;
+  try {
+    // Deserialize the ScriptModule from a file using torch::jit::load().
+    model = torch::jit::load(argv[1]);
+  }
+  catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n"; return -1;
+  }
+  std::cout << "ok\n";
+
+  // Create a vector of inputs.
+
+  // std::vector<torch::jit::IValue> inputs;
+  // inputs.push_back(torch::ones({1, 3, 224, 224}));
+
+  model.to(device_type);
+  std::vector<torch::jit::IValue> inputs;
+  inputs.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+
+  // Execute the model and turn its output into a tensor.
+
+  at::Tensor output = model.forward(inputs).toTensor();
+  std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';
+}
+
+```
+
+## Using OpenCV together with libtorch
+
+- https://blog.csdn.net/mmmkl1/article/details/118522533
+- https://github.com/qubvel/segmentation_models.pytorch
+
+```c++
+#include "DemoPytorch.h"
+// headers below reconstructed -- the original include list was lost in formatting
+#include <torch/script.h>
+#include <torch/torch.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <iostream>
+#include <vector>
+#include <memory>
+
+int main() {
+    // load model
+    torch::jit::script::Module module;
+    try {
+        module = torch::jit::load("./torch_script_eval.pt");
+        module.to(torch::kCPU); // set model to cpu mode
+        /*module.to(torch::kCUDA);*/ // set model to cuda mode
+        module.eval();
+        std::cout << "MODEL LOADED";
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+    }
+
+    // load img
+    cv::Mat img_original = cv::imread("./00011584_002.png",0);
+    cv::Mat img = cv::Mat(img_original);
+    // normalize
+    cv::resize(img, img, cv::Size(512, 512));
+    img.convertTo(img, CV_32FC1);
+    // img to tensor
+    torch::Tensor mean = torch::tensor({ 0.485,0.456,0.406 });
+    torch::Tensor std = torch::tensor({ 0.229, 0.224, 0.225 });
+    auto input_tensor = torch::from_blob(img.data, { 512,512,1 });
+    input_tensor = input_tensor / 255.0f;
+    input_tensor = input_tensor - mean;
+    input_tensor = input_tensor / std;
+    input_tensor = input_tensor.permute({ 2,0,1 });
+    input_tensor = input_tensor.to(torch::kCPU);
+    /*input_tensor = input_tensor.to(torch::kCUDA);*/
+    input_tensor = input_tensor.unsqueeze(0);
+    std::vector<torch::jit::IValue> input;
+    input.push_back(input_tensor);
+    // pred begin
+    auto pred = module.forward(input).toTensor();
+    // pred tensor to mat
+    pred = pred.squeeze().detach();
+    pred = pred * 255;
+    pred = pred.to(torch::kU8);
+    pred = pred.to(torch::kCPU);
+    cv::Mat output_mat(cv::Size{ 512,512 }, CV_8UC1, pred.data_ptr());
+    // show result
+    cv::imshow("original img", img_original);
+    cv::imshow("mask", output_mat);
+    cv::waitKey(0);
+    cv::destroyWindow("original img");
+    cv::destroyWindow("mask");
+
+    return 0;
+}
+
+```
+
+
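+One caveat with the pattern above: `torch::from_blob` wraps the existing `cv::Mat` buffer without copying or owning it, so the tensor is only valid while the Mat is alive and unmodified. A minimal sketch (file and variable names illustrative) of making the tensor own its data:
+
+```c++
+cv::Mat img = cv::imread("input.png", 0);
+img.convertTo(img, CV_32FC1, 1.0 / 255.0);
+// clone() copies the wrapped buffer into memory owned by the tensor
+torch::Tensor t = torch::from_blob(img.data, {1, 1, img.rows, img.cols}, torch::kFloat32).clone();
+```
+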
+# test
+
+```cpp
+//%%file main.cpp
+
+#include <torch/torch.h>
+#include <iostream>
+
+using namespace std;
+
+int main()
+{
+    // torch::Tensor tensor = torch::eye(3);
+    torch::Tensor tensor = torch::eye(3).to(at::kCUDA); // move the data onto the GPU
+    std::cout << tensor << std::endl;
+    cout << "Hello World!" << endl;
+    return 0;
+}
+
+
+// second test: check whether the GPU is usable
+
+#include <torch/script.h>
+#include <torch/cuda.h> // CUDA-related helpers
+#include <iostream>
+#include <memory>
+
+int main()
+{
+    std::cout <<"cuda::is_available():" << torch::cuda::is_available() << std::endl;
+    // system("pause");
+    return 0;
+}
+```
+
+```makefile
+#%%file CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.5)
+
+project(libtorch_demo LANGUAGES CXX)
+
+# packages
+#find_package(CUDA)
+# nvcc flags
+#set(CUDA_NVCC_FLAGS -gencode arch=compute_20,code=sm_20;-G;-g)
+#set(CUDA_NVCC_FLAGS -gencode;arch=compute_60,code=sm_60;-G;-g)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(Torch_DIR /kaggle/working/libtorch/share/cmake/Torch)
+find_package(Torch REQUIRED)
+
+add_executable(libtorch_demo main.cpp)
+target_link_libraries(libtorch_demo "${TORCH_LIBRARIES}")
+set_property(TARGET libtorch_demo PROPERTY CXX_STANDARD 17)
+```
+
+
+
+```python
+mkdir build
+cd build
+cmake ..
+# cmake .. -DCUDNN_INCLUDE_DIR=/usr/include -DCUDNN_LIBRARY=/usr/lib/x86_64-linux-gnu
+make
+```
+
+```makefile
+cmake_minimum_required(VERSION 3.5)
+
+project(dtp)
+
+#find_package(OpenCV REQUIRED)
+
+#message(STATUS "OpenCV library status:")
+#message(STATUS "    version: ${OpenCV_VERSION}")
+#message(STATUS "    libraries: ${OpenCV_LIBS}")
+#message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+set(Torch_DIR /kaggle/working/libtorch/share/cmake/Torch)
+find_package(Torch REQUIRED)
+#include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(${TORCH_INCLUDE_DIRS})
+add_executable(dtp main.cpp)
+target_link_libraries(dtp
+        "${TORCH_LIBRARIES}"
+        # ${OpenCV_LIBS}
+        )
+
+set_property(TARGET dtp PROPERTY CXX_STANDARD 14)
+```
+
+
+
+# Common errors
+
+```python
+# 1.
+CMake Error: CMake was unable to find a build program corresponding to "Unix Makefiles". CMAKE_MAKE_PROGRAM is not set. You probably need to select a different build tool.
+CMake Error: CMAKE_CXX_COMPILER not set, after EnableLanguage
+# Fix: yum install make -y  (if the C++ compiler itself is missing: yum install gcc-c++ -y)
+
+# 2.
+-- Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_NVCC_EXECUTABLE CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY)
+# Fix: install CUDA (copying over an existing /usr/local/cuda also works)
+
+# 3.
+OSError: libcudnn.so.8: cannot open shared object file: No such file or directory
+
+# Fix: download cudnn >= 8.0 and copy its files into /usr/local/cuda
+
+# 4.
+cannot find -lCUDA_cublas_LIBRARY-NOTFOUND
+# Fix: find all libcublas.so files and copy them into /usr/local/cuda/lib64
+```
+
diff --git a/doc/onnxruntime.md b/doc/onnxruntime.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d495631c420d551cb2923f33a9c818f94151b87
--- /dev/null
+++ b/doc/onnxruntime.md
@@ -0,0 +1,337 @@
+- https://onnxruntime.ai/
+- https://onnxruntime.ai/docs/tutorials/traditional-ml.html
+- https://github.com/microsoft/onnxruntime
+- https://github.com/microsoft/onnxruntime-inference-examples
+- `pip install onnxruntime`
+
+[toc]
+
+```python
+from torchvision.models import resnet18
+import torch
+
+model = resnet18()
+torch.onnx.export(model, torch.randn(1, 3, 224, 224),
+                  'model.onnx', verbose=True, opset_version=11,
+                  input_names=['input'],   # the model's input names
+                  output_names=['output']
+                  )
+
+```
+
+
+
+##### Converting an ONNX model to ORT format
+
+```python
+python -m onnxruntime.tools.convert_onnx_models_to_ort model.onnx  # generates a .ort file
+```
+
+
+
+```python
+import onnxruntime
+import torch
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+x = torch.randn(1,3,224,224)
+model_path = "model.onnx" # or 'model.ort'
+# ort_session = onnxruntime.InferenceSession(model_path) # defaults to CPU
+ort_session = onnxruntime.InferenceSession(model_path,providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
+ort_outs = ort_session.run(None, ort_inputs)[0]
+ort_outs = torch.softmax(torch.from_numpy(ort_outs), -1)
+print(ort_outs.argmax(-1))
+```
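+
+The providers list is a priority order: if the TensorRT or CUDA provider is unavailable in the installed build, ORT silently falls back to the next one. A quick way to check what the installed package actually supports:
+
+```python
+import onnxruntime
+# execution providers compiled into this onnxruntime build
+print(onnxruntime.get_available_providers())
+# a GPU build typically lists TensorRT/CUDA ahead of CPUExecutionProvider
+```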
+
+
+
+----
+
+
+
+# Installation
+
+```python
+# in any one environment, only one of these two packages can be installed at a time
+pip install onnxruntime
+pip install onnxruntime-gpu
+
+# install ONNX exporters
+## ONNX is built into PyTorch
+pip install torch
+## tensorflow
+pip install tf2onnx
+## sklearn
+pip install skl2onnx
+```
+
+## PyTorch CV
+
+- Export the model with `torch.onnx.export`
+
+```python
+torch.onnx.export(model,                              # model being run
+                  torch.randn(1, 28, 28).to(device),  # model input (or a tuple for multiple inputs)
+                  "fashion_mnist_model.onnx",         # where to save the model (can be a file or file-like object)
+                  input_names = ['input'],            # the model's input names
+                  output_names = ['output'])          # the model's output names
+
+
+# Export the model
+torch.onnx.export(model,                     # model being run
+                  (text, offsets),           # model input (or a tuple for multiple inputs)
+                  "ag_news_model.onnx",      # where to save the model (can be a file or file-like object)
+                  export_params=True,        # store the trained parameter weights inside the model file
+                  opset_version=10,          # the ONNX version to export the model to
+                  do_constant_folding=True,  # whether to execute constant folding for optimization
+                  input_names = ['input', 'offsets'],  # the model's input names
+                  output_names = ['output'],           # the model's output names
+                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
+                                'output' : {0 : 'batch_size'}})
+```
+
+- Load the ONNX model with `onnx.load`
+
+```python
+import onnx
+onnx_model = onnx.load("fashion_mnist_model.onnx")
+onnx.checker.check_model(onnx_model)
+```
+
+- Create an inference session with `ort.InferenceSession`
+
+```python
+import onnxruntime as ort
+import numpy as np
+x, y = test_data[0][0], test_data[0][1]
+ort_sess = ort.InferenceSession('fashion_mnist_model.onnx')
+outputs = ort_sess.run(None, {'input': x.numpy()})
+
+# Print Result
+predicted, actual = classes[outputs[0][0].argmax(0)], classes[y]
+print(f'Predicted: "{predicted}", Actual: "{actual}"')
+```
+
+## SciKit Learn CV
+
+```python
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+iris = load_iris()
+X, y = iris.data, iris.target
+X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+from sklearn.linear_model import LogisticRegression
+clr = LogisticRegression()
+clr.fit(X_train, y_train)
+print(clr)
+
+# LogisticRegression()
+```
+
+- Convert/export the model to ONNX format
+
+```python
+from skl2onnx import convert_sklearn
+from skl2onnx.common.data_types import FloatTensorType
+
+initial_type = [('float_input', FloatTensorType([None, 4]))]
+onx = convert_sklearn(clr, initial_types=initial_type)
+with open("logreg_iris.onnx", "wb") as f:
+    f.write(onx.SerializeToString())
+```
+
+- Load and run the model with ONNX Runtime to compute this machine-learning model's predictions
+
+```python
+import numpy
+import onnxruntime as rt
+
+# sess = rt.InferenceSession("logreg_iris.onnx") # defaults to CPU
+sess = rt.InferenceSession("logreg_iris.onnx",providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
+input_name = sess.get_inputs()[0].name
+pred_onx = sess.run(None, {input_name: X_test.astype(numpy.float32)})[0]
+print(pred_onx)
+```
+
+- Get the predicted class
+
+```python
+import numpy
+import onnxruntime as rt
+
+sess = rt.InferenceSession("logreg_iris.onnx")
+input_name = sess.get_inputs()[0].name
+label_name = sess.get_outputs()[0].name
+pred_onx = sess.run(
+    [label_name], {input_name: X_test.astype(numpy.float32)})[0]
+print(pred_onx)
+```
+
+
+
+## C++ version
+
+- Download a release package from https://github.com/microsoft/onnxruntime/releases
+
+
+
+```c++
+//main.cpp
+//https://blog.csdn.net/baidu_34595620/article/details/112176278
+//https://github.com/microsoft/onnxruntime-inference-examples/blob/main/c_cxx/squeezenet/main.cpp
+#include <assert.h>
+#include <vector>
+#include <iostream>
+// #include "onnxruntime_c_api.h"
+#include "onnxruntime_cxx_api.h"
+// #include "cuda_provider_factory.h"
+#include <stdio.h>
+#include <ctime> // for clock()
+clock_t t_start,t_end;
+
+int main(int argc, const char* argv[]) {
+  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  Ort::SessionOptions session_options;
+  session_options.SetIntraOpNumThreads(1);
+
+  session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+  #ifdef _WIN32
+  const wchar_t* model_path = L"model.onnx";
+  #else
+  const char* model_path = "model.onnx";
+  #endif
+
+  Ort::Session session(env, model_path, session_options);
+  // print model input layer (node names, types, shape etc.)
+  Ort::AllocatorWithDefaultOptions allocator;
+
+  // print number of model input nodes
+  size_t num_input_nodes = session.GetInputCount();
+  std::vector<const char*> input_node_names(num_input_nodes);
+  std::vector<int64_t> input_node_dims;  // simplify... this model has only 1 input node {1, 3, 224, 224}.
+                                         // Otherwise need vector<vector<int64_t>>
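+  // Note: GetInputName/GetOutputName used below are the pre-1.13 C++ API matching the
+  // onnxruntime 1.12.1 pinned in this doc; onnxruntime >= 1.13 renames them to
+  // GetInputNameAllocated/GetOutputNameAllocated.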
+
+  printf("Number of inputs = %zu\n", num_input_nodes);
+
+  // iterate over all input nodes
+  for (int i = 0; i < num_input_nodes; i++) {
+    // print input node names
+    char* input_name = session.GetInputName(i, allocator);
+    printf("Input %d : name=%s\n", i, input_name);
+    input_node_names[i] = input_name;
+
+    // print input node types
+    Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
+    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
+
+    ONNXTensorElementDataType type = tensor_info.GetElementType();
+    printf("Input %d : type=%d\n", i, type);
+
+    // print input shapes/dims
+    input_node_dims = tensor_info.GetShape();
+    printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
+    for (size_t j = 0; j < input_node_dims.size(); j++)
+      printf("Input %d : dim %zu=%jd\n", i, j, input_node_dims[j]);
+  }
+
+  size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
+                                             // use OrtGetTensorShapeElementCount() to get official size!
+
+  std::vector<float> input_tensor_values(input_tensor_size);
+  std::vector<const char*> output_node_names = {"output"};//{"softmaxout_1"};
+
+  // initialize input data with values in [0.0, 1.0]
+  for (unsigned int i = 0; i < input_tensor_size; i++)
+    input_tensor_values[i] = (float)i / (input_tensor_size + 1);
+
+  // create input tensor object from data values
+  auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
+  assert(input_tensor.IsTensor());
+
+  int nums=100;
+  t_start=clock(); // start timing
+  // timing loop (reconstructed from the squeezenet sample this file is based on)
+  for(int i=0;i<nums;i++){
+    auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+  }
+  t_end=clock();
+  printf("mean cost time: %f s\n", (double)(t_end - t_start) / CLOCKS_PER_SEC / nums);
+
+  /*
+  // score model with the input tensor, get back an output tensor
+  auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+  assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
+
+  // get pointer to output tensor float values
+  float* floatarr = output_tensors.front().GetTensorMutableData<float>();
+  // assert(abs(floatarr[0] - 0.000045) < 1e-6);
+
+  // score the model, and print scores for first 5 classes
+  for (int i = 0; i < 5; i++)
+    printf("Score for class [%d] = %f\n", i, floatarr[i]);
+  */
+  // Results should be as below...
+  // Score for class[0] = 0.000045
+  // Score for class[1] = 0.003846
+  // Score for class[2] = 0.000125
+  // Score for class[3] = 0.001180
+  // Score for class[4] = 0.001317
+
+
+  // release buffers allocated by ORT allocator
+  for(const char* node_name : input_node_names)
+    allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
+
+  printf("Done!\n");
+
+}
+
+```
+
+```makefile
+# CMakeLists.txt
+project(capi_test)
+
+set(CMAKE_BUILD_TYPE Debug)
+
+cmake_minimum_required(VERSION 3.13)
+
+#option(ONNXRUNTIME_ROOTDIR "onnxruntime root dir")
+
+# tensorrt_provider_factory.h contains old APIs of the tensorrt execution provider
+#include(CheckIncludeFileCXX)
+#CHECK_INCLUDE_FILE_CXX(tensorrt_provider_factory.h HAVE_TENSORRT_PROVIDER_FACTORY_H)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+#include_directories(
+#  ${ONNXRUNTIME_ROOTDIR}/include/onnxruntime/core/session/
+#  ${ONNXRUNTIME_ROOTDIR}/include/onnxruntime/core/providers/tensorrt/
+#)
+include_directories("/opt/onnxruntime/include")
+link_directories("/opt/onnxruntime/lib")
+
+
+ADD_EXECUTABLE(capi_test main.cpp)
+if(HAVE_TENSORRT_PROVIDER_FACTORY_H)
+  target_compile_definitions(capi_test PRIVATE -DHAVE_TENSORRT_PROVIDER_FACTORY_H)
+endif()
+target_link_libraries(capi_test onnxruntime)
+```
+
+## [ORT training with PyTorch](https://onnxruntime.ai/docs/get-started/training-pytorch.html)
+
+```python
+pip install torch-ort
+python -m torch_ort.configure
+
+from torch_ort import ORTModule
+# ... build the model, optimizer and data loaders as usual ...
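+# ORTModule wraps an existing nn.Module in place; the training loop that follows stays unchanged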
+model = ORTModule(model)
+```
+
diff --git a/doc/readme.md b/doc/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..fa38b498141c464144eac420018b118f0102a468
--- /dev/null
+++ b/doc/readme.md
@@ -0,0 +1,133 @@
+- [NCNN、OpenVino、 TensorRT、MediaPipe、ONNX,各种推理部署架构,到底哪家强?](https://www.bilibili.com/read/cv13656068)
+
+
+
+[toc]
+
+# 1. Pure Python
+
+- mean time per image: 0.02707 s [cpu], 0.00655 s [gpu]
+
+```python
+from torchvision.models import resnet18
+import torch
+import time
+
+device = "cpu"  # cpu: 0.02707, gpu: 0.00655
+nums = 100
+model = resnet18().to(device)
+inputs = torch.randn(nums, 3, 224, 224).to(device)
+start = time.perf_counter()
+for i in range(nums):
+    preds = model(inputs[[i]])
+end = time.perf_counter()
+print(f"mean_time:{((end - start) / nums):.5f}")
+
+torch.onnx.export(model, torch.randn(1, 3, 224, 224),
+                  'model.onnx', verbose=True, opset_version=11,
+                  input_names=['input'],  # the model's input names
+                  output_names=['output']
+                  )
+
+example = torch.rand(1, 3, 224, 224)
+# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
+traced_script_module = torch.jit.trace(model, example)
+traced_script_module.save("traced_resnet_model.pt")
+
+# python -m onnxruntime.tools.convert_onnx_models_to_ort model.onnx  # generates model.ort
+```
+
+
+
+# 2. libtorch
+
+- mean time per image: 0.1934 s [cpu], 0.0077 s [gpu]
+
+```c++
+#include <torch/script.h>
+#include <torch/cuda.h> // CUDA-related helpers
+#include <iostream>
+#include <memory>
+#include <ctime> // for clock()
+clock_t t_start,t_end;
+int main(int argc, const char* argv[]) {
+  if (argc != 2) {
+    std::cerr << "usage: example-app <path-to-exported-script-module>\n";
+    return -1;
+  }
+
+  torch::DeviceType device_type = at::kCPU; // pick the device type
+  if (torch::cuda::is_available())
+      device_type = at::kCUDA;
+
+
+  torch::jit::script::Module model;
+  try {
+    model = torch::jit::load(argv[1]);
+  }
+  catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n"; return -1;
+  }
+  std::cout << "ok\n";
+
+  // Create a vector of inputs.
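+  // (note) GPU kernels launch asynchronously; for a fair GPU timing, synchronize
+  // (e.g. torch::cuda::synchronize()) before reading the clock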
+
+  // std::vector<torch::jit::IValue> inputs;
+  // inputs.push_back(torch::ones({1, 3, 224, 224}));
+
+  int nums = 100;
+  model.to(device_type);
+  // std::vector<torch::jit::IValue> inputs;
+  // inputs.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+  std::vector<std::vector<torch::jit::IValue>> inputs;
+  std::vector<torch::jit::IValue> inputs2;
+  for(int i=0;i<nums;i++){
+    inputs2.clear();
+    inputs2.push_back(torch::ones({ 1, 3, 224, 224 }).to(device_type));
+    inputs.push_back(inputs2);
+  }
+
+  // time the forward passes (loop reconstructed -- the original lines were lost in formatting)
+  t_start = clock();
+  for(int i=0;i<nums;i++){
+    at::Tensor output = model.forward(inputs[i]).toTensor();
+  }
+  t_end = clock();
+  std::cout << "mean_time:" << (double)(t_end - t_start) / CLOCKS_PER_SEC / nums << std::endl;
+  return 0;
+}
+```
+
+
+
+# 3. torch-tensorrt
+
+> - [https://www.pytorch.org](https://www.pytorch.org/)
+> - https://developer.nvidia.com/cuda
+> - https://developer.nvidia.com/cudnn
+> - https://developer.nvidia.com/tensorrt
+
+
+
+```python
+from torchvision.models import resnet18
+import torch
+import torch_tensorrt
+
+model = resnet18().eval()  # torch module needs to be in eval (not training) mode
+
+inputs = [
+    torch_tensorrt.Input(
+        min_shape=[1, 3, 224, 224],
+        opt_shape=[1, 3, 256, 256],
+        max_shape=[1, 3, 320, 320],
+        dtype=torch.half,
+    )
+]
+enabled_precisions = {torch.float, torch.half}  # Run with fp16
+
+trt_ts_module = torch_tensorrt.compile(
+    model, inputs=inputs, enabled_precisions=enabled_precisions
+)
+
+input_data = torch.randn(1, 3, 224, 224)
+input_data = input_data.to("cuda").half()
+result = trt_ts_module(input_data)
+torch.jit.save(trt_ts_module, "trt_ts_module.ts")
+```
+
+```python
+# Deployment application
+import torch
+import torch_tensorrt
+
+trt_ts_module = torch.jit.load("trt_ts_module.ts")
+input_data = input_data.to("cuda").half()
+result = trt_ts_module(input_data)
+```
+
+
+
+---
+## Kaggle environment setup
+```python
+cuda-11.0 / !ls /usr/include/cudnn*
+gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
+cmake version 3.22.5
+GNU Make 4.2.1
+python3.8
+pytorch-1.11.0
+torchvision-0.12.0
+
+# torch-tensorrt==1.1.0
+pip3 install torch-tensorrt -f https://github.com/pytorch/TensorRT/releases
+
+# importing torch_tensorrt fails with the error below until the GPU build of libtorch is downloaded (https://pytorch.org/)
+# ImportError: libtorch_cuda_cu.so: cannot open shared object file: No such file or directory
+# https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.12.1%2Bcu113.zip
+```
+
+## [Example](https://developer.nvidia.com/blog/accelerating-inference-up-to-6x-faster-in-pytorch-with-torch-tensorrt/)
+
+To follow these steps you need the following resources:
+
+- a Linux machine with an NVIDIA GPU, compute architecture 7 or earlier
+- Docker installed, 19.03 or later
+- a Docker container with PyTorch, Torch-TensorRT and all dependencies, pulled from the [NGC catalog](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+
+Follow the instructions to run the Docker container tagged [nvcr.io/nvidia/pytorch:21.11-py3](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
+
+```python
+docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:21.11-py3
+# or
+nvidia-docker run -it --rm nvcr.io/nvidia/pytorch:21.11-py3
+```
+
+With a live bash terminal inside the Docker container, launch a JupyterLab instance to run the Python code:
+```python
+jupyter-notebook --ip 0.0.0.0 --port 8000
+jupyter-lab --allow-root --ip=0.0.0.0 --NotebookApp.token='TensorRT' --port 8888
+```
+
+```python
+# pip install timm
+
+import torch
+import torch_tensorrt
+import timm
+import time
+import numpy as np
+import torch.backends.cudnn as cudnn
+
+torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
+
+efficientnet_b0 = timm.create_model('efficientnet_b0',pretrained=True)
+
+model =efficientnet_b0.eval().to("cuda")
+detections_batch = model(torch.randn(128, 3, 224, 224).to("cuda"))
+detections_batch.shape
+
+# To benchmark this model through both PyTorch JIT and Torch-TensorRT AOT compilation, write a simple benchmark utility function:
+cudnn.benchmark = True
+
+def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000):
+    input_data = torch.randn(input_shape)
+    input_data = input_data.to("cuda")
+    if dtype=='fp16':
+        input_data = input_data.half()
+
+    print("Warm up ...")
+    with torch.no_grad():
+        for _ in \
range(nwarmup): + features = model(input_data) + torch.cuda.synchronize() + print("Start timing ...") + timings = [] + with torch.no_grad(): + for i in range(1, nruns+1): + start_time = time.time() + pred_loc = model(input_data) + torch.cuda.synchronize() + end_time = time.time() + timings.append(end_time - start_time) + if i%10==0: + print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000)) + + print("Input shape:", input_data.size()) + print('Average throughput: %.2f images/second'%(input_shape[0]/np.mean(timings))) + +# Inference using PyTorch and TorchScript +model = efficientnet_b0.eval().to("cuda") +benchmark(model, input_shape=(1, 3, 224, 224), nruns=100) + +""" +Start timing ... +Iteration 10/100, avg batch time 16.47 ms +Iteration 20/100, avg batch time 16.51 ms +Iteration 30/100, avg batch time 17.21 ms +Iteration 40/100, avg batch time 17.53 ms +Iteration 50/100, avg batch time 17.67 ms +Iteration 60/100, avg batch time 17.84 ms +Iteration 70/100, avg batch time 17.98 ms +Iteration 80/100, avg batch time 17.99 ms +Iteration 90/100, avg batch time 17.82 ms +Iteration 100/100, avg batch time 17.68 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 56.55 images/second + + +""" + +# 可以使用 TorchScript JIT 模块重复相同的步骤 +traced_model = torch.jit.trace(model, torch.randn((1,3,224,224)).to("cuda")) +torch.jit.save(traced_model, "efficientnet_b0_traced.jit.pt") +benchmark(traced_model, input_shape=(1, 3, 224, 224), nruns=100) + +""" +Start timing ... +Iteration 10/100, avg batch time 11.88 ms +Iteration 20/100, avg batch time 12.04 ms +Iteration 30/100, avg batch time 12.30 ms +Iteration 40/100, avg batch time 12.43 ms +Iteration 50/100, avg batch time 12.49 ms +Iteration 60/100, avg batch time 12.37 ms +Iteration 70/100, avg batch time 12.36 ms +Iteration 80/100, avg batch time 12.43 ms +Iteration 90/100, avg batch time 12.16 ms +Iteration 100/100, avg batch time 11.84 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 84.43 images/second + + +""" + + +# Inference using Torch-TensorRT +# 要使用 Torch-TensorRT 以混合精度编译模型,请运行以下命令 +trt_model = torch_tensorrt.compile(model, + inputs= [torch_tensorrt.Input((1, 3, 224, 224),dtype=torch.half)], + enabled_precisions= { torch.half} # Run with FP16 +) + +benchmark(trt_model, input_shape=(1, 3, 224, 224), nruns=100, dtype="fp16") +""" +dtype="fp32" +Iteration 10/100, avg batch time 9.29 ms +Iteration 20/100, avg batch time 9.24 ms +Iteration 30/100, avg batch time 9.26 ms +Iteration 40/100, avg batch time 9.28 ms +Iteration 50/100, avg batch time 9.27 ms +Iteration 60/100, avg batch time 9.28 ms +Iteration 70/100, avg batch time 9.14 ms +Iteration 80/100, avg batch time 9.03 ms +Iteration 90/100, avg batch time 9.01 ms +Iteration 100/100, avg batch time 9.03 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 110.70 images/second + +dtype="fp16" +Iteration 10/100, avg batch time 8.09 ms +Iteration 20/100, avg batch time 5.35 ms +Iteration 30/100, avg batch time 4.18 ms +Iteration 40/100, avg batch time 3.57 ms +Iteration 50/100, avg batch time 3.21 ms +Iteration 60/100, avg batch time 2.96 ms +Iteration 70/100, avg batch time 2.78 ms +Iteration 80/100, avg batch time 2.64 ms +Iteration 90/100, avg batch time 2.53 ms +Iteration 100/100, avg batch time 2.45 ms +Input shape: torch.Size([1, 3, 224, 224]) +Average throughput: 408.98 images/second +""" +``` + +### 基准测试结果 + +这是我在批量大小为 1 的 NVIDIA A100 GPU 上取得的结果。 + +![Torch 和 TensorRT 之间的吞吐量比较吞吐量是 4 
倍。](https://developer-blogs.nvidia.com/wp-content/uploads/2021/12/native-throughput-comparison-torch-tensorrt-625x433.png)
+
+*Figure 6. Throughput comparison of native PyTorch vs Torch-TensorRT on an NVIDIA A100 GPU, batch size 1*
+
+
+
+# TensorRT
+
+- https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#onnx-export
+
+For installation from the release package, see [here](https://note.youdao.com/old-web/#/file/WEB8c4a998486a217e1084f117df705f315/markdown/WEBa2da28f7fa10323b590866cd1f9a35e0/)
+
+pip install: `!pip install -U nvidia-tensorrt --index-url https://pypi.ngc.nvidia.com # install`
+
+## pytorch to onnx
+
+```python
+import torchvision.models as models
+
+resnext50_32x4d = models.resnext50_32x4d(pretrained=True)
+import torch
+
+BATCH_SIZE = 64
+dummy_input=torch.randn(BATCH_SIZE, 3, 224, 224)
+
+import torch.onnx
+torch.onnx.export(resnext50_32x4d, dummy_input, "resnet50_onnx_model.onnx", verbose=False,opset_version=11)
+```
+
+## [Converting ONNX into a TensorRT engine](https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#convert-onnx-engine)
+
+- Using trtexec
+
+```python
+/opt/TensorRT-7.2.3.4/bin/trtexec --onnx=resnet50_onnx_model.onnx --saveEngine=resnet_engine.trt  --fp16
+/opt/TensorRT-7.2.3.4/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --fp16 --workspace=64 --buildOnly
+/opt/TensorRT-7.2.3.4/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --best --workspace=64 --buildOnly # --minTiming=5 --avgTiming=10
+
+# --fp16       enable FP16 precision, in addition to FP32, for layers that support it
+# --int8       enable INT8 precision, in addition to FP32, for layers that support it
+# --best       enable all supported precisions to achieve the best performance for every layer
+# --workspace  maximum amount of persistent scratch memory (in MB) available to the builder's algorithms
+# --minShapes / --maxShapes  specify the dimension range of each network input; --optShapes specifies the dimensions the auto-tuner should optimize for
+# --buildOnly  skip the inference performance measurement
+# --tacticSources  add or remove tactics from the default tactic sources (cuDNN, cuBLAS and cuBLASLt)
+# --minTiming / --avgTiming  set the minimum and average number of iterations used in tactic selection
+```
+
+- Using the TensorRT API
+
+```python
+import torch
+import torchvision.models as models
+def torch2onnx(model:torch.nn.Module,x:torch.Tensor,save_path:str="./model.onnx"):
+    """Save a PyTorch model in .onnx format."""
+    # x = torch.rand([32,3,224,224])
+    model.eval()
+    # Export the model
+    torch.onnx.export(model,  # model being run
+                      x,  # model input (or a tuple for multiple inputs)
+                      save_path,  # where to save the model (can be a file or file-like object)
+                      verbose=True,
+                      export_params=True,  # store the trained parameter weights inside the model file
+                      opset_version=11,  # the ONNX version to export the model to
+                      do_constant_folding=True,  # whether to execute constant folding for optimization
+                      input_names=['input'],  # the model's input names
+                      output_names=['output'],  # the model's output names
+                      # dynamic_axes={'input': {0: 'batch_size'},  # variable length axes
+                      #               'output': {0: 'batch_size'}}  # does not work in TensorRT -- keep this commented out
+                      )
+
+    # input_names = ["input"]
+    # output_names = ["output"]
+    #
+    # torch.onnx.export(model, x, save_path, verbose=True, opset_version=8, input_names=input_names,
+    #                   output_names=output_names)
+
+torch2onnx(models.resnet18(),torch.randn(64,3,224,224))
+```
+
+```python
+from __future__ import print_function
+
+import numpy as np
+import tensorrt as trt
+# import pycuda.driver as cuda
+# import pycuda.autoinit
+
+import os
+# import sys
+# sys.path.insert(1, os.path.join(sys.path[0], "."))
+# import common
+# import layers_trt as lytrt
+
+import time
+from functools import wraps, partial
+
+TRT_LOGGER = trt.Logger()
+
+EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+
+def GiB(val):
+    return val * 1 << 30
+
+
+def timeit(func):
+    @wraps(func)
+    def inner(*args, **kwargs):
+        start = time.time()
+        r = func(*args, **kwargs)
+        end = time.time()
+        
print("%s cost time: %s" % (func.__name__, end - start)) + return r + + return inner + + +class DefModelData: + PLUGIN_LIBRARY = None + BATCH_SIZE = 1 + MEM_SIZE = 1 << 28 # 256MiB ; 1 << 28/1024/1024=256 + # MEM_SIZE = GiB(2) # 1G + + DTYPE = trt.float16 + NP_DTYPE = np.float16 + INPUT_SHAPE = [1, 3, 32, 32] + OUTPUT_SIZE = [-1, 10] # [-1,10] + + onnx_file_path = "model.onnx" + engine_file_path = "model.trt" + + model_file_path = "model.npz" + INPUT_NAME = 'input' + OUTPUT_NAME = 'output' + + +def onnx2engine(ModelData=None): + """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" + if ModelData is None: ModelData = DefModelData + + onnx_file_path = ModelData.onnx_file_path + engine_file_path = ModelData.engine_file_path + + def build_engine(): + """Takes an ONNX file and creates a TensorRT engine to run inference with""" + with trt.Builder(TRT_LOGGER) as builder, builder.create_network( # common.EXPLICIT_BATCH + EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser: + builder.max_workspace_size = ModelData.MEM_SIZE + builder.max_batch_size = ModelData.BATCH_SIZE + + if ModelData.DTYPE == trt.float16: + builder.fp16_mode = True + + # Parse model file + if not os.path.exists(onnx_file_path): + print( + 'ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path)) + exit(0) + print('Loading ONNX file from path {}...'.format(onnx_file_path)) + with open(onnx_file_path, 'rb') as model: + print('Beginning ONNX file parsing') + if not parser.parse(model.read()): + print('ERROR: Failed to parse the ONNX file.') + for error in range(parser.num_errors): + print(parser.get_error(error)) + return None + # The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1 + # network.get_input(0).shape = [1, 3, 608, 608] + if 'INPUT_SHAPE' in ModelData.__dict__.keys(): + network.get_input(0).shape = ModelData.INPUT_SHAPE + + print('Completed parsing of ONNX file') + print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) + engine = builder.build_cuda_engine(network) + print("Completed creating Engine") + with open(engine_file_path, "wb") as f: + f.write(engine.serialize()) + return engine + + if os.path.exists(engine_file_path): + # If a serialized engine exists, use it instead of building an engine. 
+ print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + else: + return build_engine() + + +def onnx2trt(ModelData=None): + # from toolsmall.tools.speed.modelTansform import onnx2engine + + if ModelData is None: ModelData = DefModelData + onnx2engine(ModelData) + + +if __name__ == "__main__": + ModelData = DefModelData + ModelData.BATCH_SIZE = 1 + ModelData.INPUT_SHAPE = [64, 3, 224, 224] + ModelData.OUTPUT_SIZE = [-1, 1000] + onnx2trt(ModelData) + +``` + +## 运行engine + +`/opt/TensorRT-7.2.3.4/samples/python/common.py` + +```python +from __future__ import print_function + +import numpy as np +import tensorrt as trt +# import pycuda.driver as cuda +# import pycuda.autoinit + +import os +# import sys +# sys.path.insert(1, os.path.join(sys.path[0], ".")) +import common +# import layers_trt as lytrt + +import time +from functools import wraps, partial + +TRT_LOGGER = trt.Logger() + + +def timeit(func): + @wraps(func) + def inner(*args, **kwargs): + start = time.time() + r = func(*args, **kwargs) + end = time.time() + print("%s cost time: %s" % (func.__name__, end - start)) + return r + + return inner + + +class DefModelData: + PLUGIN_LIBRARY = None + BATCH_SIZE = 1 + MEM_SIZE = 1 << 28 # 256MiB ; 1 << 28/1024/1024=256 + # MEM_SIZE = common.GiB(2) # 1G + + DTYPE = trt.float16 + NP_DTYPE = np.float16 + INPUT_SHAPE = [1, 3, 32, 32] + OUTPUT_SIZE = [-1, 10] # [-1,10] + + onnx_file_path = "model.onnx" + engine_file_path = "model.trt" + + model_file_path = "model.npz" + INPUT_NAME = 'input' + OUTPUT_NAME = 'output' + + +def loadEngine(engine_file_path: str = "./model.engine"): + # If a serialized engine exists, use it instead of building an engine. 
+ print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + +@timeit +def runEngineInfer(data=np.ones([1, 3, 32, 32]), ModelData=None): + if ModelData is None: ModelData = DefModelData + engine_file_path = ModelData.engine_file_path + with loadEngine(engine_file_path) as engine, engine.create_execution_context() as context: + inputs, outputs, bindings, stream = common.allocate_buffers(engine) + + len_data = len(data) + data = data.ravel().astype(ModelData.NP_DTYPE) # 展成一行 + np.copyto(inputs[0].host, data) + + # [output] = common.do_inference(context, bindings=bindings, \ + # inputs=inputs, outputs=outputs, stream=stream, \ + # batch_size=ModelData.BATCH_SIZE) + + [output] = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, + stream=stream) + + output = np.reshape(output, ModelData.OUTPUT_SIZE)[:len_data] # 转成[-1,10] + pred = np.argmax(output, -1) + print("Prediction: " + str(pred)) + + +if __name__ == "__main__": + ModelData = DefModelData + ModelData.BATCH_SIZE = 64 + ModelData.INPUT_SHAPE = [64, 3, 224, 224] + ModelData.OUTPUT_SIZE = [-1, 1000] + ModelData.engine_file_path = "model.trt" + runEngineInfer(data=np.random.randn(64, 3, 224, 224), ModelData=ModelData) + +``` + +## test + +- 0.8920[cpu];0.0841[cuda] + +```python +import torch +import torchvision.models as models +import torch.backends.cudnn as cudnn +import time + +cudnn.benchmark = True +device = "cpu" # "cuda" # 0.8920321515493561;0.08411713584093378 +nums = 100 +nwarmup = 5 +model = models.resnet18().eval().to(device) +datas = torch.randn(64, 3, 224, 224).to(device) +print("Warm up ...") +with torch.no_grad(): + for _ in range(nwarmup): + features = model(datas) + +start = time.perf_counter() +with torch.no_grad(): + for _ in range(100): + datas = torch.randn(64, 3, 224, 224).to(device) + output = model(datas) + pred = output.argmax(-1) + +end = time.perf_counter() +print(f"mean_time:{((end - start) / nums):.5f}") + +``` + +- mean_time:0.04328 + +```python +from __future__ import print_function + +import time +import numpy as np +import tensorrt as trt +# import pycuda.driver as cuda +# import pycuda.autoinit + +import os +# import sys +# sys.path.insert(1, os.path.join(sys.path[0], ".")) +import common +# import layers_trt as lytrt + +import time +from functools import wraps, partial + +TRT_LOGGER = trt.Logger() + + +def timeit(func): + @wraps(func) + def inner(*args, **kwargs): + start = time.time() + r = func(*args, **kwargs) + end = time.time() + print("%s cost time: %s" % (func.__name__, end - start)) + return r + + return inner + + +class DefModelData: + PLUGIN_LIBRARY = None + BATCH_SIZE = 1 + MEM_SIZE = 1 << 28 # 256MiB ; 1 << 28/1024/1024=256 + # MEM_SIZE = common.GiB(2) # 1G + + DTYPE = trt.float16 + NP_DTYPE = np.float16 + INPUT_SHAPE = [1, 3, 32, 32] + OUTPUT_SIZE = [-1, 10] # [-1,10] + + onnx_file_path = "model.onnx" + engine_file_path = "model.trt" + + model_file_path = "model.npz" + INPUT_NAME = 'input' + OUTPUT_NAME = 'output' + + +def loadEngine(engine_file_path: str = "./model.engine"): + # If a serialized engine exists, use it instead of building an engine. 
+ print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + +@timeit +def runEngineInfer(data=np.ones([1, 3, 32, 32]), ModelData=None): + if ModelData is None: ModelData = DefModelData + engine_file_path = ModelData.engine_file_path + with loadEngine(engine_file_path) as engine, engine.create_execution_context() as context: + inputs, outputs, bindings, stream = common.allocate_buffers(engine) + len_data = len(data) + nums = 100 + nwarmup = 5 + print("Warm up ...") + for _ in range(nwarmup): + data = data.ravel().astype(ModelData.NP_DTYPE) # 展成一行 + np.copyto(inputs[0].host, data) + [output] = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, + stream=stream) + output = np.reshape(output, ModelData.OUTPUT_SIZE)[:len_data] # 转成[-1,10] + pred = np.argmax(output, -1) + # print("Prediction: " + str(pred)) + + start = time.perf_counter() + for _ in range(nums): + data = data.ravel().astype(ModelData.NP_DTYPE) # 展成一行 + np.copyto(inputs[0].host, data) + [output] = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, + stream=stream) + output = np.reshape(output, ModelData.OUTPUT_SIZE)[:len_data] # 转成[-1,10] + pred = np.argmax(output, -1) + # print("Prediction: " + str(pred)) + end = time.perf_counter() + print(f"mean_time:{((end - start) / nums):.5f}") + + +if __name__ == "__main__": + ModelData = DefModelData + ModelData.BATCH_SIZE = 64 + ModelData.INPUT_SHAPE = [64, 3, 224, 224] + ModelData.OUTPUT_SIZE = [-1, 1000] + ModelData.engine_file_path = "model.trt" + runEngineInfer(data=np.random.randn(64, 3, 224, 224), ModelData=ModelData) + +``` + diff --git a/readme.md b/readme.md index c08534f00bdab05854942b59092c94700e6e8c69..5e2090fd94ed6d23d9b88a341802e7e90daf99b8 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,6 @@ - https://www.cvmart.net/community/detail/7040 - https://www.cvmart.net/community/detail/5609 +- https://github.com/pytorch/TensorRT # 1、训练模型 python训练 (略过) # 2、模型推理部署 diff --git a/readme_CN.md b/readme_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..a3f0f002f2eb646a09b8ae6204bc8928428d83e1 --- /dev/null +++ b/readme_CN.md @@ -0,0 +1,330 @@ +- https://pytorch.org/cppdocs/ +- https://pytorch.org/get-started/locally/ # 下载 libtorch库 + +- [LibTorch的安装与基本使用](https://zhuanlan.zhihu.com/p/513571175) + +- https://docs.openvino.ai/2023.3/openvino_docs_install_guides_installing_openvino_apt.html # openvino c++安装 + +- https://github.com/openvinotoolkit/openvino + +- https://github.com/openvinotoolkit/openvino_notebooks + +- https://github.com/microsoft/onnxruntime + +- https://github.com/microsoft/onnxruntime-inference-examples + +- https://github.com/pytorch/TensorRT + +- https://github.com/NVIDIA/TensorRT + +- https://github.com/onnx/onnx-tensorrt + +- https://github.com/wang-xinyu/tensorrtx + + + +# 1、安装 opencv(c++) + +- [Linux安装Opencv(C++)](https://blog.csdn.net/weixin_44384491/article/details/121142093) + +# 2、安装libtorch + +下载的libtorch的版本最好和你的pytorch的版本一致。Linux下各个libtorch的release版本的下载链接可以在下面这篇文章中找到: + +## 2.2.1-cu118 + +>Download here (Pre-cxx11 ABI): +>https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.2.1%2Bcu118.zip +>Download here (cxx11 ABI): +>https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu118.zip + +## 1.11.0-cu115 + +>https://download.pytorch.org/libtorch/cu115 + +## 2.2.1-cpu + +>Download here 
(Pre-cxx11 ABI): +>https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-2.2.1%2Bcpu.zip +>Download here (cxx11 ABI): +>https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcpu.zip + + + +下载完成后,随便丢到一个地方去解压,完成。比如我是习惯性放在/usr/local/lib下的。我也建议linux小白将libtorch放在 /usr/local/lib下,并保证libtorch文件夹下存在include这个文件夹。 + +## 配置CMakeLists.txt + + + +```py +## 目录结构 +xxxx + - CMakeLists.txt + - digit.cpp + - digit.py +``` + + + +```cmake +# CMakeLists.txt + +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +project(LibTorchDemo) + +# compile options +set(CMAKE_CXX_FLAGS_RELEASE "-O3") +set(CMAKE_CXX_STANDARD 14) + +# package +find_package(OpenCV REQUIRED) +find_package(Torch REQUIRED PATHS "/usr/local/lib/libtorch") + +add_executable(digit digit.cpp) +# libtorch +target_link_libraries(digit ${TORCH_LIBRARIES}) +target_link_libraries(digit ${OpenCV_LIBS}) +``` + +> "/usr/local/lib/libtorch" :Torch的package路径根据你的安装目录指定 + +然后写个文件include一下库,文件名为digit.cpp: + +```c++ +#include "iostream" +#include "opencv2/opencv.hpp" +#include "torch/script.h" + +int main(int argc, char const *argv[]) +{ + std::cout << "hello world!" << std::endl; + return 0; +} +``` + +编译一把: + +```bash +$mkdir build +$cd build +$cmake .. && make -j8 install +``` + +> 如果出现错误,基本都是找不到头文件或者静态库,如果找不到头文件,在CMakeLists.txt中include_directories()中添加能够搜索到你在cpp中写的相对路径的根目录路径。如果静态库找不到,请检查安装包是否损坏,或者静态库目录是否在gcc的搜索路径中。 + +## 第一步:先用PyTorch训练一个网络 + +既然我们需要将PyTorch模型使用C++部署,那么首先肯定需要一个Torch的模型。我们先使用PyTorch简单训练一个手写数字识别,相信看这篇文章的靓仔都是torch老手了,我直接上代码: + +> 如果你已经有一个模型文件了,请直接跳转到第二步 + +```python +from sklearn.datasets import load_digits +import torch +from torch import nn +import torch.utils.data as Data +import numpy as np +from sklearn.metrics import accuracy_score +import matplotlib.pyplot as plt +import os + +class Digit(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(1, 16, 3, 1, 1), + nn.Tanh(), + nn.Conv2d(16, 32, 3, 2, 1), + nn.Tanh(), + nn.Conv2d(32, 16, 3, 2, 1), + nn.Tanh(), + nn.Conv2d(16, 8, 3, 1, 1) + ) + + self.output = nn.Linear(32, 10) + + def forward(self, x): + out = self.conv(x) + out = self.output(out.flatten(1)) + return out + +RATIO = 0.8 +BATCH_SIZE = 128 +EPOCH = 10 + +if __name__ == "__main__": + X, y = load_digits(return_X_y=True) + X = X / 16. 
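+    # load_digits pixels are integers in [0, 16], so dividing by 16 rescales them to [0, 1]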
+ sample_num = len(y) + X = [x.reshape(1, 8, 8).tolist() for x in X] + + indice = np.arange(sample_num) + np.random.shuffle(indice) + + X = torch.FloatTensor(X) + y = torch.LongTensor(y) + offline = int(sample_num * RATIO) + + train = Data.TensorDataset(X[indice[:offline]], y[indice[:offline]]) + test = Data.TensorDataset(X[indice[offline:]], y[indice[offline:]]) + + train_loader = Data.DataLoader(train, BATCH_SIZE, True) + test_loader = Data.DataLoader(test, BATCH_SIZE, False) + + model = Digit() + optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3) + criterion = nn.CrossEntropyLoss(reduction="mean") + + test_losses = [] + test_accs = [] + + for epoch in range(EPOCH): + model.train() + for bx, by in train_loader: + out = model(bx) + loss = criterion(out, by) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + model.eval() + correct = 0 + total = 0 + test_loss = [] + test_acc = [] + for bx, by in test_loader: + with torch.no_grad(): + out = model(bx) + pre_lab = out.argmax(1) + loss = criterion(out, by) + + test_loss.append(loss.item()) + test_acc.append(accuracy_score(pre_lab, by)) + + test_losses.append(np.mean(test_loss)) + test_accs.append(np.mean(test_acc)) + + plt.figure(dpi=120) + plt.plot(test_losses, 'o-', label="loss") + plt.plot(test_accs, 'o-', label="accuracy") + plt.legend() + plt.grid() + plt.show() + + if not os.path.exists("model"): + os.makedirs("model") + torch.save(model.state_dict(), "model/digit.pth") +``` + +## 第二步:使用tracing将模型文件转化成TorchScript + +PyTorch导出的模型文件是不能直接被libtorch读取的,因为PyTorch默认导出的后端的序列化是joblib。PyTorch通过JIT搭建了Python和C++的桥梁,我们可以将模型转成TorchScript Module,将Python运行时的部分运行时包裹进去。 + +转换方法非常简单: + +```py +import torch +from digit import Digit + +model = Digit() +model.load_state_dict(torch.load("model/digit.pth", map_location="cpu")) + +sample = torch.randn(1, 1, 8, 8) + +trace_model = torch.jit.trace(model, sample) +trace_model.save("model/digit.jit") +``` + +运行下述测试代码,由于Python本身的特性和JIT的即时编译的特性,模型在同一进程生命周期内运行时前几次会比较慢,所以在测试前,需要空跑几次: + +## 第三步:使用libtorch重写推理程序 + +由于TorchScript可以被C++直接调用,所以我们只需要使用libtorch重写推理代码,并将模型读入就完成了。 + +libtorch的语法和PyTorch基本一致,学起来很快,于此锦恢就不再赘述了。相应的,在C++中,我们用cv::Mat来取代Python中的numpy.ndarray对象,如何将cv::Mat转成libtorch可以读入的数据结构也会在demo中涉及。 + +下面的例子会完成一个C++命令行程序,它的第一个参数为模型,第二个参数为需要读入的手写数字图像的路径,预测结果会打印到控制台上。期待已久的C++代码如下: + +```c +#include "iostream" +#include "opencv2/opencv.hpp" +#include "torch/script.h" +#include "fstream" + +void checkPath(const char* path) { + std::ifstream in; + in.open(path); + bool flag = (bool)in; + in.close(); + if (flag) return; + else { + std::cout << "file " << path << " doesn't exist!" << std::endl; + exit(-1); + } +} + +int main(int argc, char const *argv[]) +{ + if (argc != 3) { + std::cout << "usage : digit " << std::endl; + return -1; + } + + checkPath(argv[1]); + checkPath(argv[2]); + cv::Mat img = cv::imread(argv[2]), gimg, fimg, rimg; + cv::cvtColor(img, gimg, CV_BGR2GRAY); + + gimg.convertTo(fimg, CV_32F, - 1. 
/ 255., 1.);
+    cv::resize(fimg, rimg, {8, 8});
+
+    // convert Mat to tensor
+    at::Tensor img_tensor = torch::from_blob(
+        rimg.data,
+        {1, 1, 8, 8},
+        torch::kFloat32
+    );
+
+    // load model
+    torch::jit::Module model = torch::jit::load(argv[1]);
+
+    // torch.no_grad()
+    torch::NoGradGuard no_grad; // 请一定加入torch::NoGradGuard no_grad; 这句话,否则内存会炸。
+
+    // forward
+    torch::Tensor out = model.forward({img_tensor}).toTensor();
+    int pre_lab = torch::argmax(out, 1).item().toInt();
+
+    std::cout << "predict number is " << pre_lab << std::endl;
+    return 0;
+}
+```
+
+
+
+# 实践
+
+```py
+# 安装环境依赖
+sudo apt-get update -y
+sudo apt-get install cmake -y
+sudo apt-get install build-essential libgtk2.0-dev libavcodec-dev libavformat-dev libjpeg-dev libswscale-dev libtiff5-dev -y
+sudo apt-get install libgtk2.0-dev -y
+sudo apt-get install pkg-config -y
+
+# 安装opencv
+wget https://github.com/opencv/opencv/archive/4.9.0.zip
+unzip 4.9.0.zip
+cd opencv-4.9.0
+mkdir build
+cd build
+cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local ..
+make -j8
+make install
+
+# 安装libtorch
+wget https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip
+unzip libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip
+cp -r libtorch /usr/local/lib
+```
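+
+With the dependencies installed, the demo can be configured against the unpacked libtorch either through the `PATHS` hint already present in demo/CMakeLists.txt, or explicitly via `CMAKE_PREFIX_PATH` (a minimal sketch):
+
+```bash
+cd demo
+mkdir build && cd build
+# point find_package(Torch) at the unpacked libtorch
+cmake -DCMAKE_PREFIX_PATH=/usr/local/lib/libtorch ..
+make -j4
+```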