update doc

d5d78b48 · an1018 · fe89056c · d5d78b48 · d5d78b48 · d5d78b48
12 changed file
--- a/deploy/hubserving/readme.md
+++ b/deploy/hubserving/readme.md
@@ -20,7 +20,7 @@ PaddleOCR提供2种服务部署方式：
 # 基于PaddleHub Serving的服务部署
-hubserving服务部署目录下包括文本检测、文本方向分类，文本识别、文本检测+文本方向分类+文本识别3阶段串联，表格识别和PP-Structure六种服务包，请根据需求选择相应的服务包进行安装和启动。目录结构如下：
+hubserving服务部署目录下包括文本检测、文本方向分类，文本识别、文本检测+文本方向分类+文本识别3阶段串联，表格识别、PP-Structure和版面分析七种服务包，请根据需求选择相应的服务包进行安装和启动。目录结构如下：
 ```
 deploy/hubserving/
  └─  ocr_cls     文本方向分类模块服务包
@@ -29,6 +29,7 @@ deploy/hubserving/
  └─  ocr_system  文本检测+文本方向分类+文本识别串联服务包
  └─  structure_table  表格识别服务包
  └─  structure_system  PP-Structure服务包
+  └─  structure_layout  版面分析服务包
 ```
 每个服务包下包含3个文件。以2阶段串联服务包为例，目录如下：
@@ -43,6 +44,7 @@ deploy/hubserving/ocr_system/
 * 2022.05.05 新增PP-OCRv3检测和识别模型。
 * 2022.03.30 新增PP-Structure和表格识别两种服务。
+* 2022.08.23 新增版面分析服务。
 ## 2. 快速启动服务
 以下步骤以检测+识别2阶段串联服务为例，如果只需要检测服务或识别服务，替换相应文件路径即可。
@@ -59,7 +61,7 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple
 检测模型：./inference/ch_PP-OCRv3_det_infer/
 识别模型：./inference/ch_PP-OCRv3_rec_infer/
 方向分类器：./inference/ch_ppocr_mobile_v2.0_cls_infer/
-版面分析模型：./inference/layout_infer/
+版面分析模型：./inference/picodet_lcnet_x1_0_fgd_layout_infer/
 表格结构识别模型：./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
 ```
@@ -87,6 +89,9 @@ hub install deploy/hubserving/structure_table/
 # 或，安装PP-Structure服务模块：  
 hub install deploy/hubserving/structure_system/
+# 或，安装版面分析服务模块：  
+hub install deploy/hubserving/structure_layout/
 ```
 * 在Windows环境下(文件夹的分隔符为`\`)，安装示例如下：
@@ -108,6 +113,9 @@ hub install deploy\hubserving\structure_table\
 # 或，安装PP-Structure服务模块：  
 hub install deploy\hubserving\structure_system\
+# 或，安装版面分析服务模块：
+hub install deploy\hubserving\structure_layout\
 ```
 ### 2.4 启动服务
@@ -185,6 +193,7 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
 `http://127.0.0.1:8868/predict/ocr_system`  
 `http://127.0.0.1:8869/predict/structure_table`  
 `http://127.0.0.1:8870/predict/structure_system`  
+`http://127.0.0.1:8871/predict/structure_layout`  
 - **image_dir**：测试图像路径，可以是单张图片路径，也可以是图像集合目录路径  
 - **visualize**：是否可视化结果，默认为False  
 - **output**：可视化结果保存路径，默认为`./hubserving_result`
@@ -203,17 +212,19 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
 |text_region|list|文本位置坐标|
 |html|str|表格的html字符串|
 |regions|list|版面分析+表格识别+OCR的结果，每一项为一个list，包含表示区域坐标的`bbox`，区域类型的`type`和区域结果的`res`三个字段|
+|layout|list|版面分析的结果，每一项为一个dict，包含版面区域坐标的`bbox`和区域类型的`label`|
 不同模块返回的字段不同，如，文本识别服务模块返回结果不含`text_region`字段，具体信息如下：
-| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system |
+| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout |
-|  ---  |  ---  |  ---  |  ---  |  ---  | ---  |---  |
+|  ---  |  ---  |  ---  |  ---  |  ---  | ---  |  ---  |  ---  |
-|angle| | ✔ | | ✔ | ||
+|angle| | ✔ | | ✔ | |||
-|text| | |✔|✔| | ✔ |
+|text| | |✔|✔| | ✔ |  |
-|confidence| |✔ |✔| | | ✔|
+|confidence| |✔ |✔| | | ✔| |
-|text_region| ✔| | |✔ | | ✔|
+|text_region| ✔| | |✔ | | ✔| |
-|html| | | | |✔ |✔|
+|html| | | | |✔ |✔||
-|regions| | | | |✔ |✔ |
+|regions| | | | |✔ |✔ | |
+|layout| | | | | | | ✔ |
 **说明：** 如果需要增加、删除、修改返回字段，可在相应模块的`module.py`文件中进行修改，完整流程参考下一节自定义修改服务模块。

--- a/deploy/hubserving/readme_en.md
+++ b/deploy/hubserving/readme_en.md
@@ -20,7 +20,7 @@ PaddleOCR provides 2 service deployment methods:
 # Service deployment based on PaddleHub Serving  
-The hubserving service deployment directory includes six service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows:  
+The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, table recognition, PP-Structure and layout analysis. Please select the corresponding service package to install and start service according to your needs. The directory is as follows:  
 ```
 deploy/hubserving/
  └─  ocr_det     text detection module service package
@@ -29,6 +29,7 @@ deploy/hubserving/
  └─  ocr_system  text detection+text angle class+text recognition three-stage series connection service package
  └─  structure_table  table recognition service package
  └─  structure_system  PP-Structure service package
+  └─  structure_layout  layout analysis service package
 ```
 Each service pack contains 3 files. Take the 2-stage series connection service package as an example, the directory is as follows:  
@@ -43,6 +44,7 @@ deploy/hubserving/ocr_system/
 * 2022.05.05 add PP-OCRv3 text detection and recognition models.
 * 2022.03.30 add PP-Structure and table recognition services。
+* 2022.08.23 add layout analysis service.
 ## 2. Quick start service
@@ -61,7 +63,7 @@ Before installing the service module, you need to prepare the inference model an
 text detection model: ./inference/ch_PP-OCRv3_det_infer/
 text recognition model: ./inference/ch_PP-OCRv3_rec_infer/
 text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/
-layout parse model: ./inference/layout_infer/
+layout parse model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/
 tanle recognition: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
 ```  
@@ -89,6 +91,9 @@ hub install deploy/hubserving/structure_table/
 # Or install PP-Structure service module
 hub install deploy/hubserving/structure_system/
+# Or install layout analysis service module
+hub install deploy/hubserving/structure_layout/
 ```
 * On Windows platform, the examples are as follows.
@@ -110,6 +115,9 @@ hub install deploy/hubserving/structure_table/
 # Or install PP-Structure service module
 hub install deploy\hubserving\structure_system\
+# Or install layout analysis service module
+hub install deploy\hubserving\structure_layout\
 ```
 ### 2.4 Start service
@@ -192,6 +200,7 @@ For example, if using the configuration file to start the text angle classificat
 `http://127.0.0.1:8868/predict/ocr_system`  
 `http://127.0.0.1:8869/predict/structure_table`  
 `http://127.0.0.1:8870/predict/structure_system`  
+`http://127.0.0.1:8871/predict/structure_layout`  
 - **image_dir**：Test image path, can be a single image path or an image directory path
 - **visualize**：Whether to visualize the results, the default value is False
 - **output**：The floder to save Visualization result, default value is `./hubserving_result`
@@ -212,17 +221,19 @@ The returned result is a list. Each item in the list is a dict. The dict may con
 |text_region|list|text location coordinates|
 |html|str|table html str|
 |regions|list|The result of layout analysis + table recognition + OCR, each item is a list, including `bbox` indicating area coordinates, `type` of area type and `res` of area results|
+|layout|list|The result of layout analysis, each item is a dict, including `bbox` indicating the area coordinates and `label` indicating the area type|
 The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows:
-| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system |
+| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout |
-|  ---  |  ---  |  ---  |  ---  |  ---  | ---  |---  |
+|  ---  |  ---  |  ---  |  ---  |  ---  | ---  |---  |---  |
-|angle| | ✔ | | ✔ | ||
+|angle| | ✔ | | ✔ | || |
-|text| | |✔|✔| | ✔ |
+|text| | |✔|✔| | ✔ | |
-|confidence| |✔ |✔| | | ✔|
+|confidence| |✔ |✔| | | ✔| |
-|text_region| ✔| | |✔ | | ✔|
+|text_region| ✔| | |✔ | | ✔| |
-|html| | | | |✔ |✔|
+|html| | | | |✔ |✔| |
-|regions| | | | |✔ |✔ |
+|regions| | | | |✔ |✔ | |
+|layout| | | | | | |✔ |
 **Note：** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section.

--- a/deploy/hubserving/structure_layout/__init__.py
+++ b/deploy/hubserving/structure_layout/__init__.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
--- a/deploy/hubserving/structure_layout/config.json
+++ b/deploy/hubserving/structure_layout/config.json
+{
+    "modules_info": {
+        "structure_layout": {
+            "init_args": {
+                "version": "1.0.0",
+                "use_gpu": true
+            },
+            "predict_args": {
+            }
+        }
+    },
+    "port": 8871,
+    "use_multiprocess": false,
+    "workers": 2
+}
--- a/deploy/hubserving/structure_layout/module.py
+++ b/deploy/hubserving/structure_layout/module.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.insert(0, ".")
+import copy
+import time
+import paddlehub
+from paddlehub.common.logger import logger
+from paddlehub.module.module import moduleinfo, runnable, serving
+import cv2
+import paddlehub as hub
+from tools.infer.utility import base64_to_cv2
+from ppstructure.layout.predict_layout import LayoutPredictor as _LayoutPredictor
+from ppstructure.utility import parse_args
+from deploy.hubserving.structure_layout.params import read_params
+@moduleinfo(
+    name="structure_layout",
+    version="1.0.0",
+    summary="PP-Structure layout service",
+    author="paddle-dev",
+    author_email="paddle-dev@baidu.com",
+    type="cv/structure_layout")
+class LayoutPredictor(hub.Module):
+    def _initialize(self, use_gpu=False, enable_mkldnn=False):
+        """
+        initialize with the necessary elements
+        """
+        cfg = self.merge_configs()
+        cfg.use_gpu = use_gpu
+        if use_gpu:
+            try:
+                _places = os.environ["CUDA_VISIBLE_DEVICES"]
+                int(_places[0])
+                print("use gpu: ", use_gpu)
+                print("CUDA_VISIBLE_DEVICES: ", _places)
+                cfg.gpu_mem = 8000
+            except:
+                raise RuntimeError(
+                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
+                )
+        cfg.ir_optim = True
+        cfg.enable_mkldnn = enable_mkldnn
+        self.layout_predictor = _LayoutPredictor(cfg)
+    def merge_configs(self):
+        # default cfg
+        backup_argv = copy.deepcopy(sys.argv)
+        sys.argv = sys.argv[:1]
+        cfg = parse_args()
+        update_cfg_map = vars(read_params())
+        for key in update_cfg_map:
+            cfg.__setattr__(key, update_cfg_map[key])
+        sys.argv = copy.deepcopy(backup_argv)
+        return cfg
+    def read_images(self, paths=[]):
+        images = []
+        for img_path in paths:
+            assert os.path.isfile(
+                img_path), "The {} isn't a valid file.".format(img_path)
+            img = cv2.imread(img_path)
+            if img is None:
+                logger.info("error in loading image:{}".format(img_path))
+                continue
+            images.append(img)
+        return images
+    def predict(self, images=[], paths=[]):
+        """
+        Get the layout analysis results of the predicted images.
+        Args:
+            images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. Pass either images or paths, not both.
+            paths (list[str]): The paths of images. Pass either paths or images, not both.
+        Returns:
+            res (list): The layout results of images.
+        """
+        if images != [] and isinstance(images, list) and paths == []:
+            predicted_data = images
+        elif images == [] and isinstance(paths, list) and paths != []:
+            predicted_data = self.read_images(paths)
+        else:
+            raise TypeError("The input data is inconsistent with expectations.")
+        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."
+        all_results = []
+        for img in predicted_data:
+            if img is None:
+                logger.info("error in loading image")
+                all_results.append([])
+                continue
+            starttime = time.time()
+            res, _ = self.layout_predictor(img)
+            elapse = time.time() - starttime
+            logger.info("Predict time: {}".format(elapse))
+            for item in res:
+                item['bbox'] = item['bbox'].tolist()
+            all_results.append({'layout': res})
+        return all_results
+    @serving
+    def serving_method(self, images, **kwargs):
+        """
+        Run as a service.
+        """
+        images_decode = [base64_to_cv2(image) for image in images]
+        results = self.predict(images_decode, **kwargs)
+        return results
+if __name__ == '__main__':
+    layout = LayoutPredictor()
+    layout._initialize()
+    image_path = ['./ppstructure/docs/table/1.png']
+    res = layout.predict(paths=image_path)
+    print(res)
--- a/deploy/hubserving/structure_layout/params.py
+++ b/deploy/hubserving/structure_layout/params.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+class Config(object):
+    pass
+def read_params():
+    cfg = Config()
+    # params for layout analysis
+    cfg.layout_model_dir = './inference/picodet_lcnet_x1_0_fgd_layout_infer/'
+    cfg.layout_dict_path = './ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt'
+    cfg.layout_score_threshold = 0.5
+    cfg.layout_nms_threshold = 0.5
+    return cfg
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -286,11 +286,17 @@ MODEL_URLS = {
                }
            },
            'layout': {
-                'ch': {
+                'en': {
                    'url':
-                    'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar',
+                    'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar',
                    'dict_path':
                    'ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt'
+                },
+                'ch': {
+                    'url':
+                    'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar',
+                    'dict_path':
+                    'ppocr/utils/dict/layout_dict/layout_cdla_dict.txt'
                }
            }
        }
@@ -634,6 +640,20 @@ def main():
            result = engine(img_path)
            save_structure_res(result, args.output, img_name)
+            if args.recovery:
+                try:
+                    from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
+                    img = cv2.imread(img_path)
+                    h, w, _ = img.shape
+                    res = sorted_layout_boxes(result, w)
+                    convert_info_docx(img, res, args.output, img_name,
+                                      args.save_pdf)
+                except Exception as ex:
+                    logger.error(
+                        "error in layout recovery image:{}, err msg: {}".format(
+                            img_name, ex))
+                    continue
            for item in result:
                item.pop('img')
                item.pop('res')

--- a/ppstructure/docs/quickstart.md
+++ b/ppstructure/docs/quickstart.md
@@ -51,10 +51,14 @@ pip3 install "paddleocr>=2.6"
 pip3 install paddleclas
 # 安装 关键信息抽取 依赖包（如不需要KIE功能，可跳过）
-pip3 install -r kie/requirements.txt
+pip3 install -r ppstructure/kie/requirements.txt
+# 安装 版面恢复 依赖包（如不需要版面恢复功能，可跳过）
+pip3 install -r ppstructure/recovery/requirements.txt
 ```
 <a name="2"></a>
 ## 2. 便捷使用
 <a name="21"></a>
@@ -94,7 +98,10 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout
 #### 2.1.6 版面恢复
 ```bash
+# 中文测试图
 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true
+# 英文测试图
+paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
 ```
 <a name="22"></a>
@@ -215,9 +222,12 @@ for line in result:
 import os
 import cv2
 from paddleocr import PPStructure,save_structure_res
-from paddelocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
+from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
-table_engine = PPStructure(layout=False, show_log=True)
+# 中文测试图
+table_engine = PPStructure(recovery=True)
+# 英文测试图
+# table_engine = PPStructure(recovery=True, lang='en')
 save_folder = './output'
 img_path = 'ppstructure/docs/table/1.png'
@@ -230,8 +240,8 @@ for line in result:
    print(line)
 h, w, _ = img.shape
-res = sorted_layout_boxes(res, w)
+res = sorted_layout_boxes(result, w)
-convert_info_docx(img, result, save_folder, os.path.basename(img_path).split('.')[0])
+convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
 ```
 <a name="23"></a>

--- a/ppstructure/docs/quickstart_en.md
+++ b/ppstructure/docs/quickstart_en.md
@@ -29,10 +29,17 @@
 ```bash
 # Install paddleocr, version 2.6 is recommended
 pip3 install "paddleocr>=2.6"
-# Install the KIE dependency packages (if you do not use the KIE, you can skip it)
-pip install -r kie/requirements.txt
 # Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it)
 pip3 install paddleclas
+# Install the KIE dependency packages (if you do not use the KIE, you can skip it)
+pip3 install -r ppstructure/kie/requirements.txt
+# Install the layout recovery dependency packages (if you do not use the layout recovery, you can skip it)
+pip3 install -r ppstructure/recovery/requirements.txt
 ```
 <a name="2"></a>
@@ -73,8 +80,11 @@ Please refer to: [Key Information Extraction](../kie/README.md) .
 <a name="216"></a>
 #### 2.1.6 layout recovery
-```bash
+```
+# Chinese pic
 paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true
+# English pic
+paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
 ```
 <a name="22"></a>
@@ -192,12 +202,15 @@ Please refer to: [Key Information Extraction](../kie/README.md) .
 import os
 import cv2
 from paddleocr import PPStructure,save_structure_res
-from paddelocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
+from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
-table_engine = PPStructure(layout=False, show_log=True)
+# Chinese image
+table_engine = PPStructure(recovery=True)
+# English image
+# table_engine = PPStructure(recovery=True, lang='en')
 save_folder = './output'
-img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
+img_path = 'ppstructure/docs/table/1.png'
 img = cv2.imread(img_path)
 result = table_engine(img)
 save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
@@ -207,8 +220,8 @@ for line in result:
    print(line)
 h, w, _ = img.shape
-res = sorted_layout_boxes(res, w)
+res = sorted_layout_boxes(result, w)
-convert_info_docx(img, result, save_folder, os.path.basename(img_path).split('.')[0])
+convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
 ```
 <a name="23"></a>

--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@@ -77,7 +77,7 @@ class StructureSystem(object):
        elif self.mode == 'kie':
            raise NotImplementedError
-    def __call__(self, img, img_idx=0, return_ocr_result_in_table=False):
+    def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
        time_dict = {
            'image_orientation': 0,
            'layout': 0,

--- a/ppstructure/recovery/recovery_to_doc.py
+++ b/ppstructure/recovery/recovery_to_doc.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import cv2
 import os
-import pypandoc
 from copy import deepcopy
 from docx import Document
@@ -30,7 +28,7 @@ from ppocr.utils.logging import get_logger
 logger = get_logger()
-def convert_info_docx(img, res, save_folder, img_name, save_pdf):
+def convert_info_docx(img, res, save_folder, img_name, save_pdf=False):
    doc = Document()
    doc.styles['Normal'].font.name = 'Times New Roman'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
-pypandoc
 python-docx
 docx2pdf
 fitz
 PyMuPDF
+beautifulsoup4
\ No newline at end of file