From d58c70223e57a004d55d54cc10712905140f3b4b Mon Sep 17 00:00:00 2001
From: an1018 <614803115@qq.com>
Date: Fri, 14 Oct 2022 18:45:39 +0800
Subject: [PATCH] add_pdf2docx_api

---
 ppstructure/recovery/README.md        | 9 ++++++++-
 ppstructure/recovery/README_ch.md     | 9 ++++++++-
 ppstructure/recovery/requirements.txt | 3 +--
 3 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md
index 41fb3e45..209c995f 100644
--- a/ppstructure/recovery/README.md
+++ b/ppstructure/recovery/README.md
@@ -86,7 +86,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
 
 - **(2) Install recovery `requirements`**
 
-The layout restoration is exported as docx files, so python-docx API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format. And if using pdf parse method, we need to install pdf2docx api.
+The layout restoration is exported as docx files, so the python-docx API needs to be installed, and the PyMuPDF API ([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) needs to be installed to process input files in PDF format.
 
 Install all the libraries by running the following command:
 
@@ -94,6 +94,13 @@ Install all the libraries by running the following command:
 python3 -m pip install -r ppstructure/recovery/requirements.txt
 ````
 
+If you use the PDF parse method, you also need to install the optimized pdf2docx package:
+
+```bash
+wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl
+pip3 install pdf2docx-0.0.0-py3-none-any.whl
+```
+
 <a name="3"></a>
 
 ## 3. Quick Start using PDF parse
diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md
index eaa5260b..5ef823d4 100644
--- a/ppstructure/recovery/README_ch.md
+++ b/ppstructure/recovery/README_ch.md
@@ -82,7 +82,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
 
 - **（2）安装recovery的`requirements`**
 
-版面恢复导出为docx文件，所以需要安装Python处理word文档的python-docx API，同时处理pdf格式的输入文件，需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。使用pdf2docx库解析的方式恢复文档需要安装pdf2docx等。
+版面恢复导出为docx文件，所以需要安装Python处理word文档的python-docx API，同时处理pdf格式的输入文件，需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。
 
 通过如下命令安装全部库：
 
@@ -90,6 +90,13 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
 python3 -m pip install -r ppstructure/recovery/requirements.txt
 ```
 
+使用pdf2docx库解析的方式恢复文档需要安装优化的pdf2docx。
+
+```bash
+wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl
+pip3 install pdf2docx-0.0.0-py3-none-any.whl
+```
+
 <a name="3"></a>
 
 ## 3.使用 PDF解析进行版面恢复
diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt
index d67e0a95..4e4239a1 100644
--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
@@ -2,5 +2,4 @@ python-docx
 PyMuPDF==1.19.0
 beautifulsoup4
 fonttools>=4.24.0
-fire>=0.3.0
-pdf2docx==0.0.0
\ No newline at end of file
+fire>=0.3.0
\ No newline at end of file
-- 
GitLab