Unverified commit 3f5c62d7, authored by 小小的香辛料, committed by GitHub

update download.md (#5591)

Parent commit: 077fbedf
---
title: ERNIE-Layout
emoji: 🧾
colorFrom: gray
colorTo: pink
sdk: gradio
sdk_version: 3.4.1
app_file: app.py
pinned: false
license: apache-2.0
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
#-*- coding: UTF-8 -*-
# Copyright 2022 The Impira Team and the HuggingFace Team.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import base64
from io import BytesIO
from PIL import Image
import traceback
import requests
import numpy as np
import gradio as gr
import pdf2image
import fitz
import cv2
fitz_tools = fitz.Tools()
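# Fallback renderer: rasterize the requested PDF pages with pdf2image (poppler)
# and flip the RGB output to BGR numpy arrays for OpenCV-style handling.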
def pdf2img(stream, pagenos, dpi=300, thread_count=3, height=1600):
images = []
cimages = pdf2image.convert_from_bytes(
stream, dpi=dpi, thread_count=thread_count, first_page=pagenos[0] + 1, last_page=pagenos[-1] + 1,
size=height)
for _image in cimages:
image = np.array(_image)
image = image[..., ::-1]
images.append(image)
return images
class PdfReader(object):
"""pdf reader"""
def __init__(self,
stream: bytes,
image_height: int = 1600):
self.stream = stream
self._image_height = image_height
self._dpi = 200
self._inpdf = self.load_file(stream)
@staticmethod
def load_file(stream):
"""load document"""
try:
inpdf = fitz.Document(stream=stream, filetype="pdf")
except Exception as e:
print(f"[PDF_READER]-[Failed to load the file]-[{repr(e)}]")
# Return None explicitly so callers see a clear failure instead of an UnboundLocalError.
inpdf = None
return inpdf
@staticmethod
def _convert_page_obj_to_image(page_obj, image_height: int = None):
"""fitz convert pdf to image
Args:
page_obj ([type]): [description]
ratio ([type]): [description]
Returns:
[type]: [description]
"""
if image_height:
_, page_height = page_obj.rect.x1 - \
page_obj.rect.x0, page_obj.rect.y1 - page_obj.rect.y0
ratio = image_height / page_height
else:
ratio = 1.0
trans = fitz.Matrix(ratio, ratio)
pixmap = page_obj.get_pixmap(matrix=trans, alpha=False)
image = cv2.imdecode(np.frombuffer(pixmap.tobytes(), np.uint8), -1)
fitz_tools.store_shrink(100)
return image
def get_page_image(self,
pageno):
"""get page image
Args:
pageno ([type]): [description]
Returns:
[type]: [description]
"""
try:
page_obj = self._inpdf[pageno]
return self._convert_page_obj_to_image(page_obj, self._image_height)
except Exception as e:
print(f"[Failed to convert the PDF to images]-[{repr(e)}]")
try:
return pdf2img(stream=self.stream,
pagenos=[pageno],
height=self._image_height,
dpi=self._dpi)[0]
except Exception as e:
print(f"[Failed to convert the PDF to images]-[{repr(e)}]")
return None
examples = [
[
"budget_form.png",
"What is the total actual and/or obligated expenses of ECG Center?"
],
[
"poster.png",
"Which gift idea needs a printer?"
],
[
"receipt.png",
"เบอร์โทรร้านอะไรคะ?"
],
[
"medical_bill_2.jpg",
"患者さんは何でお金を払いますか。"
],
[
"resume.png",
"五百丁本次想要担任的是什么职位?",
],
[
"custom_declaration_form.png",
"在哪个口岸进口?"
],
[
"invoice.jpg",
"发票号码是多少?",
],
]
prompt_files = {
"发票号码是多少?": "invoice.jpg",
"五百丁本次想要担任的是什么职位?": "resume.png",
"在哪个口岸进口?": "custom_declaration_form.png",
"What is the total actual and/or obligated expenses of ECG Center?": "budget_form.png",
"Which gift idea needs a printer?": "poster.png",
"患者さんは何でお金を払いますか。": "medical_bill_2.jpg",
"เบอร์โทรร้านอะไรคะ?": "receipt.png",
}
lang_map = {
"invoice.jpg": "ch",
"resume.png": "ch",
"custom_declaration_form.png": "ch",
"medical_bill_1.png": "ch",
"budget_form.png": "en",
"website_design_guide.jpeg": "en",
"poster.png": "en",
"medical_bill_2.jpg": "ch",
"receipt.png": "en"
}
def load_document(path):
if path.startswith("http://") or path.startswith("https://"):
resp = requests.get(path, allow_redirects=True, stream=True)
b = resp.raw
else:
b = open(path, "rb")
if path.endswith(".pdf"):
images_list = []
pdfreader = PdfReader(stream=b.read())
for p_no in range(0, pdfreader._inpdf.page_count):
img_np = pdfreader.get_page_image(pageno=p_no)
images_list.append(img_np)
else:
image = Image.open(b)
images_list = [np.array(image.convert("RGB"))]
return images_list
def process_path(path):
error = None
if path:
try:
images_list = load_document(path)
return (
path,
gr.update(visible=True, value=images_list),
gr.update(visible=True),
gr.update(visible=False, value=None),
gr.update(visible=False, value=None),
None,
)
except Exception as e:
traceback.print_exc()
error = str(e)
return (
None,
gr.update(visible=False, value=None),
gr.update(visible=False),
gr.update(visible=False, value=None),
gr.update(visible=False, value=None),
gr.update(visible=True, value=error) if error is not None else None,
None,
)
def process_upload(file):
if file:
return process_path(file.name)
else:
return (
None,
gr.update(visible=False, value=None),
gr.update(visible=False),
gr.update(visible=False, value=None),
gr.update(visible=False, value=None),
None,
)
def np2base64(image_np):
image = cv2.imencode('.jpg', image_np)[1]
base64_str = base64.b64encode(image).decode()
return base64_str
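# Encode a document for the API request: PDFs are rendered to their first page and
# JPEG-encoded, while plain image files are base64-encoded as raw bytes.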
def get_base64(path):
if path.startswith("http://") or path.startswith("https://"):
resp = requests.get(path, allow_redirects=True, stream=True)
b = resp.raw
else:
b = open(path, "rb")
if path.endswith(".pdf"):
images_list = []
pdfreader = PdfReader(stream=b.read())
for p_no in range(0, min(pdfreader._inpdf.page_count, 1)):
img_np = pdfreader.get_page_image(pageno=p_no)
images_list.append(img_np)
base64_str = np2base64(images_list[0])
else:
base64_str = base64.b64encode(b.read()).decode()
return base64_str
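# Query the remote DocPrompt service: the request carries the base64-encoded document,
# the prompt list and the OCR language; the JSON response contains 'result'
# (per-prompt predictions) and 'image' (base64 page renderings for the preview gallery).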
def process_prompt(prompt, document, lang="ch"):
if not prompt:
prompt = "What is the total actual and/or obligated expenses of ECG Center?"
if document is None:
return None, None, None
access_token = os.environ['token']
url = f"https://aip.baidubce.com/rpc/2.0/nlp-itec/poc/docprompt?access_token={access_token}"
base64_str = get_base64(document)
r = requests.post(url, json={"doc": base64_str, "prompt": [prompt], "lang": lang})
response = r.json()
predictions = response['result']
img_list = response['image']
pages = [Image.open(BytesIO(base64.b64decode(img))) for img in img_list]
text_value = predictions[0]['result'][0]['value']
return (
gr.update(visible=True, value=pages),
gr.update(visible=True, value=predictions),
gr.update(
visible=True,
value=text_value,
),
)
def load_example_document(img, prompt):
if img is not None:
document = prompt_files[prompt]
lang = lang_map[document]
preview, answer, answer_text = process_prompt(prompt, document, lang)
return document, prompt, preview, gr.update(visible=True), answer, answer_text
else:
return None, None, None, gr.update(visible=False), None, None
def read_content(file_path: str) -> str:
"""read the content of target file
"""
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
return content
CSS = """
#prompt input {
font-size: 16px;
}
#url-textbox {
padding: 0 !important;
}
#short-upload-box .w-full {
min-height: 10rem !important;
}
/* I think something like this can be used to re-shape
* the table
*/
/*
.gr-samples-table tr {
display: inline;
}
.gr-samples-table .p-2 {
width: 100px;
}
*/
#select-a-file {
width: 100%;
}
#file-clear {
padding-top: 2px !important;
padding-bottom: 2px !important;
padding-left: 8px !important;
padding-right: 8px !important;
margin-top: 10px;
}
.gradio-container .gr-button-primary {
background: linear-gradient(180deg, #CDF9BE 0%, #AFF497 100%);
border: 1px solid #B0DCCC;
border-radius: 8px;
color: #1B8700;
}
.gradio-container.dark button#submit-button {
background: linear-gradient(180deg, #CDF9BE 0%, #AFF497 100%);
border: 1px solid #B0DCCC;
border-radius: 8px;
color: #1B8700
}
table.gr-samples-table tr td {
border: none;
outline: none;
}
table.gr-samples-table tr td:first-of-type {
width: 0%;
}
div#short-upload-box div.absolute {
display: none !important;
}
gradio-app > div > div > div > div.w-full > div, .gradio-app > div > div > div > div.w-full > div {
gap: 0px 2%;
}
gradio-app div div div div.w-full, .gradio-app div div div div.w-full {
gap: 0px;
}
gradio-app h2, .gradio-app h2 {
padding-top: 10px;
}
#answer {
overflow-y: scroll;
color: white;
background: #666;
border-color: #666;
font-size: 20px;
font-weight: bold;
}
#answer span {
color: white;
}
#answer textarea {
color:white;
background: #777;
border-color: #777;
font-size: 18px;
}
#url-error input {
color: red;
}
"""
with gr.Blocks(css=CSS) as demo:
gr.HTML(read_content("header.html"))
gr.Markdown(
"DocPrompt🔖 is a Document Prompt Engine using ERNIE-Layout as the backbone model."
"The engine is powered by BAIDU WenXin Document Intelligence Team "
"and has the ability for multilingual documents information extraction and question ansering. "
"For more details, please visit the [Github](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout)."
"ERNIE-Layout paper please refer to [ERNIE-Layout](https://paperswithcode.com/paper/ernie-layout-layout-knowledge-enhanced-pre)"
)
document = gr.Variable()
example_prompt = gr.Textbox(visible=False)
example_image = gr.Image(visible=False)
with gr.Row(equal_height=True):
with gr.Column():
with gr.Row():
gr.Markdown("## 1. Select a file", elem_id="select-a-file")
img_clear_button = gr.Button(
"Clear", variant="secondary", elem_id="file-clear", visible=False
)
image = gr.Gallery(visible=False)
with gr.Row(equal_height=True):
with gr.Column():
with gr.Row():
url = gr.Textbox(
show_label=False,
placeholder="URL",
lines=1,
max_lines=1,
elem_id="url-textbox",
)
submit = gr.Button("Get")
url_error = gr.Textbox(
visible=False,
elem_id="url-error",
max_lines=1,
interactive=False,
label="Error",
)
gr.Markdown("— or —")
upload = gr.File(label=None, interactive=True, elem_id="short-upload-box")
gr.Examples(
examples=examples,
inputs=[example_image, example_prompt],
)
with gr.Column() as col:
gr.Markdown("## 2. Make a request")
prompt = gr.Textbox(
label="Prompt (No restrictions on the setting of prompt. You can type any prompt.)",
placeholder="e.g. What is the total actual and/or obligated expenses of ECG Center?",
lines=1,
max_lines=1,
)
ocr_lang = gr.Radio(
choices=["ch", "en"],
value="en",
label="Select OCR Language (Please choose ch for Chinese images.)",
)
with gr.Row():
clear_button = gr.Button("Clear", variant="secondary")
submit_button = gr.Button(
"Submit", variant="primary", elem_id="submit-button"
)
with gr.Column():
output_text = gr.Textbox(
label="Top Answer", visible=False, elem_id="answer"
)
output = gr.JSON(label="Output", visible=False)
for cb in [img_clear_button, clear_button]:
cb.click(
lambda _: (
gr.update(visible=False, value=None),
None,
gr.update(visible=False, value=None),
gr.update(visible=False, value=None),
gr.update(visible=False),
None,
None,
None,
gr.update(visible=False, value=None),
None,
),
inputs=clear_button,
outputs=[
image,
document,
output,
output_text,
img_clear_button,
example_image,
upload,
url,
url_error,
prompt,
],
)
upload.change(
fn=process_upload,
inputs=[upload],
outputs=[document, image, img_clear_button, output, output_text, url_error],
)
submit.click(
fn=process_path,
inputs=[url],
outputs=[document, image, img_clear_button, output, output_text, url_error],
)
prompt.submit(
fn=process_prompt,
inputs=[prompt, document, ocr_lang],
outputs=[image, output, output_text],
)
submit_button.click(
fn=process_prompt,
inputs=[prompt, document, ocr_lang],
outputs=[image, output, output_text],
)
example_image.change(
fn=load_example_document,
inputs=[example_image, example_prompt],
outputs=[document, prompt, image, img_clear_button, output, output_text],
)
gr.Markdown("[![Stargazers repo roster for @PaddlePaddle/PaddleNLP](https://reporoster.com/stars/PaddlePaddle/PaddleNLP)](https://github.com/PaddlePaddle/PaddleNLP)")
gr.HTML(read_content("footer.html"))
if __name__ == "__main__":
demo.launch(enable_queue=False)
<div class="footer">
<p>Model by <a href="https://github.com/PaddlePaddle/PaddleNLP" style="text-decoration: underline;" target="_blank">PaddleNLP</a> - Gradio Demo by 🤗 Hugging Face
</p>
</div>
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
<div
style="
display: inline-flex;
gap: 0.8rem;
font-size: 1.75rem;
margin-bottom: 10px;
margin-left: 220px;
justify-content: center;
"
>
<a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://user-images.githubusercontent.com/1371212/175816733-8ec25eb0-9af3-4380-9218-27c154518258.png" alt="PaddleNLP" width="60%"></a>
</div>
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
margin-bottom: 10px;
justify-content: center;
">
<a href="https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout"><h1 style="font-weight: 900; align-items: center; margin-bottom: 7px;">
ERNIE-Layout DocPrompt Engine 🧾
</h1></a>
</div>
<!-- <p style="margin-bottom: 10px; font-size: 94%">
DocPrompt🔖 is a Document Prompt Engine using [ERNIE-Layout](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout) as the backbone model. The engine is powered by BAIDU WenXin Document Intelligence Team and has the ability for multilingual documents information extraction and question ansering.
</p> -->
<a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://user-images.githubusercontent.com/40840292/195516769-c4452d7c-3f9e-446f-8f9d-882b99052a5b.png" width="100%"></a>
</div>
gradio
numpy
opencv-python
pdf2image
PyMuPDF
requests
frontend
# Benchmark
## 1.软硬件环境
ERNIE-Layout模型的训练和推理均采用Tesla V100-SXM2-16GB、CUDA 10.2、 CUDNN 7.5.1、paddlepaddle-gpu 2.3.2.
## 2.开源数据集介绍
| 数据集 | 任务类型 | 语言 | 说明 |
| ---- | ---- | ---- | ----|
| FUNSD | 文档信息抽取 | 英文 | - |
| XFUND-ZH | 文档信息抽取 | 中文 | - |
| DocVQA-ZH | 文档视觉问答 | 中文 | [DocVQA-ZH](http://ailab.aiwin.org.cn/competitions/49)已停止榜单提交,因此我们将原始训练集进行重新划分以评估模型效果,划分后训练集包含4,187张图片,验证集包含500张图片,测试集包含500张图片。 |
| RVL-CDIP (sampled) | 文档图像分类 | 英文 | RVL-CDIP原始数据集共包含400,000张图片,由于数据集较大训练较慢,为验证文档图像分类的模型效果故进行降采样,采样后的训练集包含6,400张图片,验证集包含800张图片,测试集包含800张图片。 |
## 3.评测结果
在文档智能领域主流开源数据集的**验证集**上评测指标如下表所示:
| Model | FUNSD | RVL-CDIP (sampled) | XFUND-ZH | DocVQA-ZH |
| ---- | ---- | ---- | ---- | ---- |
| LayoutXLM-Base | 86.72 | 90.88 | 86.24 | 66.01 |
| ERNIE-LayoutX-Base | 89.31 | 90.29 | 88.58 | 69.57 |
## 4.具体评测方式
* 以上所有任务均基于Grid Search方式进行超参寻优。FUNSD和XFUND-ZH每间隔 100 steps 评估验证集效果,评价指标为F1-Score。RVL-CDIP每间隔2000 steps评估验证集效果,评价指标为Accuracy。DocVQA-ZH每间隔10000 steps评估验证集效果,取验证集最优效果作为表格中的汇报指标,评价指标为ANLS(计算方法参考[ICDAR 2019 Competition on Scene Text Visual Question Answering](https://arxiv.org/pdf/1907.00490.pdf))
* 以上每个下游任务的超参范围如下表所示:
| Hyper Parameters | FUNSD | RVL-CDIP (sampled) | XFUND-ZH | DocVQA-ZH |
| ---- | ---- | ---- | ---- | ---- |
| learning_rate | 5e-6, 1e-5, 2e-5, 5e-5 | 5e-6, 1e-5, 2e-5, 5e-5 | 5e-6, 1e-5, 2e-5, 5e-5 | 5e-6, 1e-5, 2e-5, 5e-5 |
| batch_size | 1, 2, 4 | 8, 16, 24 | 1, 2, 4 | 8, 16, 24 |
| warmup_ratio | - | 0, 0.05, 0.1 | - | 0, 0.05, 0.1 |
<figure>
FUNSD和XFUND-ZH使用的lr_scheduler_type策略是constant,因此不对warmup_ratio进行搜索。
</figure>
* 文档信息抽取任务FUNSD和XFUND-ZH采用最大步数(max_steps)的微调方式,分别为10000 steps和20000 steps;文档视觉问答DocVQA-ZH的num_train_epochs为6;文档图像分类RVL-CDIP的num_train_epochs为20。
* 最优超参
不同预训练模型在下游任务上做Grid Search之后的最优超参(learning_rate、batch_size、warmup_ratio)如下:
| Model | FUNSD | RVL-CDIP (sampled) | XFUND-ZH | DocVQA-ZH |
| ---- | ---- | ---- | ---- | ---- |
| LayoutXLM-Base | 1e-5, 2, _ | 1e-5, 8, 0.1 | 1e-5, 2, _ | 2e-5, 8, 0.1 |
| ERNIE-LayoutX-Base | 2e-5, 4, _ | 1e-5, 8, 0. | 1e-5, 4, _ | 2e-5, 8, 0.05 |
## 5.相关使用说明
请参考:[ERNIE-Layout](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/ernie-layout/README_ch.md)
# Benchmark
## 1.Software And Hardware Environment
Training and inference of the ERNIE-Layout model use a Tesla V100-SXM2-16GB GPU with CUDA 10.2, cuDNN 7.5.1, and paddlepaddle-gpu 2.3.2.
## 2.Introduction To Open Source Datasets
| Dataset | Task Type | Language | Description |
| ---- | ---- | ---- | ----|
| FUNSD | Document information extraction | English | - |
| XFUND-ZH | Document information extraction | Chinese | - |
| DocVQA-ZH | Document visual question answering | Chinese | The [DocVQA-ZH](http://ailab.aiwin.org.cn/competitions/49) leaderboard no longer accepts submissions, so we re-split the original training set to evaluate the model. After the split, the training set contains 4,187 images, the validation set 500 images, and the test set 500 images. |
| RVL-CDIP (sampled) | Document image classification | English | The original RVL-CDIP dataset contains 400,000 images. Because the full dataset is large and slow to train on, it was downsampled to validate the document image classification model. The sampled training set contains 6,400 images, the validation set 800 images, and the test set 800 images. |
## 3.Evaluation Results
Evaluation metrics on the **validation sets** of mainstream open-source document intelligence datasets are shown in the following table:
| Model | FUNSD | RVL-CDIP (sampled) | XFUND-ZH | DocVQA-ZH |
| ---- | ---- | ---- | ---- | ---- |
| LayoutXLM-Base | 86.72 | 90.88 | 86.24 | 66.01 |
| ERNIE-LayoutX-Base | 89.31 | 90.29 | 88.58 | 69.57 |
## 4.Specific Evaluation Method
* All of the above tasks use grid search for hyperparameter tuning. FUNSD and XFUND-ZH evaluate the validation set every 100 steps with F1-Score as the metric. RVL-CDIP evaluates the validation set every 2000 steps with Accuracy as the metric. DocVQA-ZH evaluates the validation set every 10000 steps and reports the best validation result in the table, with ANLS as the metric (for the calculation method, refer to [ICDAR 2019 Competition on Scene Text Visual Question Answering](https://arxiv.org/pdf/1907.00490.pdf)).
* The hyperparameter search range of each downstream task is shown in the following table (a sketch of the resulting search loop is given after the tables below):
| Hyper Parameters | FUNSD | RVL-CDIP (sampled) | XFUND-ZH | DocVQA-ZH |
| ---- | ---- | ---- | ---- | ---- |
| learning_rate | 5e-6, 1e-5, 2e-5, 5e-5 | 5e-6, 1e-5, 2e-5, 5e-5 | 5e-6, 1e-5, 2e-5, 5e-5 | 5e-6, 1e-5, 2e-5, 5e-5 |
| batch_size | 1, 2, 4 | 8, 16, 24 | 1, 2, 4 | 8, 16, 24 |
| warmup_ratio | - | 0, 0.05, 0.1 | - | 0, 0.05, 0.1 |
<figure>
FUNSD and XFUND-ZH use a constant lr_scheduler_type, so warmup_ratio is not searched.
</figure>
* The document information extraction tasks FUNSD and XFUND-ZH are fine-tuned for a fixed number of steps (max_steps): 10000 and 20000 steps respectively. For document visual question answering (DocVQA-ZH), num_train_epochs is 6; for document image classification (RVL-CDIP), num_train_epochs is 20.
* Optimal hyperparameters
The optimal hyperparameters (learning_rate, batch_size, warmup_ratio) found by grid search for each pre-trained model on the downstream tasks are as follows:
| Model | FUNSD | RVL-CDIP (sampled) | XFUND-ZH | DocVQA-ZH |
| ---- | ---- | ---- | ---- | ---- |
| LayoutXLM-Base | 1e-5, 2, _ | 1e-5, 8, 0.1 | 1e-5, 2, _ | 2e-5, 8, 0.1 |
| ERNIE-LayoutX-Base | 2e-5, 4, _ | 1e-5, 8, 0. | 1e-5, 4, _ | 2e-5, 8, 0.05 |
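As a rough illustration of the grid search described above, the sketch below only enumerates the FUNSD search space from the table; the fine-tuning launch itself is omitted:

```python
from itertools import product

# FUNSD search space taken from the table above; warmup_ratio is not searched
# because FUNSD uses a constant lr_scheduler_type.
learning_rates = [5e-6, 1e-5, 2e-5, 5e-5]
batch_sizes = [1, 2, 4]

for lr, bs in product(learning_rates, batch_sizes):
    # A real run would launch fine-tuning with max_steps=10000 here and evaluate
    # the validation F1-Score every 100 steps, keeping the best checkpoint.
    print(f"fine-tune FUNSD with learning_rate={lr}, batch_size={bs}")
```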
## 5.Relevant Instructions
Please refer to: [ERNIE-Layout](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/ernie-layout/README_ch.md)
# 提供模型所支持的任务场景、推理和预训练模型文件:
| 模型名称 | 模型简介 | 参数信息 | 下载地址 |
| ---- | ---- | ---- | ---- |
| ERNIE-LayoutX-Base | 原始预训练模型 | [参数信息](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193) | [推理模型](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193)/[预训练模型](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193) |
| Docprompt | Docprompt是文档抽取问答模型,是基于ERNIE-Layout在文档视觉问答任务上微调后的模型 | [参数信息](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193) | [推理模型](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout)/[预训练模型](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout) |
# Task Scenarios, Inference Models, And Pre-trained Model Files Supported By The Model:
| Model name | Description | Parameter information | Download address |
| ---- | ---- | ---- |---- |
| ERNIE-LayoutX-Base | Original pre-trained model |[Parameter information](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193) | [Inference model](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193)/[Pre-trained model](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193) |
| Docprompt | Docprompt is a document extraction question-answering model obtained by fine-tuning ERNIE-Layout on the document visual question answering task |[Parameter information](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_layout/modeling.py#L193) | [Inference model](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout)/[Pre-trained model](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout) |
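For illustration, a minimal sketch of loading the original pre-trained model with PaddleNLP (the pretrained identifier `ernie-layoutx-base-uncased` is an assumption; check the PaddleNLP ERNIE-Layout model zoo for the exact name):

```python
from paddlenlp.transformers import ErnieLayoutModel, ErnieLayoutTokenizer

# Assumed pretrained identifier; see the PaddleNLP ERNIE-Layout model zoo for details.
tokenizer = ErnieLayoutTokenizer.from_pretrained("ernie-layoutx-base-uncased")
model = ErnieLayoutModel.from_pretrained("ernie-layoutx-base-uncased")
```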
---
Model_Info:
name: "ERNIE-Layout"
description: 文心跨模态预训练大模型
description_en: ERNIE cross-modal pre-trained large model
update_time:
icon: url
from_repo: PaddleNLP
Task:
-
tag: 大模型
tag_en: Big model
sub_tag: 文心大模型
sub_tag_en: ERNIE Big Model
Example:
-
tag: 互联网
tag_en: Internet
sub_tag: 文档智能
sub_tag_en: Document Intelligence
title: PaddleNLP文档智能技术重磅升级,动手搭建端到端文档抽取问答模型
title_en: "PaddleNLP document intelligence gets a major upgrade: build an end-to-end document extraction question answering model"
url: https://aistudio.baidu.com/aistudio/projectdetail/4881278?channelType=0&channel=0
url_en: https://aistudio.baidu.com/aistudio/projectdetail/4881278?channelType=0&channel=0
Datasets: FUNSD, XFUND-ZH, DocVQA-ZH, RVL-CDIP (sampled)
Publisher: Baidu
License: Apache 2.0
Paper:
-
title: "ERNIE-Layout: Layout-Knowledge Enhanced Multi-modal Pre-training for Document Understanding"
url: https://arxiv.org/pdf/2210.06155.pdf
IfTraining: 1
IfOnlineDemo: 1
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1.ERNIE-Layout模型简介\n",
"随着众多行业的数字化转型,电子文档的结构化分析和内容提取成为一项热门的研究课题。电子文档包括扫描图像文件和计算机生成的数字文档两大类,涉及单据、行业报告、合同、雇佣协议、发票、简历等多种类型。智能文档理解任务以理解格式、布局、内容多种多样的文档为目标,包括了文档分类、文档信息抽取、文档问答等任务。与纯文本文档不同的是,文档包含表格、图片等多种内容,包含丰富的视觉信息。因为文档内容丰富、布局复杂、字体样式多样、数据存在噪声,文档理解任务极具挑战性。随着ERNIE等预训练语言模型在NLP领域取得了巨大的成功,人们开始关注在文档理解领域进行大规模预训练。百度提出跨模态文档理解模型 ERNIE-Layout,首次将布局知识增强技术融入跨模态文档预训练,在4项文档理解任务上刷新世界最好效果,登顶 DocVQA 榜首。同时,ERNIE-Layout已集成至百度智能文档分析平台 TextMind,助力企业数字化升级。\n",
"\n",
"\n",
"ERNIE-Layout以文心文本大模型ERNIE为底座,融合文本、图像、布局等信息进行跨模态联合建模,创新性引入布局知识增强,提出阅读顺序预测、细粒度图文匹配等自监督预训练任务,升级空间解偶注意力机制,在各数据集上效果取得大幅度提升,相关工作[ERNIE-Layout: Layout-Knowledge Enhanced Multi-modal Pre-training for Document Understanding](https://arxiv.org/abs/2210.06155)已被EMNLP 2022 Findings会议收录。考虑到文档智能在多语种上商用广泛,依托PaddleNLP对外开源业界最强的多语言跨模态文档预训练模型ERNIE-Layout。\n",
"ERNIE-Layout是由飞浆官方出品的跨模态大模型,更多有关PaddleNLP的详情请访问<https://github.com/PaddlePaddle/PaddleNLP/>了解详情。<br/>\n",
"\n",
"<img src=\"https://user-images.githubusercontent.com/40840292/195091552-86a2d174-24b0-4ddf-825a-4503e0bc390b.png\" width = 95% align=center />"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2.模型效果及应用场景\n",
"ERNIE-Layout可以用于处理但不限于带布局数据(文档、图片等)的文档分类、信息抽取、文档问答等任务,应用场景包括但不限于发票抽取问答、海报抽取问答、网页抽取问答、表格抽取问答、试卷抽取问答、英文票据多语种(中、英、日、泰、西班牙、俄语)抽取问答、中文票据多语种(中简、中繁、英、日、法语)抽取问答等。以文档信息抽取和文档视觉问答为例,使用ERNIE-Layout模型效果速览如下。\n",
"## 2.1文档信息抽取任务:\n",
"### 2.1.1数据集:\n",
"数据集有FUNSD、XFUND-ZH等。其中FUNSD是在噪声很多的扫描文档上进行表单理解的英文数据集,数据集包含199个真实的、完全注释的、扫描的表单。文档有很多噪声,而且各种表单的外观差异很大,因此理解表单是一项很有挑战性的任务。该数据集可用于各种任务,包括文本检测、光学字符识别、空间布局分析和实体标记/链接。XFUND是一个多语言表单理解基准数据集,包括7种语言(汉语、日语、西班牙语、法语、意大利语、德语、葡萄牙语)的人为标注键值对表单,XFUND-ZH为中文版本XFUND。\n",
"### 2.1.2模型效果速览:\n",
"ERNIE-Layout在FUNSD上的模型效果为:\n",
"\n",
"<img src=\"https://gitee.com/doubleguy/typora/raw/master/img/202211082019436.png\" width = 95% align=center />\n",
"\n",
"## 2.2文档视觉问答任务:\n",
"### 2.2.1数据集:\n",
"数据集为DocVQA-ZH,DocVQA-ZH已停止榜单提交,因此我们将原始训练集进行重新划分以评估模型效果,划分后训练集包含4,187张图片,验证集包含500张图片,测试集包含500张图片。\n",
"### 2.2.2模型效果速览:\n",
"ERNIE-Layout在DocVQA-ZH上的模型效果为:\n",
"\n",
"![](https://user-images.githubusercontent.com/40840292/195611075-9323ce9f-134b-4657-ab1c-f4892075d909.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3.模型如何使用\n",
"## 3.1模型推理\n",
"我们已经在[huggingface网页](https://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout)集成了ERNIE-Layout DocPrompt Engine,可一键进行体验。 \n",
"<br/><br/> \n",
"**Taskflow**\n",
"<br/><br/>\n",
"当然,也可以使用Taskflow进行推理。通过`paddlenlp.Taskflow`三行代码调用DocPrompt功能,具备多语言文档抽取问答能力,部分应用场景展示如下:\n",
"\n",
"* 输入格式\n",
"\n",
"```python\n",
"[\n",
" {\"doc\": \"./invoice.jpg\", \"prompt\": [\"发票号码是多少?\", \"校验码是多少?\"]},\n",
" {\"doc\": \"./resume.png\", \"prompt\": [\"五百丁本次想要担任的是什么职位?\", \"五百丁是在哪里上的大学?\", \"大学学的是什么专业?\"]}\n",
"]\n",
"```\n",
"\n",
"默认使用PaddleOCR进行OCR识别,同时支持用户通过`word_boxes`传入自己的OCR结果,格式为`List[str, List[float, float, float, float]]`。\n",
"\n",
"```python \n",
"[\n",
" {\"doc\": doc_path, \"prompt\": prompt, \"word_boxes\": word_boxes}\n",
"]\n",
"```\n",
"\n",
"* 支持单条、批量预测\n",
"\n",
" * 支持本地图片路径输入\n",
"\n",
" ![](https://user-images.githubusercontent.com/40840292/194748579-f9e8aa86-7f65-4827-bfae-824c037228b3.png)\n",
"\n",
" ```python \n",
" from pprint import pprint\n",
" from paddlenlp import Taskflow\n",
" docprompt = Taskflow(\"document_intelligence\")\n",
" pprint(docprompt([{\"doc\": \"./resume.png\", \"prompt\": [\"五百丁本次想要担任的是什么职位?\", \"五百丁是在哪里上的大学?\", \"大学学的是什么专业?\"]}]))\n",
" [{'prompt': '五百丁本次想要担任的是什么职位?',\n",
" 'result': [{'end': 7, 'prob': 1.0, 'start': 4, 'value': '客户经理'}]},\n",
" {'prompt': '五百丁是在哪里上的大学?',\n",
" 'result': [{'end': 37, 'prob': 1.0, 'start': 31, 'value': '广州五百丁学院'}]},\n",
" {'prompt': '大学学的是什么专业?',\n",
" 'result': [{'end': 44, 'prob': 0.82, 'start': 38, 'value': '金融学(本科)'}]}]\n",
" ```\n",
"\n",
" * http图片链接输入\n",
"\n",
" ![](https://user-images.githubusercontent.com/40840292/194748592-e20b2a5f-d36b-46fb-8057-86755d188af0.jpg)\n",
"\n",
" ```python \n",
" from pprint import pprint\n",
" from paddlenlp import Taskflow\n",
"\n",
" docprompt = Taskflow(\"document_intelligence\")\n",
" pprint(docprompt([{\"doc\": \"https://bj.bcebos.com/paddlenlp/taskflow/document_intelligence/images/invoice.jpg\", \"prompt\": [\"发票号码是多少?\", \"校验码是多少?\"]}]))\n",
" [{'prompt': '发票号码是多少?',\n",
" 'result': [{'end': 2, 'prob': 0.74, 'start': 2, 'value': 'No44527206'}]},\n",
" {'prompt': '校验码是多少?',\n",
" 'result': [{'end': 233,\n",
" 'prob': 1.0,\n",
" 'start': 231,\n",
" 'value': '01107 555427109891646'}]}]\n",
" ```\n",
"\n",
"* 可配置参数说明\n",
" * `batch_size`:批处理大小,请结合机器情况进行调整,默认为1。\n",
" * `lang`:选择PaddleOCR的语言,`ch`可在中英混合的图片中使用,`en`在英文图片上的效果更好,默认为`ch`。\n",
" * `topn`: 如果模型识别出多个结果,将返回前n个概率值最高的结果,默认为1。\n",
"\n",
"## 3.2模型微调与部署\n",
"ERNIE-Layout是依托文心ERNIE,基于布局知识增强技术,融合文本、图像、布局等信息进行联合建模的跨模态通用文档预训练模型,能够在包括但不限于文档信息抽取、文档视觉问答、文档图像分类等任务上表现出优秀的跨模态语义对齐能力和布局理解能力。\n",
"\n",
"有关使用ERNIE-Layout进行上述任务的微调与部署详情请参考:[ERNIE-Layout](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4.模型原理\n",
"* 布局知识增强技术<br/><br/>\n",
"* 融合文本、图像、布局等信息进行联合建模<br/><br/>\n",
"* 阅读顺序预测 + 细粒度图文匹配两个自监督预训练任务<br/><br/>\n",
"\n",
"<figure>\n",
"对文档理解来说,文档中的文字阅读顺序至关重要,目前主流的基于 OCR(Optical Character Recognition,文字识别)技术的模型大多遵循「从左到右、从上到下」的原则,然而对于文档中分栏、文本图片表格混杂的复杂布局,根据 OCR 结果获取的阅读顺序多数情况下都是错误的,从而导致模型无法准确地进行文档内容的理解。\n",
"\n",
"而人类通常会根据文档结构和布局进行层次化分块阅读,受此启发,百度研究者提出在文档预训模型中对阅读顺序进行校正的布局知识增强创新思路。TextMind 平台上业界领先的文档解析工具(Document Parser)能够准确识别文档中的分块信息,产出正确的文档阅读顺序,将阅读顺序信号融合到模型的训练中,从而增强对布局信息的有效利用,提升模型对于复杂文档的理解能力。\n",
"\n",
"基于布局知识增强技术,同时依托文心 ERNIE,百度研究者提出了融合文本、图像、布局等信息进行联合建模的跨模态通用文档预训练模型 ERNIE-Layout。如下图所示,ERNIE-Layout 创新性地提出了阅读顺序预测和细粒度图文匹配两个自监督预训练任务,有效提升模型在文档任务上跨模态语义对齐能力和布局理解能力。\n",
"\n",
"\n",
"![](https://bce.bdstatic.com/doc/ai-doc/wenxin/image%20%2814%29_59cc6c8.png)\n",
"</figure>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.注意事项\n",
"## 5.1参数配置\n",
"* batch_size:批处理大小,请结合机器情况进行调整,默认为1。\n",
"\n",
"* lang:选择PaddleOCR的语言,ch可在中英混合的图片中使用,en在英文图片上的效果更好,默认为ch。\n",
"\n",
"* topn: 如果模型识别出多个结果,将返回前n个概率值最高的结果,默认为1。\n",
"## 5.2使用技巧\n",
"\n",
"* Prompt设计:在DocPrompt中,Prompt可以是陈述句(例如,文档键值对中的Key),也可以是疑问句。因为是开放域的抽取问答,DocPrompt对Prompt的设计没有特殊限制,只要符合自然语言语义即可。如果对当前的抽取结果不满意,可以多尝试一些不同的Prompt。 \n",
"\n",
"* 支持的语言:支持本地路径或者HTTP链接的中英文图片输入,Prompt支持多种不同语言,参考以上不同场景的例子。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6.相关论文以及引用信息\n",
"#### ERNIE-Layout: Layout-Knowledge Enhanced Multi-modal Pre-training for Document Understanding\n",
"```\n",
"@misc{https://doi.org/10.48550/arxiv.2210.06155,\n",
" doi = {10.48550/ARXIV.2210.06155},\n",
" \n",
" url = {https://arxiv.org/abs/2210.06155},\n",
" \n",
" author = {Peng, Qiming and Pan, Yinxu and Wang, Wenjin and Luo, Bin and Zhang, Zhenyu and Huang, Zhengjie and Hu, Teng and Yin, Weichong and Chen, Yongfeng and Zhang, Yin and Feng, Shikun and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},\n",
" \n",
" keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},\n",
" \n",
" title = {ERNIE-Layout: Layout Knowledge Enhanced Pre-training for Visually-rich Document Understanding},\n",
" \n",
" publisher = {arXiv},\n",
" \n",
" year = {2022},\n",
" \n",
" copyright = {arXiv.org perpetual, non-exclusive license}\n",
"}\n",
"```\n",
"#### ICDAR 2019 Competition on Scene Text Visual Question Answering\n",
"```\n",
"@misc{https://doi.org/10.48550/arxiv.1907.00490,\n",
" doi = {10.48550/ARXIV.1907.00490},\n",
" \n",
" url = {https://arxiv.org/abs/1907.00490},\n",
" \n",
" author = {Biten, Ali Furkan and Tito, Rubèn and Mafla, Andres and Gomez, Lluis and Rusiñol, Marçal and Mathew, Minesh and Jawahar, C. V. and Valveny, Ernest and Karatzas, Dimosthenis},\n",
" \n",
" keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},\n",
" \n",
" title = {ICDAR 2019 Competition on Scene Text Visual Question Answering},\n",
" \n",
" publisher = {arXiv},\n",
" \n",
" year = {2019},\n",
" \n",
" copyright = {arXiv.org perpetual, non-exclusive license}\n",
"}\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.5 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.5"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a5f44439766e47113308a61c45e3ba0ce79cefad900abb614d22e5ec5db7fbe0"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1.ERNIE-Layout Introduction\n",
"With the digital transformation of many industries, the structural analysis and content extraction of electronic documents have become a hot research topic. Electronic documents include scanned image documents and computer-generated digital documents, involving documents, industry reports, contracts, employment agreements, invoices, resumes and other types. The intelligent document understanding task aims to understand documents with various formats, layouts and contents, including document classification, document information extraction, document question answering and other tasks. Different from plain text documents, documents contain tables, pictures and other contents, and contain rich visual information. Because the document is rich in content, complex in layout, diverse in font style, and noisy in data, the task of document understanding is extremely challenging. With the great success of pre training language models such as ERNIE in the NLP field, people began to focus on large-scale pre training in the field of document understanding. Baidu put forward the cross modal document understanding model ERNIE-Layout, which is the first time to integrate the layout knowledge enhancement technology into the cross modal document pre training, refreshing the world's best results in four document understanding tasks, and topping the DocVQA list. At the same time, ERNIE Layout has been integrated into Baidu's intelligent document analysis platform TextMind to help enterprises upgrade digitally.\n",
"\n",
"\n",
"ERNIE-Layout takes the Wenxin text big model ERNIE as the base, integrates text, image, layout and other information for cross modal joint modeling, innovatively introduces layout knowledge enhancement, proposes self-monitoring pre training tasks such as reading order prediction, fine grain image text matching, upgrades spatial decoupling attention mechanism, and greatly improves the effect on each data set. Related work [ERNIE-Layout: Layout-Knowledge Enhanced Multi-modal Pre-training for Document Understanding](https://arxiv.org/abs/2210.06155) has been included in the EMNLP 2022 Findings Conference. Considering that document intelligence is widely commercially available in multiple languages, it relies on PaddleNLP to open source the strongest multilingual cross modal document pre training model ERNIE Layout in the industry.\n",
"ERNIE-Layout is a large cross modal model officially produced by the Flying Slurry. For more details about PaddleNLP, please visit <https://github.com/PaddlePaddle/PaddleNLP/> for details.<br/>\n",
"\n",
"<img src=\"https://user-images.githubusercontent.com/40840292/195091552-86a2d174-24b0-4ddf-825a-4503e0bc390b.png\" width = 95% align=center />"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2.Model Effect and Application Scenario\n",
"ERNIE-Layout can be used to process but not limited to tasks such as document classification, information extraction, document Q&A with layout data (documents, pictures, etc.). Application scenarios include but not limited to invoice extraction Q&A, poster extraction Q&A, web page extraction Q&A, table extraction Q&A, test paper extraction Q&A, English bill multilingual (Chinese, English, Japanese, Thai, Spanish, Russian) extraction Q&A Chinese bills in multiple languages (simplified, traditional, English, Japanese, French). Taking document information extraction and document visual Q&A as examples, the effect of using ERNIE-Layout model is shown below.\n",
"## 2.1Document Information Extraction Task:\n",
"### 2.1.1Dataset:\n",
"Data sets include FUNSD, XFUND-ZH, etc. FUNSD is an English data set for form understanding on noisy scanned documents. The data set contains 199 real, fully annotated and scanned forms. Documents are noisy, and the appearance of various forms varies greatly, so understanding forms is a challenging task. The dataset can be used for a variety of tasks, including text detection, optical character recognition, spatial layout analysis, and entity tagging/linking. XFUND is a multilingual form understanding benchmark dataset, including manually labeled key value pair forms in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). XFUND-ZH is the Chinese version of XFUND.\n",
"### 2.1.2Quick View Of Model Effect:\n",
"The model effect of ERNIE-Layout on FUNSD is:\n",
"\n",
"<img src=\"https://gitee.com/doubleguy/typora/raw/master/img/202211082019436.png\" width = 95% align=center />\n",
"\n",
"## 2.2Document Visual Question And Answer Task:\n",
"### 2.2.1Dataset:\n",
"The data set is DocVQA-ZH, and DocVQA-ZH has stopped submitting the list. Therefore, we will re divide the original training set to evaluate the model effect. After division, the training set contains 4187 pictures, the verification set contains 500 pictures, and the test set contains 500 pictures.\n",
"### 2.2.2Quick View Of Model Effect:\n",
"The model effect of ERNIE-Layout on DocVQA-ZH is:\n",
"\n",
"![](https://user-images.githubusercontent.com/40840292/195611075-9323ce9f-134b-4657-ab1c-f4892075d909.png)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3.How To Use The Model\n",
"## 3.1Model Reasoning\n",
"We have integrated the ERNIE-Layout DocPrompt Engine on the [huggingface page](https://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout), which can be experienced with one click.\n",
"\n",
"**Taskflow**\n",
"\n",
"Of course, you can also use Taskflow for reasoning. Through `paddlenlp.Taskflow` calls DocPrompt with three lines of code, and has the ability to extract questions and answers from multilingual documents. Some application scenarios are shown below:\n",
"\n",
"* Input Format\n",
"\n",
"```python\n",
"[\n",
" {\"doc\": \"./invoice.jpg\", \"prompt\": [\"发票号码是多少?\", \"校验码是多少?\"]},\n",
" {\"doc\": \"./resume.png\", \"prompt\": [\"五百丁本次想要担任的是什么职位?\", \"五百丁是在哪里上的大学?\", \"大学学的是什么专业?\"]}\n",
"]\n",
"```\n",
"\n",
"By default, PaddleOCR is used for OCR identification, and users can use the `word_ boxes` Pass in your own OCR results in the format `List[str, List[float, float, float, float]]`.\n",
"\n",
"```python \n",
"[\n",
" {\"doc\": doc_path, \"prompt\": prompt, \"word_boxes\": word_boxes}\n",
"]\n",
"```\n",
"\n",
"* Support single and batch forecasting\n",
"\n",
" * Support local image path input\n",
"\n",
" ![](https://user-images.githubusercontent.com/40840292/194748579-f9e8aa86-7f65-4827-bfae-824c037228b3.png)\n",
"\n",
" ```python \n",
" from pprint import pprint\n",
" from paddlenlp import Taskflow\n",
" docprompt = Taskflow(\"document_intelligence\")\n",
" pprint(docprompt([{\"doc\": \"./resume.png\", \"prompt\": [\"五百丁本次想要担任的是什么职位?\", \"五百丁是在哪里上的大学?\", \"大学学的是什么专业?\"]}]))\n",
" [{'prompt': '五百丁本次想要担任的是什么职位?',\n",
" 'result': [{'end': 7, 'prob': 1.0, 'start': 4, 'value': '客户经理'}]},\n",
" {'prompt': '五百丁是在哪里上的大学?',\n",
" 'result': [{'end': 37, 'prob': 1.0, 'start': 31, 'value': '广州五百丁学院'}]},\n",
" {'prompt': '大学学的是什么专业?',\n",
" 'result': [{'end': 44, 'prob': 0.82, 'start': 38, 'value': '金融学(本科)'}]}]\n",
" ```\n",
"\n",
" * http image link input\n",
"\n",
" ![](https://user-images.githubusercontent.com/40840292/194748592-e20b2a5f-d36b-46fb-8057-86755d188af0.jpg)\n",
"\n",
" ```python \n",
" from pprint import pprint\n",
" from paddlenlp import Taskflow\n",
"\n",
" docprompt = Taskflow(\"document_intelligence\")\n",
" pprint(docprompt([{\"doc\": \"https://bj.bcebos.com/paddlenlp/taskflow/document_intelligence/images/invoice.jpg\", \"prompt\": [\"发票号码是多少?\", \"校验码是多少?\"]}]))\n",
" [{'prompt': '发票号码是多少?',\n",
" 'result': [{'end': 2, 'prob': 0.74, 'start': 2, 'value': 'No44527206'}]},\n",
" {'prompt': '校验码是多少?',\n",
" 'result': [{'end': 233,\n",
" 'prob': 1.0,\n",
" 'start': 231,\n",
" 'value': '01107 555427109891646'}]}]\n",
" ```\n",
"\n",
"* Description of configurable parameters\n",
" * `batch_size`:Please adjust the batch size according to the machine conditions. The default value is 1.\n",
" * `lang`:Select the language of PaddleOCR. `ch` can be used in Chinese English mixed pictures. `en` is better in English pictures. The default is `ch`.\n",
" * `topn`: If the model identifies multiple results, it will return the first n results with the highest probability value, which is 1 by default.\n",
"\n",
"## 3.2Model Fine-tuning And Deployment\n",
"ERNIE-Layout is a cross modal general document pre training model that relies on Wenxin ERNIE, based on layout knowledge enhancement technology, and integrates text, image, layout and other information for joint modeling. It can show excellent cross modal semantic alignment and layout understanding ability on tasks including but not limited to document information extraction, document visual question answering, document image classification and so on.\n",
"\n",
"For details about the fine-tuning and deployment of the above tasks using ERNIE-Layout, please refer to: [ERNIE-Layout](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-layout\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4.Model Principle\n",
"* Layout knowledge enhancement technology\n",
"\n",
"* Fusion of text, image, layout and other information for joint modeling\n",
"\n",
"* Reading order prediction+fine-grained image text matching: two self-monitoring pre training tasks\n",
"\n",
"<figure>\n",
"For document understanding, the text reading order in the document is very important. At present, most mainstream models based on OCR (Optical Character Recognition) technology follow the principle of \"from left to right, from top to bottom\". However, for the complex layout of the document with a mixture of columns, text, graphics and tables, the reading order obtained according to the OCR results is wrong in most cases, As a result, the model cannot accurately understand the content of the document.\n",
"\n",
"Humans usually read in hierarchies and blocks according to the document structure and layout. Inspired by this, Baidu researchers proposed an innovative idea of layout knowledge enhancement to correct the reading order in the document pre training model. The industry-leading document parsing tool (Document Parser) on the TextMind platform can accurately identify the block information in the document, produce the correct document reading order, and integrate the reading order signal into the model training, thus enhancing the effective use of layout information and improving the model's understanding of complex documents.\n",
"\n",
"Based on the layout knowledge enhancement technology, and relying on Wenxin ERNIE, Baidu researchers proposed a cross modal general document pre training model ERNIE-Layout, which integrates text, image, layout and other information for joint modeling. As shown in the figure below, ERNIE-Layout innovatively proposed two self-monitoring pre training tasks: reading order prediction and fine-grained image text matching, which effectively improved the model's cross modal semantic alignment ability and layout understanding ability in document tasks.\n",
"\n",
"\n",
"![](https://bce.bdstatic.com/doc/ai-doc/wenxin/image%20%2814%29_59cc6c8.png)\n",
"</figure>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.Matters Needing Attention\n",
"## 5.1Parameter Configuration\n",
"* batch_size:Please adjust the batch size according to the machine conditions. The default value is 1.\n",
"\n",
"* lang:Choose the language of PaddleOCR. ch can be used in Chinese English mixed pictures. en has better effect on English pictures. The default is ch.\n",
"\n",
"* topn: If the model identifies multiple results, it will return the first n results with the highest probability value, which is 1 by default.\n",
"\n",
"## 5.2Tips\n",
"\n",
"* Prompt design: In DocPrompt, Prompt can be a statement (for example, the Key in the document key value pair) or a question. Because it is an open domain extracted question and answer, DocPrompt has no special restrictions on the design of Prompt, as long as it conforms to natural language semantics. If you are not satisfied with the current extraction results, you can try some different Prompts.\n",
"\n",
"* Languages supported:Support Chinese and English image input of local path or HTTP link. Prompt supports multiple languages. Refer to the examples of different scenarios above."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6.Relevant Papers And Citations\n",
"#### ERNIE-Layout: Layout-Knowledge Enhanced Multi-modal Pre-training for Document Understanding\n",
"```\n",
"@misc{https://doi.org/10.48550/arxiv.2210.06155,\n",
" doi = {10.48550/ARXIV.2210.06155},\n",
" \n",
" url = {https://arxiv.org/abs/2210.06155},\n",
" \n",
" author = {Peng, Qiming and Pan, Yinxu and Wang, Wenjin and Luo, Bin and Zhang, Zhenyu and Huang, Zhengjie and Hu, Teng and Yin, Weichong and Chen, Yongfeng and Zhang, Yin and Feng, Shikun and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},\n",
" \n",
" keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},\n",
" \n",
" title = {ERNIE-Layout: Layout Knowledge Enhanced Pre-training for Visually-rich Document Understanding},\n",
" \n",
" publisher = {arXiv},\n",
" \n",
" year = {2022},\n",
" \n",
" copyright = {arXiv.org perpetual, non-exclusive license}\n",
"}\n",
"```\n",
"#### ICDAR 2019 Competition on Scene Text Visual Question Answering\n",
"```\n",
"@misc{https://doi.org/10.48550/arxiv.1907.00490,\n",
" doi = {10.48550/ARXIV.1907.00490},\n",
" \n",
" url = {https://arxiv.org/abs/1907.00490},\n",
" \n",
" author = {Biten, Ali Furkan and Tito, Rubèn and Mafla, Andres and Gomez, Lluis and Rusiñol, Marçal and Mathew, Minesh and Jawahar, C. V. and Valveny, Ernest and Karatzas, Dimosthenis},\n",
" \n",
" keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},\n",
" \n",
" title = {ICDAR 2019 Competition on Scene Text Visual Question Answering},\n",
" \n",
" publisher = {arXiv},\n",
" \n",
" year = {2019},\n",
" \n",
" copyright = {arXiv.org perpetual, non-exclusive license}\n",
"}\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.5 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.5"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a5f44439766e47113308a61c45e3ba0ce79cefad900abb614d22e5ec5db7fbe0"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}