未验证 提交 8f403bac 编写于 作者: L linjieccc 提交者: GitHub

Update ddparser module to v1.1.0 (#1658)

上级 e0a2073f
......@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"}
for t in text:
print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t))
result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果
print("model output: {}\n".format(result))
print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../../docs/docs_ch/tutorial/serving.md)
......
......@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"}
for t in text:
print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t))
result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果
print("model output: {}\n".format(result))
print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
......@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"}
for t in text:
print("input: {}".format(t))
r = requests.post(url=url, headers=headers, data=json.dumps(t))
r = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果
print("model output: {}\n".format(result))
print("model output: {}\n".format(r.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
......@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"}
for t in text:
print("input: {}".format(t))
r = requests.post(url=url, headers=headers, data=json.dumps(t))
r = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果
print("model output: {}\n".format(result))
print("model output: {}\n".format(r.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
......@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"}
for t in text:
print("input: {}".format(t))
r = requests.post(url=url, headers=headers, data=json.dumps(t))
r = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果
print("model output: {}\n".format(result))
print("model output: {}\n".format(r.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
......@@ -3,11 +3,11 @@
|模型名称|DDParser|
| :--- | :---: |
|类别|文本-句法分析|
|网络|LSTM|
|网络|Deep Biaffine Attention|
|数据集|搜索query、网页文本、语音输入等数据|
|是否支持Fine-tuning|否|
|模型大小|33MB|
|最新更新日期|2021-02-26|
|模型大小|61MB|
|最新更新日期|2021-10-26|
|数据指标|-|
......@@ -24,15 +24,11 @@
- ### 1、环境依赖
- paddlepaddle >= 1.8.2
- paddlepaddle >= 2.1.0
- paddlehub >= 1.7.0 | [如何安装PaddleHub](../../../../docs/docs_ch/get_start/installation.rst)
- paddlenlp >= 2.1.0
- 额外依赖ddparser
- ```shell
$ pip install ddparser
```
- paddlehub >= 2.1.0 | [如何安装PaddleHub](../../../../docs/docs_ch/get_start/installation.rst)
- ### 2、安装
......@@ -42,9 +38,6 @@
- 如您安装时遇到问题,可参考:[零基础windows安装](../../../../docs/docs_ch/get_start/windows_quickstart.md)
| [零基础Linux安装](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [零基础MacOS安装](../../../../docs/docs_ch/get_start/mac_quickstart.md)
## 三、模型API预测
- ### 1、命令行预测
......@@ -60,33 +53,59 @@
import cv2
import paddlehub as hub
# Load ddparser
module = hub.Module(name="ddparser")
test_text = ["百度是一家高科技公司"]
results = module.parse(texts=test_text)
# String input
results = module.parse("百度是一家高科技公司")
print(results)
# [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': [2, 0, 5, 5, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
# List input
results = module.parse(["百度是一家高科技公司", "他送了一本书"])
print(results)
# [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': [2, 0, 5, 5, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
test_tokens = [['百度', '是', '一家', '高科技', '公司']]
results = module.parse(texts=test_text, return_visual = True)
# Use POS Tag and probability
module = hub.Module(name="ddparser", prob=True, use_pos=True)
results = module.parse("百度是一家高科技公司")
print(results)
# [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': [2, 0, 5, 5, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB'], 'postag': ['ORG', 'v', 'm', 'n', 'n'], 'prob': [1.0, 1.0, 1.0, 1.0, 1.0]}]
result = results[0]
data = module.visualize(result['word'],result['head'],result['deprel'])
# or data = result['visual']
cv2.imwrite('test.jpg',data)
# Visualization mode
module = hub.Module(name="ddparser", return_visual=True)
data = module.visualize("百度是一家高科技公司")
cv2.imwrite('test.jpg', data)
```
- ### 3、API
- ```python
def parse(texts=[], return\_visual=False)
def __init__(
tree=True,
prob=False,
use_pos=False,
batch_size=1,
return_visual=False)
```
- 模块初始化。
- **参数**
- tree(bool): 输出结果是否需要满足树状结构,默认为True。
- prob(bool): 是否输出概率值,默认为False。
- use_pos(bool): 是否输出词性标签,默认为False。
- batch_size(int): 批大小,默认为1。
- return_visual(bool): 是否返回可视化结果(需配合visualize api使用),默认为False。
- ```python
def parse(texts)
```
- 依存分析接口,输入文本,输出依存关系。
- **参数**
- texts(list\[list\[str\] or list\[str\]]): 待预测数据。各元素可以是未分词的字符串,也可以是已分词的token列表。
- return\_visual(bool): 是否返回依存分析可视化结果。如果为True,返回结果中将包含'visual'字段。
- texts(str or list\[str\]): 待预测数据。
- **返回**
......@@ -98,31 +117,29 @@
'deprel': list[str], 当前成分与支配者的依存关系。
'prob': list[float], 从属者和支配者依存的概率。
'postag': list[str], 词性标签,只有当texts的元素是未分词的字符串时包含这个键。
'visual': 图像数组,可以使用cv2.imshow显示图像或cv2.imwrite保存图像。
'visual': numpy.ndarray, 图像数组,可以使用cv2.imshow显示图像或cv2.imwrite保存图像。
}
- ```python
def visualize(word, head, deprel)
def visualize(text)
```
- 可视化接口,输入依存分析接口得到的信息,输出依存图形数组。
- 可视化接口,输入文本信息,输出依存图形数组。
- **参数**
- word(list\[list\[str\]\): 分词信息。
- head(list\[int\]): 当前成分其支配者的id。
- deprel(list\[str\]): 当前成分与支配者的依存关系。
- text(str): 输入文本,支持string格式的单条文本输入。
- **返回**
- data(numpy.array): 图像数组。可以使用cv2.imshow显示图像或cv2.imwrite保存图像。
- data(numpy.ndarray): 图像数组。可以使用cv2.imshow显示图像或cv2.imwrite保存图像。
## 四、服务部署
- PaddleHub Serving可以部署一个在线情感分析服务,可以将此接口用于在线web应用。
- PaddleHub Serving可以部署一个在线句法分析服务,可以将此接口用于在线web应用。
- ## 第一步:启动PaddleHub Serving
......@@ -148,38 +165,45 @@
import requests
import json
import numpy as np
import cv2
# 待预测数据
# 待预测数据(input string)
text = ["百度是一家高科技公司"]
# 设置运行配置
return_visual = True
data = {"texts": text, "return_visual": return_visual}
data = {"texts": text}
# 指定预测方法为DuDepParser并发送post请求,content-type类型应指定json方式
url = "http://0.0.0.0:8866/predict/ddparser"
url = "http://127.0.0.1:8866/predict/ddparser"
headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
results = r.json()['results']
for i in range(len(results)):
print(results[i]['word'])
# 不同于本地调用parse接口,serving返回的图像是list类型的,需要先用numpy加载再显示或保存。
cv2.imwrite('%s.jpg'%i, np.array(results[i]['visual']))
print(r.json())
# {'msg': '', 'results': [{'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB'], 'head': ['2', '0', '5', '5', '2'], 'word': ['百度', '是', '一家', '高科技', '公司']}], 'status': '000'}
# 待预测数据(input list)
text = ["百度是一家公司", "他送了一本书"]
# 设置运行配置
data = {"texts": text}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
# {'msg': '', 'results': [{'deprel': ['SBV', 'HED', 'ATT', 'VOB'], 'head': ['2', '0', '4', '2'], 'word': ['百度', '是', '一家', '公司']}, {'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB'], 'head': ['2', '0', '2', '5', '2'], 'word': ['他', '送', '了', '一本', '书']}], 'status': '000'}
```
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
## 五、更新历史
* 1.0.0
初始发布
* 1.1.0
适配paddlepaddle 2.1版本
- ```shell
$ hub install ddparser==1.0.0
$ hub install ddparser==1.1.0
```
......@@ -2,52 +2,48 @@
import os
import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from paddle import fluid
import paddlehub as hub
from paddlehub.module.module import serving, moduleinfo, runnable
try:
from ddparser import DDParser as DDParserModel
except:
raise ImportError(
"The module requires additional dependencies: ddparser. Please run 'pip install ddparser' to install it.")
from paddlenlp import Taskflow
@moduleinfo(
name="ddparser",
version="1.0.0",
version="1.1.0",
summary="Baidu's open-source DDParser model.",
author="baidu-nlp",
author_email="",
type="nlp/syntactic_analysis")
class ddparser(hub.NLPPredictionModule):
def _initialize(self):
"""
initialize with the necessary elements
"""
self.ddp = DDParserModel(prob=True, use_pos=True)
self.font = font_manager.FontProperties(fname=os.path.join(self.directory, "SourceHanSans-Regular.ttf"))
def __init__(self,
tree=True,
prob=False,
use_pos=False,
batch_size=1,
return_visual=False,
):
self.ddp = Taskflow(
"dependency_parsing",
tree=tree,
prob=prob,
use_pos=use_pos,
batch_size=batch_size,
return_visual=return_visual)
@serving
def serving_parse(self, texts=[], return_visual=False):
results = self.parse(texts, return_visual)
if return_visual:
for i, result in enumerate(results):
result['visual'] = result['visual'].tolist()
def serving_parse(self, texts):
results = self.parse(texts)
for i in range(len(results)):
org_list = results[i]["head"]
results[i]["head"] = [str(x) for x in org_list]
return results
def parse(self, texts=[], return_visual=False):
def parse(self, texts):
"""
parse the dependency.
Args:
texts(list[list[str] or list[list[str]]]): the input texts to be parse. It should be a list with elements: untokenized string or tokens list.
return_visual(bool): if set True, the result will contain the dependency visualization.
texts(str or list[str]): the input texts to be parsed.
Returns:
results(list[dict]): a list, with elements corresponding to each of the elements in texts. The element is a dictionary of shape:
......@@ -57,23 +53,10 @@ class ddparser(hub.NLPPredictionModule):
'deprel': list[str], the dependency relation.
'prob': list[float], the prediction probability of the dependency relation.
'postag': list[str], the POS tag. If the element of the texts is list, the key 'postag' will not return.
'visual' : list[numpy.array]: the dependency visualization. Use cv2.imshow to show or cv2.imwrite to save it. If return_visual=False, it will not return.
'visual' : numpy.ndarray: the dependency visualization. Use cv2.imshow to show or cv2.imwrite to save it. If return_visual=False, it will not return.
}
"""
if not texts:
return
if all([isinstance(i, str) and i for i in texts]):
do_parse = self.ddp.parse
elif all([isinstance(i, list) and i for i in texts]):
do_parse = self.ddp.parse_seg
else:
raise ValueError("All of the elements should be string or list")
results = do_parse(texts)
if return_visual:
for result in results:
result['visual'] = self.visualize(result['word'], result['head'], result['deprel'])
return results
return self.ddp(texts)
@runnable
def run_cmd(self, argvs):
......@@ -98,94 +81,21 @@ class ddparser(hub.NLPPredictionModule):
return results
def visualize(self, word, head, deprel):
def visualize(self, text):
"""
Visualize the dependency.
Args:
word: list[str], the tokenized words.
head: list[int], the head ids.
deprel: list[str], the dependency relation.
text(str): input text.
Returns:
data: a numpy array, use cv2.imshow to show it or cv2.imwrite to save it.
data(numpy.ndarray): a numpy array, use cv2.imshow to show it or cv2.imwrite to save it.
"""
nodes = ['ROOT'] + word
x = list(range(len(nodes)))
y = [0] * (len(nodes))
fig, ax = plt.subplots()
# control the picture size
max_span = max([abs(i + 1 - j) for i, j in enumerate(head)])
fig.set_size_inches((len(nodes), max_span / 2))
# set the points
plt.scatter(x, y, c='w')
for i in range(len(nodes)):
txt = nodes[i]
xytext = (i, 0)
if i == 0:
# set 'ROOT'
ax.annotate(
txt,
xy=xytext,
xycoords='data',
xytext=xytext,
textcoords='data',
)
else:
xy = (head[i - 1], 0)
rad = 0.5 if head[i - 1] < i else -0.5
# set the word
ax.annotate(
txt,
xy=xy,
xycoords='data',
xytext=(xytext[0] - 0.1, xytext[1]),
textcoords='data',
fontproperties=self.font)
# draw the curve
ax.annotate(
"",
xy=xy,
xycoords='data',
xytext=xytext,
textcoords='data',
arrowprops=dict(
arrowstyle="<-",
shrinkA=12,
shrinkB=12,
color='blue',
connectionstyle="arc3,rad=%s" % rad,
),
)
# set the deprel label. Calculate its position by the radius
text_x = min(i, head[i - 1]) + abs((i - head[i - 1])) / 2 - 0.2
text_y = abs((i - head[i - 1])) / 4
ax.annotate(deprel[i - 1], xy=xy, xycoords='data', xytext=[text_x, text_y], textcoords='data')
# control the axis
plt.axis('equal')
plt.axis('off')
# save to numpy array
fig.canvas.draw()
data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3, ))[:, :, ::-1]
return data
if __name__ == "__main__":
module = ddparser()
# Data to be predicted
test_text = ["百度是一家高科技公司"]
results = module.parse(texts=test_text)
print(results)
test_tokens = [['百度', '是', '一家', '高科技', '公司']]
results = module.parse(texts=test_text, return_visual=True)
print(results)
result = results[0]
data = module.visualize(result['word'], result['head'], result['deprel'])
import cv2
import numpy as np
cv2.imwrite('test1.jpg', data)
cv2.imwrite('test2.jpg', result['visual'])
if isinstance(text, str):
result = self.ddp(text)[0]['visual']
return result
else:
raise TypeError(
"Invalid inputs, input text should be str, but type of {} found!".format(type(text))
)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册