未验证 提交 8f403bac 编写于 作者: L linjieccc 提交者: GitHub

Update ddparser module to v1.1.0 (#1658)

上级 e0a2073f
...@@ -195,9 +195,9 @@ ...@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
for t in text: for t in text:
print("input: {}".format(t)) print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t)) result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果 # 打印预测结果
print("model output: {}\n".format(result)) print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../../docs/docs_ch/tutorial/serving.md) - 关于PaddleHub Serving更多信息参考:[服务部署](../../../../../docs/docs_ch/tutorial/serving.md)
......
...@@ -195,9 +195,9 @@ ...@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
for t in text: for t in text:
print("input: {}".format(t)) print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t)) result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果 # 打印预测结果
print("model output: {}\n".format(result)) print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md) - 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
...@@ -195,9 +195,9 @@ ...@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
for t in text: for t in text:
print("input: {}".format(t)) print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t)) result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果 # 打印预测结果
print("model output: {}\n".format(result)) print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md) - 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
...@@ -195,9 +195,9 @@ ...@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
for t in text: for t in text:
print("input: {}".format(t)) print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t)) result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果 # 打印预测结果
print("model output: {}\n".format(result)) print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md) - 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
...@@ -195,9 +195,9 @@ ...@@ -195,9 +195,9 @@
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
for t in text: for t in text:
print("input: {}".format(t)) print("input: {}".format(t))
result = requests.post(url=url, headers=headers, data=json.dumps(t)) result = requests.post(url=url, headers=headers, data=json.dumps({"text": t}))
# 打印预测结果 # 打印预测结果
print("model output: {}\n".format(result)) print("model output: {}\n".format(result.json()['results']))
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md) - 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
......
...@@ -3,11 +3,11 @@ ...@@ -3,11 +3,11 @@
|模型名称|DDParser| |模型名称|DDParser|
| :--- | :---: | | :--- | :---: |
|类别|文本-句法分析| |类别|文本-句法分析|
|网络|LSTM| |网络|Deep Biaffine Attention|
|数据集|搜索query、网页文本、语音输入等数据| |数据集|搜索query、网页文本、语音输入等数据|
|是否支持Fine-tuning|否| |是否支持Fine-tuning|否|
|模型大小|33MB| |模型大小|61MB|
|最新更新日期|2021-02-26| |最新更新日期|2021-10-26|
|数据指标|-| |数据指标|-|
...@@ -24,15 +24,11 @@ ...@@ -24,15 +24,11 @@
- ### 1、环境依赖 - ### 1、环境依赖
- paddlepaddle >= 1.8.2 - paddlepaddle >= 2.1.0
- paddlehub >= 1.7.0 | [如何安装PaddleHub](../../../../docs/docs_ch/get_start/installation.rst) - paddlenlp >= 2.1.0
- 额外依赖ddparser - paddlehub >= 2.1.0 | [如何安装PaddleHub](../../../../docs/docs_ch/get_start/installation.rst)
- ```shell
$ pip install ddparser
```
- ### 2、安装 - ### 2、安装
...@@ -42,9 +38,6 @@ ...@@ -42,9 +38,6 @@
- 如您安装时遇到问题,可参考:[零基础windows安装](../../../../docs/docs_ch/get_start/windows_quickstart.md) - 如您安装时遇到问题,可参考:[零基础windows安装](../../../../docs/docs_ch/get_start/windows_quickstart.md)
| [零基础Linux安装](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [零基础MacOS安装](../../../../docs/docs_ch/get_start/mac_quickstart.md) | [零基础Linux安装](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [零基础MacOS安装](../../../../docs/docs_ch/get_start/mac_quickstart.md)
## 三、模型API预测 ## 三、模型API预测
- ### 1、命令行预测 - ### 1、命令行预测
...@@ -60,33 +53,59 @@ ...@@ -60,33 +53,59 @@
import cv2 import cv2
import paddlehub as hub import paddlehub as hub
# Load ddparser
module = hub.Module(name="ddparser") module = hub.Module(name="ddparser")
test_text = ["百度是一家高科技公司"] # String input
results = module.parse(texts=test_text) results = module.parse("百度是一家高科技公司")
print(results)
# [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': [2, 0, 5, 5, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}]
# List input
results = module.parse(["百度是一家高科技公司", "他送了一本书"])
print(results) print(results)
# [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': [2, 0, 5, 5, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
test_tokens = [['百度', '是', '一家', '高科技', '公司']] # Use POS Tag and probability
results = module.parse(texts=test_text, return_visual = True) module = hub.Module(name="ddparser", prob=True, use_pos=True)
results = module.parse("百度是一家高科技公司")
print(results) print(results)
# [{'word': ['百度', '是', '一家', '高科技', '公司'], 'head': [2, 0, 5, 5, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB'], 'postag': ['ORG', 'v', 'm', 'n', 'n'], 'prob': [1.0, 1.0, 1.0, 1.0, 1.0]}]
result = results[0] # Visualization mode
data = module.visualize(result['word'],result['head'],result['deprel']) module = hub.Module(name="ddparser", return_visual=True)
# or data = result['visual'] data = module.visualize("百度是一家高科技公司")
cv2.imwrite('test.jpg',data) cv2.imwrite('test.jpg', data)
``` ```
- ### 3、API - ### 3、API
- ```python - ```python
def parse(texts=[], return\_visual=False) def __init__(
tree=True,
prob=False,
use_pos=False,
batch_size=1,
return_visual=False)
```
- 模块初始化。
- **参数**
- tree(bool): 输出结果是否需要满足树状结构,默认为True。
- prob(bool): 是否输出概率值,默认为False。
- use_pos(bool): 是否输出词性标签,默认为False。
- batch_size(int): 批大小,默认为1。
- return_visual(bool): 是否返回可视化结果(需配合visualize api使用),默认为False。
- ```python
def parse(texts)
``` ```
- 依存分析接口,输入文本,输出依存关系。 - 依存分析接口,输入文本,输出依存关系。
- **参数** - **参数**
- texts(list\[list\[str\] or list\[str\]]): 待预测数据。各元素可以是未分词的字符串,也可以是已分词的token列表。 - texts(str or list\[str\]): 待预测数据。
- return\_visual(bool): 是否返回依存分析可视化结果。如果为True,返回结果中将包含'visual'字段。
- **返回** - **返回**
...@@ -98,31 +117,29 @@ ...@@ -98,31 +117,29 @@
'deprel': list[str], 当前成分与支配者的依存关系。 'deprel': list[str], 当前成分与支配者的依存关系。
'prob': list[float], 从属者和支配者依存的概率。 'prob': list[float], 从属者和支配者依存的概率。
'postag': list[str], 词性标签,只有当texts的元素是未分词的字符串时包含这个键。 'postag': list[str], 词性标签,只有当texts的元素是未分词的字符串时包含这个键。
'visual': 图像数组,可以使用cv2.imshow显示图像或cv2.imwrite保存图像。 'visual': numpy.ndarray, 图像数组,可以使用cv2.imshow显示图像或cv2.imwrite保存图像。
} }
- ```python - ```python
def visualize(word, head, deprel) def visualize(text)
``` ```
- 可视化接口,输入依存分析接口得到的信息,输出依存图形数组。 - 可视化接口,输入文本信息,输出依存图形数组。
- **参数** - **参数**
- word(list\[list\[str\]\): 分词信息。 - text(str): 输入文本,支持string格式的单条文本输入。
- head(list\[int\]): 当前成分其支配者的id。
- deprel(list\[str\]): 当前成分与支配者的依存关系。
- **返回** - **返回**
- data(numpy.array): 图像数组。可以使用cv2.imshow显示图像或cv2.imwrite保存图像。 - data(numpy.ndarray): 图像数组。可以使用cv2.imshow显示图像或cv2.imwrite保存图像。
## 四、服务部署 ## 四、服务部署
- PaddleHub Serving可以部署一个在线情感分析服务,可以将此接口用于在线web应用。 - PaddleHub Serving可以部署一个在线句法分析服务,可以将此接口用于在线web应用。
- ## 第一步:启动PaddleHub Serving - ## 第一步:启动PaddleHub Serving
...@@ -148,38 +165,45 @@ ...@@ -148,38 +165,45 @@
import requests import requests
import json import json
import numpy as np # 待预测数据(input string)
import cv2
# 待预测数据
text = ["百度是一家高科技公司"] text = ["百度是一家高科技公司"]
# 设置运行配置 # 设置运行配置
return_visual = True data = {"texts": text}
data = {"texts": text, "return_visual": return_visual}
# 指定预测方法为DuDepParser并发送post请求,content-type类型应指定json方式 # 指定预测方法为DuDepParser并发送post请求,content-type类型应指定json方式
url = "http://0.0.0.0:8866/predict/ddparser" url = "http://127.0.0.1:8866/predict/ddparser"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data)) r = requests.post(url=url, headers=headers, data=json.dumps(data))
results = r.json()['results']
for i in range(len(results)): print(r.json())
print(results[i]['word']) # {'msg': '', 'results': [{'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'VOB'], 'head': ['2', '0', '5', '5', '2'], 'word': ['百度', '是', '一家', '高科技', '公司']}], 'status': '000'}
# 不同于本地调用parse接口,serving返回的图像是list类型的,需要先用numpy加载再显示或保存。
cv2.imwrite('%s.jpg'%i, np.array(results[i]['visual'])) # 待预测数据(input list)
text = ["百度是一家公司", "他送了一本书"]
# 设置运行配置
data = {"texts": text}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
# {'msg': '', 'results': [{'deprel': ['SBV', 'HED', 'ATT', 'VOB'], 'head': ['2', '0', '4', '2'], 'word': ['百度', '是', '一家', '公司']}, {'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB'], 'head': ['2', '0', '2', '5', '2'], 'word': ['他', '送', '了', '一本', '书']}], 'status': '000'}
``` ```
- 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md) - 关于PaddleHub Serving更多信息参考:[服务部署](../../../../docs/docs_ch/tutorial/serving.md)
## 五、更新历史 ## 五、更新历史
* 1.0.0 * 1.0.0
初始发布 初始发布
* 1.1.0
适配paddlepaddle 2.1版本
- ```shell - ```shell
$ hub install ddparser==1.0.0 $ hub install ddparser==1.1.0
``` ```
...@@ -2,52 +2,48 @@ ...@@ -2,52 +2,48 @@
import os import os
import argparse import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from paddle import fluid
import paddlehub as hub import paddlehub as hub
from paddlehub.module.module import serving, moduleinfo, runnable from paddlehub.module.module import serving, moduleinfo, runnable
from paddlenlp import Taskflow
try:
from ddparser import DDParser as DDParserModel
except:
raise ImportError(
"The module requires additional dependencies: ddparser. Please run 'pip install ddparser' to install it.")
@moduleinfo( @moduleinfo(
name="ddparser", name="ddparser",
version="1.0.0", version="1.1.0",
summary="Baidu's open-source DDParser model.", summary="Baidu's open-source DDParser model.",
author="baidu-nlp", author="baidu-nlp",
author_email="", author_email="",
type="nlp/syntactic_analysis") type="nlp/syntactic_analysis")
class ddparser(hub.NLPPredictionModule): class ddparser(hub.NLPPredictionModule):
def _initialize(self): def __init__(self,
""" tree=True,
initialize with the necessary elements prob=False,
""" use_pos=False,
self.ddp = DDParserModel(prob=True, use_pos=True) batch_size=1,
self.font = font_manager.FontProperties(fname=os.path.join(self.directory, "SourceHanSans-Regular.ttf")) return_visual=False,
):
self.ddp = Taskflow(
"dependency_parsing",
tree=tree,
prob=prob,
use_pos=use_pos,
batch_size=batch_size,
return_visual=return_visual)
@serving @serving
def serving_parse(self, texts=[], return_visual=False): def serving_parse(self, texts):
results = self.parse(texts, return_visual) results = self.parse(texts)
if return_visual: for i in range(len(results)):
for i, result in enumerate(results): org_list = results[i]["head"]
result['visual'] = result['visual'].tolist() results[i]["head"] = [str(x) for x in org_list]
return results return results
def parse(self, texts=[], return_visual=False): def parse(self, texts):
""" """
parse the dependency. parse the dependency.
Args: Args:
texts(list[list[str] or list[list[str]]]): the input texts to be parsed. It should be a list with elements: untokenized string or tokens list. texts(str or list[str]): the input texts to be parsed.
return_visual(bool): if set True, the result will contain the dependency visualization.
Returns: Returns:
results(list[dict]): a list, with elements corresponding to each of the elements in texts. The element is a dictionary of shape: results(list[dict]): a list, with elements corresponding to each of the elements in texts. The element is a dictionary of shape:
...@@ -57,23 +53,10 @@ class ddparser(hub.NLPPredictionModule): ...@@ -57,23 +53,10 @@ class ddparser(hub.NLPPredictionModule):
'deprel': list[str], the dependency relation. 'deprel': list[str], the dependency relation.
'prob': list[float], the prediction probability of the dependency relation. 'prob': list[float], the prediction probability of the dependency relation.
'postag': list[str], the POS tag. If the element of the texts is list, the key 'postag' will not return. 'postag': list[str], the POS tag. If the element of the texts is list, the key 'postag' will not return.
'visual' : list[numpy.array]: the dependency visualization. Use cv2.imshow to show or cv2.imwrite to save it. If return_visual=False, it will not return. 'visual' : numpy.ndarray: the dependency visualization. Use cv2.imshow to show or cv2.imwrite to save it. If return_visual=False, it will not return.
} }
""" """
return self.ddp(texts)
if not texts:
return
if all([isinstance(i, str) and i for i in texts]):
do_parse = self.ddp.parse
elif all([isinstance(i, list) and i for i in texts]):
do_parse = self.ddp.parse_seg
else:
raise ValueError("All of the elements should be string or list")
results = do_parse(texts)
if return_visual:
for result in results:
result['visual'] = self.visualize(result['word'], result['head'], result['deprel'])
return results
@runnable @runnable
def run_cmd(self, argvs): def run_cmd(self, argvs):
...@@ -98,94 +81,21 @@ class ddparser(hub.NLPPredictionModule): ...@@ -98,94 +81,21 @@ class ddparser(hub.NLPPredictionModule):
return results return results
def visualize(self, word, head, deprel): def visualize(self, text):
""" """
Visualize the dependency. Visualize the dependency.
Args: Args:
word: list[str], the tokenized words. text(str): input text.
head: list[int], the head ids.
deprel: list[str], the dependency relation.
Returns: Returns:
data: a numpy array, use cv2.imshow to show it or cv2.imwrite to save it. data(numpy.ndarray): a numpy array, use cv2.imshow to show it or cv2.imwrite to save it.
""" """
nodes = ['ROOT'] + word
x = list(range(len(nodes))) if isinstance(text, str):
y = [0] * (len(nodes)) result = self.ddp(text)[0]['visual']
fig, ax = plt.subplots() return result
# control the picture size else:
max_span = max([abs(i + 1 - j) for i, j in enumerate(head)]) raise TypeError(
fig.set_size_inches((len(nodes), max_span / 2)) "Invalid inputs, input text should be str, but type of {} found!".format(type(text))
# set the points )
plt.scatter(x, y, c='w')
for i in range(len(nodes)):
txt = nodes[i]
xytext = (i, 0)
if i == 0:
# set 'ROOT'
ax.annotate(
txt,
xy=xytext,
xycoords='data',
xytext=xytext,
textcoords='data',
)
else:
xy = (head[i - 1], 0)
rad = 0.5 if head[i - 1] < i else -0.5
# set the word
ax.annotate(
txt,
xy=xy,
xycoords='data',
xytext=(xytext[0] - 0.1, xytext[1]),
textcoords='data',
fontproperties=self.font)
# draw the curve
ax.annotate(
"",
xy=xy,
xycoords='data',
xytext=xytext,
textcoords='data',
arrowprops=dict(
arrowstyle="<-",
shrinkA=12,
shrinkB=12,
color='blue',
connectionstyle="arc3,rad=%s" % rad,
),
)
# set the deprel label. Calculate its position by the radius
text_x = min(i, head[i - 1]) + abs((i - head[i - 1])) / 2 - 0.2
text_y = abs((i - head[i - 1])) / 4
ax.annotate(deprel[i - 1], xy=xy, xycoords='data', xytext=[text_x, text_y], textcoords='data')
# control the axis
plt.axis('equal')
plt.axis('off')
# save to numpy array
fig.canvas.draw()
data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3, ))[:, :, ::-1]
return data
if __name__ == "__main__":
module = ddparser()
# Data to be predicted
test_text = ["百度是一家高科技公司"]
results = module.parse(texts=test_text)
print(results)
test_tokens = [['百度', '是', '一家', '高科技', '公司']]
results = module.parse(texts=test_text, return_visual=True)
print(results)
result = results[0]
data = module.visualize(result['word'], result['head'], result['deprel'])
import cv2
import numpy as np
cv2.imwrite('test1.jpg', data)
cv2.imwrite('test2.jpg', result['visual'])
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册