未验证 提交 0d8192fb 编写于 作者: J jiaozhenyu 提交者: GitHub

Update conll05.py

The label file in the conll05 dataset lists its labels in the wrong order.
上级 c490f1b3
...@@ -41,6 +41,28 @@ EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' ...@@ -41,6 +41,28 @@ EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0 UNK_IDX = 0
def load_label_dict(filename):
    """Build a label-to-index dictionary from a tag file.

    The file is read line by line. Every line that, after stripping
    surrounding whitespace, starts with "B-" or "I-" contributes the text
    after that prefix as a tag name; all other lines are ignored. Each
    distinct tag name then receives two consecutive indices, "B-<tag>"
    first and "I-<tag>" second, and the label "O" receives the final index.
    Both the "B-" and the "I-" entry are created for a tag even if only one
    of the two forms appears in the file.

    Args:
        filename: Path of the text file to read, one label per line.

    Returns:
        A dict mapping label strings to consecutive integer indices starting
        at 0. A file with no "B-"/"I-" lines yields {"O": 0}.
    """
    tags = set()
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            # Both prefixes are two characters long, so one slice handles
            # either form.
            if line.startswith(("B-", "I-")):
                tags.add(line[2:])

    label_dict = dict()
    index = 0
    # Iterate in sorted order: a set has no guaranteed iteration order, so
    # walking it directly could hand out different indices on different
    # runs or interpreters for the very same label file.
    for tag in sorted(tags):
        label_dict["B-" + tag] = index
        label_dict["I-" + tag] = index + 1
        index += 2
    label_dict["O"] = index
    return label_dict
def load_dict(filename): def load_dict(filename):
d = dict() d = dict()
with open(filename, 'r') as f: with open(filename, 'r') as f:
...@@ -188,7 +210,7 @@ def get_dict(): ...@@ -188,7 +210,7 @@ def get_dict():
verb_dict = load_dict( verb_dict = load_dict(
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
VERBDICT_MD5)) VERBDICT_MD5))
label_dict = load_dict( label_dict = load_label_dict(
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
TRGDICT_MD5)) TRGDICT_MD5))
return word_dict, verb_dict, label_dict return word_dict, verb_dict, label_dict
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册