OpenDocCN / Ailearning

Commit 41cc04a7, authored March 15, 2020 by 片刻小哥哥
Parent: 30fd5c7b

Update TF 2.0 named entity recognition

Showing 7 changed files, with 329 additions and 0 deletions:
* src/py3.x/tensorflow2.x/text_NER.py (+126 −0)
* src/py3.x/tensorflow2.x/zh-NER-keras-master/.gitignore (+45 −0)
* src/py3.x/tensorflow2.x/zh-NER-keras-master/README.md (+35 −0)
* src/py3.x/tensorflow2.x/zh-NER-keras-master/bilsm_crf_model.py (+27 −0)
* src/py3.x/tensorflow2.x/zh-NER-keras-master/process_data.py (+66 −0)
* src/py3.x/tensorflow2.x/zh-NER-keras-master/train.py (+7 −0)
* src/py3.x/tensorflow2.x/zh-NER-keras-master/val.py (+23 −0)
src/py3.x/tensorflow2.x/text_NER.py (new file, mode 100644)
```python
import pickle
import numpy as np
import platform
from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
from keras.preprocessing.sequence import pad_sequences

EMBED_DIM = 200
BiRNN_UNITS = 200


def load_data():
    train = _parse_data(open('zh-NER/data/train_data.data', 'rb'))
    test = _parse_data(open('zh-NER/data/test_data.data', 'rb'))

    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

    # save initial config data
    with open('zh-NER/model/config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)


def _parse_data(fh):
    # On Windows the sentence separator is '\r\n\r\n' and the line separator
    # is '\r\n', so choose the corresponding split text for the platform.
    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'

    string = fh.read().decode('utf-8')
    # each sample is a sentence; each row within it is [character, tag]
    data = [[row.split() for row in sample.split(split_text)]
            for sample in string.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def create_model(train=True):
    if train:
        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    else:
        # read the config saved by load_data()
        with open('zh-NER/model/config.pkl', 'rb') as inp:
            (vocab, chunk_tags) = pickle.load(inp)

    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

    if train:
        return model, (train_x, train_y), (test_x, test_y)
    else:
        return model, (vocab, chunk_tags)


def train():
    EPOCHS = 10
    model, (train_x, train_y), (test_x, test_y) = create_model()
    # train model
    model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS,
              validation_data=[test_x, test_y])
    model.save('model/crf.h5')


def test():
    model, (vocab, chunk_tags) = create_model(train=False)
    predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
    x_pred, length = process_data(predict_text, vocab)  # index and left-pad the sample
    model.load_weights('model/crf.h5')
    raw = model.predict(x_pred)[0][-length:]  # drop the left padding
    result = [np.argmax(row) for row in raw]
    result_tags = [chunk_tags[i] for i in result]

    per, loc, org = '', '', ''
    for s, t in zip(predict_text, result_tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if t == 'B-PER' else s
        if t in ('B-ORG', 'I-ORG'):
            org += ' ' + s if t == 'B-ORG' else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if t == 'B-LOC' else s

    print(['person:' + per, 'location:' + loc, 'organization:' + org])


if __name__ == "__main__":
    train()
```
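A minimal usage sketch for this script (a hypothetical driver, assuming the `zh-NER/data`, `zh-NER/model`, and `model` directories it references all exist): train once, then call `test()` to reload the saved weights and tag the sample sentence.

```python
# Hypothetical entry point: train first, then run inference on the saved weights.
if __name__ == "__main__":
    train()   # fits the BiLSTM-CRF and saves model/crf.h5
    test()    # reloads the weights and prints the recognized entities
```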
src/py3.x/tensorflow2.x/zh-NER-keras-master/.gitignore (new file, mode 100644)
```text
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

.idea
```
src/py3.x/tensorflow2.x/zh-NER-keras-master/README.md (new file, mode 100644)
# zh-NER-keras
> This project is a sample of Chinese Named Entity Recognition (NER) with Keras 2.1.4

## requirements
* keras >= 2.1.4
* keras-contrib 2.0.8 (https://github.com/keras-team/keras-contrib)
* h5py
* pickle
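Note that keras-contrib is typically installed from GitHub rather than PyPI; its repository suggests e.g. `pip install git+https://www.github.com/keras-team/keras-contrib.git`. `pickle` ships with the Python standard library and needs no install.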
## demo
```
python val.py
```
input:
```text
中华人民共和国国务院总理周恩来在外交部长陈毅,
副部长王东的陪同下,
连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚
```
output:
```python
['person: 周恩来 陈毅, 王东',
 'location: 埃塞俄比亚 非洲 阿尔巴尼亚',
 'organization: 中华人民共和国国务院 外交部']
```
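The `data/train_data.data` and `data/test_data.data` files are not part of this commit, but the expected format can be inferred from `_parse_data`/`_process_data`: one character and its BIO tag per line, separated by whitespace, with a blank line between sentences. A hypothetical fragment:

```text
周 B-PER
恩 I-PER
来 I-PER
访 O
问 O
埃 B-LOC
塞 I-LOC
```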
src/py3.x/tensorflow2.x/zh-NER-keras-master/bilsm_crf_model.py (new file, mode 100644)
```python
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import process_data
import pickle

EMBED_DIM = 200
BiRNN_UNITS = 200


def create_model(train=True):
    if train:
        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = process_data.load_data()
    else:
        with open('model/config.pkl', 'rb') as inp:
            (vocab, chunk_tags) = pickle.load(inp)

    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

    if train:
        return model, (train_x, train_y), (test_x, test_y)
    else:
        return model, (vocab, chunk_tags)
```
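Each direction of the `Bidirectional` LSTM gets `BiRNN_UNITS // 2 = 100` units, so the concatenated forward/backward output is 200-dimensional, and `sparse_target=True` lets the CRF train directly on integer tag indices rather than one-hot vectors. A minimal inference sketch for this module (assuming `model/config.pkl` and `model/crf.h5` already exist from a prior training run, as in `val.py` below):

```python
import bilsm_crf_model

# Rebuild the network from the pickled vocab/tags, then restore trained weights.
model, (vocab, chunk_tags) = bilsm_crf_model.create_model(train=False)
model.load_weights('model/crf.h5')
```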
src/py3.x/tensorflow2.x/zh-NER-keras-master/process_data.py (new file, mode 100644)
```python
import numpy
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform


def load_data():
    train = _parse_data(open('data/train_data.data', 'rb'))
    test = _parse_data(open('data/test_data.data', 'rb'))

    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

    # save initial config data
    with open('model/config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)


def _parse_data(fh):
    # On Windows the sentence separator is '\r\n\r\n' and the line separator
    # is '\r\n', so choose the corresponding split text for the platform.
    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'

    string = fh.read().decode('utf-8')
    # each sample is a sentence; each row within it is [character, tag]
    data = [[row.split() for row in sample.split(split_text)]
            for sample in string.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length
```
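Note that `process_data.py` provides the same `load_data`/`_parse_data`/`_process_data`/`process_data` helpers that `text_NER.py` above bundles inline, with `data/` and `model/` paths relative to the `zh-NER-keras-master` directory rather than the `zh-NER/` prefix; the four-file split (`process_data.py`, `bilsm_crf_model.py`, `train.py`, `val.py`) is the upstream zh-NER-keras layout.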
src/py3.x/tensorflow2.x/zh-NER-keras-master/train.py (new file, mode 100644)
```python
import bilsm_crf_model

EPOCHS = 10
model, (train_x, train_y), (test_x, test_y) = bilsm_crf_model.create_model()

# train model
model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS,
          validation_data=[test_x, test_y])
model.save('model/crf.h5')
```
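Assuming the working directory is `zh-NER-keras-master`, training is just `python train.py`; the `data/` directory must contain the training files, and `model/` must exist before `load_data()` tries to write `config.pkl` into it.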
src/py3.x/tensorflow2.x/zh-NER-keras-master/val.py (new file, mode 100644)
```python
import bilsm_crf_model
import process_data
import numpy as np

model, (vocab, chunk_tags) = bilsm_crf_model.create_model(train=False)
predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
x_pred, length = process_data.process_data(predict_text, vocab)  # index and left-pad the sample
model.load_weights('model/crf.h5')
raw = model.predict(x_pred)[0][-length:]  # drop the left padding
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]

per, loc, org = '', '', ''
for s, t in zip(predict_text, result_tags):
    if t in ('B-PER', 'I-PER'):
        per += ' ' + s if t == 'B-PER' else s
    if t in ('B-ORG', 'I-ORG'):
        org += ' ' + s if t == 'B-ORG' else s
    if t in ('B-LOC', 'I-LOC'):
        loc += ' ' + s if t == 'B-LOC' else s

print(['person:' + per, 'location:' + loc, 'organization:' + org])
```