Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
406f26ae
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
406f26ae
编写于
9月 17, 2020
作者:
Y
yinhaofeng
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
support py2
上级
68cc383f
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
17 addition
and
16 deletion
+17
-16
models/contentunderstanding/tagspace/data/text2paddle.py
models/contentunderstanding/tagspace/data/text2paddle.py
+17
-16
未找到文件。
models/contentunderstanding/tagspace/data/text2paddle.py
浏览文件 @
406f26ae
...
...
@@ -18,6 +18,7 @@ import collections
import
os
import
csv
import
re
import
io
import
sys
if
six
.
PY2
:
reload
(
sys
)
...
...
@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
word_freq
=
collections
.
defaultdict
(
int
)
files
=
os
.
listdir
(
train_dir
)
for
fi
in
files
:
with
open
(
os
.
path
.
join
(
train_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
with
io
.
open
(
os
.
path
.
join
(
train_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
word_freq
=
word_count
(
column_num
,
f
,
word_freq
)
files
=
os
.
listdir
(
test_dir
)
for
fi
in
files
:
with
open
(
os
.
path
.
join
(
test_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
with
io
.
open
(
os
.
path
.
join
(
test_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
word_freq
=
word_count
(
column_num
,
f
,
word_freq
)
word_freq
=
[
x
for
x
in
six
.
iteritems
(
word_freq
)
if
x
[
1
]
>
min_word_freq
]
...
...
@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
if
not
os
.
path
.
exists
(
output_train_dir
):
os
.
mkdir
(
output_train_dir
)
for
fi
in
files
:
with
open
(
os
.
path
.
join
(
train_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
with
open
(
with
io
.
open
(
os
.
path
.
join
(
train_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
with
io
.
open
(
os
.
path
.
join
(
output_train_dir
,
fi
),
"w"
,
encoding
=
'utf-8'
)
as
wf
:
data_file
=
csv
.
reader
(
f
)
for
row
in
data_file
:
tag_raw
=
re
.
split
(
r
'\W+'
,
row
[
0
].
strip
())
pos_index
=
tag_idx
.
get
(
tag_raw
[
0
])
wf
.
write
(
str
(
pos_index
)
+
","
)
wf
.
write
(
u
"{},"
.
format
(
str
(
pos_index
))
)
text_raw
=
re
.
split
(
r
'\W+'
,
row
[
2
].
strip
())
l
=
[
text_idx
.
get
(
w
)
for
w
in
text_raw
]
for
w
in
l
:
wf
.
write
(
str
(
w
)
+
" "
)
wf
.
write
(
"
\n
"
)
wf
.
write
(
u
"{} "
.
format
(
str
(
w
))
)
wf
.
write
(
u
"
\n
"
)
files
=
os
.
listdir
(
test_dir
)
if
not
os
.
path
.
exists
(
output_test_dir
):
os
.
mkdir
(
output_test_dir
)
for
fi
in
files
:
with
open
(
os
.
path
.
join
(
test_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
with
open
(
with
io
.
open
(
os
.
path
.
join
(
test_dir
,
fi
),
"r"
,
encoding
=
'utf-8'
)
as
f
:
with
io
.
open
(
os
.
path
.
join
(
output_test_dir
,
fi
),
"w"
,
encoding
=
'utf-8'
)
as
wf
:
data_file
=
csv
.
reader
(
f
)
for
row
in
data_file
:
tag_raw
=
re
.
split
(
r
'\W+'
,
row
[
0
].
strip
())
pos_index
=
tag_idx
.
get
(
tag_raw
[
0
])
wf
.
write
(
str
(
pos_index
)
+
","
)
wf
.
write
(
u
"{},"
.
format
(
str
(
pos_index
))
)
text_raw
=
re
.
split
(
r
'\W+'
,
row
[
2
].
strip
())
l
=
[
text_idx
.
get
(
w
)
for
w
in
text_raw
]
for
w
in
l
:
wf
.
write
(
str
(
w
)
+
" "
)
wf
.
write
(
"
\n
"
)
wf
.
write
(
u
"{} "
.
format
(
str
(
w
))
)
wf
.
write
(
u
"
\n
"
)
def
text2paddle
(
train_dir
,
test_dir
,
output_train_dir
,
output_test_dir
,
output_vocab_text
,
output_vocab_tag
):
print
(
"start constuct word dict"
)
vocab_text
=
build_dict
(
2
,
0
,
train_dir
,
test_dir
)
with
open
(
output_vocab_text
,
"w"
,
encoding
=
'utf-8'
)
as
wf
:
wf
.
write
(
str
(
len
(
vocab_text
))
+
"
\n
"
)
with
io
.
open
(
output_vocab_text
,
"w"
,
encoding
=
'utf-8'
)
as
wf
:
wf
.
write
(
u
"{}
\n
"
.
format
(
str
(
len
(
vocab_text
)))
)
vocab_tag
=
build_dict
(
0
,
0
,
train_dir
,
test_dir
)
with
open
(
output_vocab_tag
,
"w"
,
encoding
=
'utf-8'
)
as
wf
:
wf
.
write
(
str
(
len
(
vocab_tag
))
+
"
\n
"
)
with
io
.
open
(
output_vocab_tag
,
"w"
,
encoding
=
'utf-8'
)
as
wf
:
wf
.
write
(
u
"{}
\n
"
.
format
(
str
(
len
(
vocab_tag
)))
)
print
(
"construct word dict done
\n
"
)
write_paddle
(
vocab_text
,
vocab_tag
,
train_dir
,
test_dir
,
output_train_dir
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录