PaddlePaddle / DeepSpeech
Commit 0ede6c2e
Authored Apr 18, 2022 by Hui Zhang

train lm

Parent: c492a42f
Showing 5 changed files with 52 additions and 21 deletions (+52 −21)
examples/other/ngram_lm/s0/local/download_lm_zh.sh     +5  −0
speechx/examples/ngram/zh/local/aishell_train_lms.sh   +4  −4
speechx/examples/ngram/zh/local/text_to_lexicon.py     +8  −4
speechx/examples/ngram/zh/run.sh                       +12 −10
utils/fst/prepare_dict.py                              +23 −3
examples/other/ngram_lm/s0/local/download_lm_zh.sh

@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
 
+if [ -e $TARGET ]; then
+    echo "already have lm"
+    exit 0;
+fi
+
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then
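The added guard makes the download idempotent: when the target KenLM file is already on disk, the script reports it and exits successfully instead of downloading again. A minimal Python sketch of the same check, assuming the script's target filename; it is only an illustration, not part of the repository:

import os
import sys

target = "zh_giga.no_cna_cmn.prune01244.klm"  # ${DIR}/... in the shell script
if os.path.exists(target):
    print("already have lm")
    sys.exit(0)
# otherwise fall through to the existing download + MD5 check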
speechx/examples/ngram/zh/local/aishell_train_lms.sh

@@ -29,12 +29,13 @@ mkdir -p $dir
 cleantext=$dir/text.no_oov
 
-# oov to <SPOKEN_NOISE>
-# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
   {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
   > $cleantext || exit 1;
 
-# compute word counts
+# compute word counts, sort in descending order
+# line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
    sort -nr > $dir/word.counts || exit 1;

@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 
 # hold out to compute ppl
-heldout_sent=10000 # Don't change this if you want result to be comparable with
-# kaldi_lm results
+heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
 mkdir -p $dir
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
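The revised comments only describe what the existing awk stages already do: every token not found in the lexicon (including the leading utterance id) is replaced by <SPOKEN_NOISE>, and word frequencies are then counted in descending order. A minimal Python sketch of the same two stages, with placeholder file names, for readers less comfortable with awk:

from collections import Counter

with open("lexicon.txt") as lex:                      # lexicon line: word char0 ... charn
    seen = {line.split()[0] for line in lex if line.strip()}

counts = Counter()
with open("text") as fin, open("text.no_oov", "w") as fout:
    for line in fin:                                  # text line: utt word0 ... wordn
        mapped = [w if w in seen else "<SPOKEN_NOISE>" for w in line.split()]
        fout.write(" ".join(mapped) + "\n")
        counts.update(mapped[1:])                     # word.counts skips the leading field

for word, cnt in counts.most_common():                # like `sort | uniq -c | sort -nr`
    print(cnt, word)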
speechx/examples/ngram/zh/local/text_to_lexicon.py

 #!/usr/bin/env python3
 import argparse
+from collections import Counter
 
 
 def main(args):
+    counter = Counter()
     with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
         for line in fin:
             line = line.strip()
@@ -11,10 +13,12 @@ def main(args):
             else:
                 words = line.split()
 
-            for word in words:
-                val = " ".join(list(word))
-                fout.write(f"{word}\t{val}\n")
-                fout.flush()
+            counter.update(words)
+
+        for word in counter:
+            val = " ".join(list(word))
+            fout.write(f"{word}\t{val}\n")
+            fout.flush()
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
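With this change the script buffers all words in a Counter and emits one lexicon entry per distinct word after the whole transcript has been read, instead of one entry per occurrence. A small self-contained sketch of the resulting output format, with illustrative input words:

from collections import Counter

counter = Counter()
for words in [["成交", "作用"], ["作用", "最大"]]:   # words split from each transcript line
    counter.update(words)

with open("lexicon.txt", "w") as fout:               # placeholder output path
    for word in counter:                             # duplicates collapse: "作用" is written once
        val = " ".join(list(word))                   # "成交" -> "成 交"
        fout.write(f"{word}\t{val}\n")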
speechx/examples/ngram/zh/run.sh

@@ -3,11 +3,11 @@ set -eo pipefail
 . path.sh
 
-stage=0
+stage=-1
 stop_stage=100
 corpus=aishell
 
-unit=data/vocab.txt       # line: char/spm_pice, vocab file
+unit=data/vocab.txt       # vocab file, line: char/spm_pice
 lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
 text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

@@ -42,15 +42,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # line: char/spm_pices
     cp $unit data/local/dict/units.txt
 
-    if [ -f $lexicon ]; then
-        # line: word ph0 ... phn -> line: word char0 ... charn
-        utils/fst/prepare_dict.py \
-            --unit_file $unit \
-            --in_lexicon ${lexicon} \
-            --out_lexicon data/local/dict/lexicon.txt
-    else
-        local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
+    if [ ! -f $lexicon ]; then
+        local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+        echo "Generate $lexicon from $text"
     fi
+
+    # filter by vocab
+    # line: word ph0 ... phn -> line: word char0 ... charn
+    utils/fst/prepare_dict.py \
+        --unit_file $unit \
+        --in_lexicon ${lexicon} \
+        --out_lexicon data/local/dict/lexicon.txt
 fi
 
 lm=data/local/lm
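Stage 0 is restructured: the word lexicon is now generated from the transcript only when data/lexicon.txt is missing, and prepare_dict.py is always run afterwards to filter it by the vocab. A rough Python rendering of that control flow, assuming it runs from speechx/examples/ngram/zh with the paths defined at the top of run.sh:

import os
import subprocess

unit = "data/vocab.txt"       # vocab file, line: char/spm_pice
lexicon = "data/lexicon.txt"  # line: word ph0 ... phn
text = "data/text"            # line: utt text

if not os.path.isfile(lexicon):
    # no reference lexicon available: build one from the transcript
    subprocess.run(["local/text_to_lexicon.py", "--has_key", "true",
                    "--text", text, "--lexicon", lexicon], check=True)
    print(f"Generate {lexicon} from {text}")

# always filter by vocab: line "word ph0 ... phn" -> "word char0 ... charn"
subprocess.run(["utils/fst/prepare_dict.py",
                "--unit_file", unit,
                "--in_lexicon", lexicon,
                "--out_lexicon", "data/local/dict/lexicon.txt"], check=True)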
utils/fst/prepare_dict.py

@@ -3,7 +3,8 @@ import argparse
 
 
 def main(args):
-    # load `unit` or `vocab` file
+    # load vocab file
+    # line: token
     unit_table = set()
     with open(args.unit_file, 'r') as fin:
         for line in fin:

@@ -11,27 +12,41 @@ def main(args):
             unit_table.add(unit)
 
     def contain_oov(units):
+        """token not in vocab
+
+        Args:
+            units (str): token
+
+        Returns:
+            bool: True token in voca, else False.
+        """
         for unit in units:
             if unit not in unit_table:
                 return True
         return False
 
-    # load spm model
+    # load spm model, for English
     bpemode = args.bpemodel
     if bpemode:
         import sentencepiece as spm
         sp = spm.SentencePieceProcessor()
         sp.Load(sys.bpemodel)
 
-    # used to filter polyphone
+    # used to filter polyphone and invalid word
     lexicon_table = set()
+    in_n = 0   # in lexicon word count
+    out_n = 0  # out lexicon word cout
     with open(args.in_lexicon, 'r') as fin, \
             open(args.out_lexicon, 'w') as fout:
         for line in fin:
             word = line.split()[0]
+            in_n += 1
 
             if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
+                # filter 'SIL' for mandarin, keep it in English
                 continue
             elif word == '<SPOKEN_NOISE>':
+                # filter <SPOKEN_NOISE>
                 continue
             else:
                 # each word only has one pronunciation for e2e system

@@ -39,12 +54,14 @@ def main(args):
                 continue
 
             if bpemode:
+                # for english
                 pieces = sp.EncodeAsPieces(word)
                 if contain_oov(pieces):
                     print('Ignoring words {}, which contains oov unit'.format(
                         ''.join(word).strip('▁')))
                     continue
+                # word is piece list, which not have <unk> piece, filter out by `contain_oov(pieces)`
                 chars = ' '.join(
                     [p if p in unit_table else '<unk>' for p in pieces])
             else:

@@ -58,11 +75,14 @@ def main(args):
                 # we assume the model unit of our e2e system is char now.
                 if word.encode('utf8').isalpha() and '▁' in unit_table:
                     word = '▁' + word
                 chars = ' '.join(word)  # word is a char list
 
             fout.write('{} {}\n'.format(word, chars))
             lexicon_table.add(word)
+            out_n += 1
+
+    print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
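The new in_n/out_n counters make the filtering effect visible: every input lexicon entry is counted, 'SIL' (in the Mandarin path) and '<SPOKEN_NOISE>' are dropped, words containing out-of-vocab characters are skipped via contain_oov (its call site in the non-BPE branch is elided from this diff), and the final print reports how many entries survived. A minimal sketch of that Mandarin path with an illustrative vocab and lexicon; it is not the script itself:

unit_table = {"成", "交", "作", "用"}   # stand-in for the tokens loaded from --unit_file

def contain_oov(units):
    # True if any unit of the word is missing from the vocab
    return any(u not in unit_table for u in units)

in_n = 0   # in lexicon word count
out_n = 0  # out lexicon word count
for line in ["SIL sil", "<SPOKEN_NOISE> sil", "成交 成 交", "市场 市 场"]:
    word = line.split()[0]
    in_n += 1
    if word == 'SIL':              # filter 'SIL' for Mandarin, keep it in English
        continue
    if word == '<SPOKEN_NOISE>':   # filter <SPOKEN_NOISE>
        continue
    if contain_oov(word):          # "市场" has characters outside the vocab
        continue
    chars = ' '.join(word)         # word is a char list: "成交" -> "成 交"
    print('{} {}'.format(word, chars))
    out_n += 1

print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")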