PaddlePaddle / DeepSpeech

Commit 6a50211c
Authored Nov 25, 2021 by Junkun
Parent: 383b68d8

    data process for ted-en-zh st1
Showing 2 changed files with 161 additions and 63 deletions (+161, -63):

    examples/ted_en_zh/st1/local/data.sh    +152  -62
    examples/ted_en_zh/st1/path.sh            +9   -1
examples/ted_en_zh/st1/local/data.sh
@@ -2,16 +2,18 @@
 set -e
 
-stage=-1
+stage=1
 stop_stage=100
 dict_dir=data/lang_char
 
 # bpemode (unigram or bpe)
 nbpe=8000
-bpemode=unigram
+bpemode=bpe
 bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 
 data_dir=./TED_EnZh
+target_dir=data/ted_en_zh
+dumpdir=data/dump
+do_delta=false
 
 source ${MAIN_ROOT}/utils/parse_options.sh
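Since the script sources ${MAIN_ROOT}/utils/parse_options.sh, each variable assigned above that line doubles as a command-line option. A minimal usage sketch, assuming the standard Kaldi parse_options.sh semantics (option names map onto variable names):

    bash local/data.sh --stage 0 --stop_stage 1 --nbpe 8000 --bpemode bpe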
@@ -38,75 +40,163 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
         exit 1
     fi
 
-    # generate manifests
-    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
-    --manifest_prefix="data/manifest" \
-    --src_dir="${data_dir}"
-
-    echo "Complete raw data pre-process."
+    # # extract data
+    # echo "data Extraction"
+    # python3 local/ted_en_zh.py \
+    # --tgt-dir=${target_dir} \
+    # --src-dir=${data_dir}
 fi
 
+prep_dir=${target_dir}/data_prep
+
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # compute mean and stddev for normalizer
-    num_workers=$(nproc)
-    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
-    --manifest_path="data/manifest.train.raw" \
-    --num_samples=-1 \
-    --spectrum_type="fbank" \
-    --feat_dim=80 \
-    --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
-    --use_dB_normalization=False \
-    --num_workers=${num_workers} \
-    --output_path="data/mean_std.json"
-
-    if [ $? -ne 0 ]; then
-        echo "Compute mean and stddev failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to make data the following preparation part by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 0: Data preparation"
+    for set in train dev test; do
+    # for set in train; do
+        dst=${target_dir}/${set}
+        for lang in en zh; do
+            if [ ${lang} = 'en' ]; then
+                echo "remove punctuation $lang"
+                # remove punctuation
+                local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
+            else
+                cp ${dst}/${lang}.org ${dst}/${lang}.raw
+            fi
+            paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
+        done
+        # error check
+        n=$(cat ${dst}/.yaml | wc -l)
+        n_en=$(cat ${dst}/en.raw | wc -l)
+        n_tgt=$(cat ${dst}/zh.raw | wc -l)
+        [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
+        [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;
+        echo "done text processing"
+        cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp
+        cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk
+        cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt
+        rm -rf ${prep_dir}/${set}.en-zh
+        mkdir -p ${prep_dir}/${set}.en-zh
+        echo "remove duplicate lines..."
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \
+            | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \
+            | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
+        reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh
+        echo "done wav processing"
+        for l in en zh; do
+            cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l}
+        done
+        utils/fix_data_dir.sh --utt_extra_files \
+            "text.en text.zh" \
+            ${prep_dir}/${set}.en-zh
+    done
 fi
 
+feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir}
+feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
+feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}
+
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="${dict_dir}/vocab.txt" \
-    --text_keys 'text' 'text1' \
-    --manifest_paths="data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to design training and dev sets by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 1: Feature Generation"
+    fbankdir=data/fbank
+    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
+    for x in train dev test; do
+        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+            ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir}
+    done
+
+    echo "speed perturbation"
+    utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh
+    utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh
+    utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh
+    utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \
+        ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh
+    rm -r ${prep_dir}/temp*.en-zh
+    utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh
+    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+        ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}
+    for lang in en zh; do
+        cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang}
+        for p in "sp0.9-" "sp1.0-" "sp1.1-"; do
+            awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map
+            utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >> ${prep_dir}/train_sp.en-zh/text.${lang}
+        done
+    done
+
+    for x in train_sp dev test; do
+        local/divide_lang.sh ${prep_dir}/${x}.en-zh zh
+    done
+
+    for x in train_sp dev; do
+        # remove utt having more than 3000 frames
+        # remove utt having more than 400 characters
+        for lang in zh en; do
+            remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp
+        done
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.${lang}.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2
+        comm -12 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist
+        for lang in zh en; do
+            reduce_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang}
+            utils/fix_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}
+        done
+        rm -rf ${prep_dir}/${x}.en-zh.*.tmp
+    done
+
+    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark
+    dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
+        ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir}
+    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
+        ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir}
+    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
+        ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir}
 fi
 
+dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt
+nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
+bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}
+
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for set in train dev test; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "spm" \
-        --spm_model_prefix ${bpeprefix} \
-        --vocab_path="${dict_dir}/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
-
-        if [ $? -ne 0 ]; then
-            echo "Formt mnaifest failed. Terminated."
-            exit 1
-        fi
-    } &
-    done
-    wait
+    echo "stage 2: Dictionary and Json Data Preparation"
+    # echo "make a non-linguistic symbol list for all languages"
+    # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms}
+    # cat ${nlsyms}
+
+    echo "make a joint source and target dictionary"
+    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
+    offset=$(wc -l < ${dict})
+    grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt
+    spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
+    spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
+    wc -l ${dict}
+
+    echo "make json files"
+    data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json
+
+    echo "update json (add source references)"
+    # update json (add source references)
+    for x in ${train_set} ${train_dev}; do
+        feat_dir=${dumpdir}/${x}/delta${do_delta}
+        data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en
+        update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
+            ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
+    done
 fi
 
 echo "Ted En-Zh Data preparation done."
 exit 0
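The rewritten recipe above is gated by ${stage} and ${stop_stage}, so its phases can be run incrementally. A sketch of a typical invocation, assuming path.sh has been sourced so that the Kaldi-style helpers used above (steps/make_fbank_pitch.sh, utils/fix_data_dir.sh, dump.sh, data2json.sh) resolve on PATH:

    . ./path.sh
    bash local/data.sh --stage 0 --stop_stage 0   # text cleanup, Kaldi-style data dirs
    bash local/data.sh --stage 1 --stop_stage 1   # fbank+pitch, speed perturbation, CMVN, dump
    bash local/data.sh --stage 2 --stop_stage 2   # joint BPE dictionary, json manifests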
examples/ted_en_zh/st1/path.sh
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 export PYTHONDONTWRITEBYTECODE=1
@@ -13,3 +13,11 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
 MODEL=u2_st
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
+
+# Kaldi
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
+[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
+
+export train_cmd="run.pl"
\ No newline at end of file
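Note that the added Kaldi block only prints a warning when common_path.sh is missing; it does not abort. A minimal pre-flight check before running the recipe (a sketch, assuming Kaldi is expected under ${MAIN_ROOT}/tools/kaldi) could be:

    . ./path.sh
    # compute-cmvn-stats is one of the Kaldi binaries the recipe needs in stage 1
    command -v compute-cmvn-stats >/dev/null 2>&1 \
        || echo "Kaldi not built under ${MAIN_ROOT}/tools/kaldi; stage 1 will fail" >&2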