Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
3f13797a
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3f13797a
编写于
11月 03, 2017
作者:
X
Xinghai Sun
提交者:
GitHub
11月 03, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request
#396
from pkuyym/fix-393
Give option to disable converting from transcription text to ids.
上级
797bac45
a0843941
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
28 additions
and
24 deletions
+28
-24
data_utils/data.py
data_utils/data.py
+12
-5
data_utils/featurizer/speech_featurizer.py
data_utils/featurizer/speech_featurizer.py
+5
-3
deploy/demo_server.py
deploy/demo_server.py
+2
-1
infer.py
infer.py
+3
-5
test.py
test.py
+3
-5
tools/tune.py
tools/tune.py
+3
-5
未找到文件。
data_utils/data.py
浏览文件 @
3f13797a
...
...
@@ -55,6 +55,10 @@ class DataGenerator(object):
:type num_threads: int
:param random_seed: Random seed.
:type random_seed: int
:param keep_transcription_text: If set to True, transcription text will
be passed forward directly without
converting to index sequence.
:type keep_transcription_text: bool
"""
def
__init__
(
self
,
...
...
@@ -69,7 +73,8 @@ class DataGenerator(object):
specgram_type
=
'linear'
,
use_dB_normalization
=
True
,
num_threads
=
multiprocessing
.
cpu_count
()
//
2
,
random_seed
=
0
):
random_seed
=
0
,
keep_transcription_text
=
False
):
self
.
_max_duration
=
max_duration
self
.
_min_duration
=
min_duration
self
.
_normalizer
=
FeatureNormalizer
(
mean_std_filepath
)
...
...
@@ -84,6 +89,7 @@ class DataGenerator(object):
use_dB_normalization
=
use_dB_normalization
)
self
.
_num_threads
=
num_threads
self
.
_rng
=
random
.
Random
(
random_seed
)
self
.
_keep_transcription_text
=
keep_transcription_text
self
.
_epoch
=
0
# for caching tar files info
self
.
_local_data
=
local
()
...
...
@@ -97,8 +103,8 @@ class DataGenerator(object):
:type filename: basestring | file
:param transcript: Transcription text.
:type transcript: basestring
:return: Tuple of audio feature tensor and
list of token ids for
transcription
.
:return: Tuple of audio feature tensor and
data of transcription part,
where transcription part could be token ids or text
.
:rtype: tuple of (2darray, list)
"""
if
filename
.
startswith
(
'tar:'
):
...
...
@@ -107,9 +113,10 @@ class DataGenerator(object):
else
:
speech_segment
=
SpeechSegment
.
from_file
(
filename
,
transcript
)
self
.
_augmentation_pipeline
.
transform_audio
(
speech_segment
)
specgram
,
text_ids
=
self
.
_speech_featurizer
.
featurize
(
speech_segment
)
specgram
,
transcript_part
=
self
.
_speech_featurizer
.
featurize
(
speech_segment
,
self
.
_keep_transcription_text
)
specgram
=
self
.
_normalizer
.
apply
(
specgram
)
return
specgram
,
text_ids
return
specgram
,
transcript_part
def
batch_reader_creator
(
self
,
manifest_path
,
...
...
data_utils/featurizer/speech_featurizer.py
浏览文件 @
3f13797a
...
...
@@ -60,12 +60,12 @@ class SpeechFeaturizer(object):
target_dB
=
target_dB
)
self
.
_text_featurizer
=
TextFeaturizer
(
vocab_filepath
)
def
featurize
(
self
,
speech_segment
):
def
featurize
(
self
,
speech_segment
,
keep_transcription_text
):
"""Extract features for speech segment.
1. For audio parts, extract the audio features.
2. For transcript parts,
convert text string to a list of token indices
in char-level.
2. For transcript parts,
keep the original text or convert text string
to a list of token indices
in char-level.
:param audio_segment: Speech segment to extract features from.
:type audio_segment: SpeechSegment
...
...
@@ -74,6 +74,8 @@ class SpeechFeaturizer(object):
:rtype: tuple
"""
audio_feature
=
self
.
_audio_featurizer
.
featurize
(
speech_segment
)
if
keep_transcription_text
:
return
audio_feature
,
speech_segment
.
transcript
text_ids
=
self
.
_text_featurizer
.
featurize
(
speech_segment
.
transcript
)
return
audio_feature
,
text_ids
...
...
deploy/demo_server.py
浏览文件 @
3f13797a
...
...
@@ -146,7 +146,8 @@ def start_server():
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
1
)
num_threads
=
1
,
keep_transcription_text
=
True
)
# prepare ASR model
ds2_model
=
DeepSpeech2Model
(
vocab_size
=
data_generator
.
vocab_size
,
...
...
infer.py
浏览文件 @
3f13797a
...
...
@@ -68,7 +68,8 @@ def infer():
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
1
)
num_threads
=
1
,
keep_transcription_text
=
True
)
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
infer_manifest
,
batch_size
=
args
.
num_samples
,
...
...
@@ -102,10 +103,7 @@ def infer():
num_processes
=
args
.
num_proc_bsearch
)
error_rate_func
=
cer
if
args
.
error_rate_type
==
'cer'
else
wer
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
infer_data
]
target_transcripts
=
[
transcript
for
_
,
transcript
in
infer_data
]
for
target
,
result
in
zip
(
target_transcripts
,
result_transcripts
):
print
(
"
\n
Target Transcription: %s
\n
Output Transcription: %s"
%
(
target
,
result
))
...
...
test.py
浏览文件 @
3f13797a
...
...
@@ -69,7 +69,8 @@ def evaluate():
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_proc_data
)
num_threads
=
args
.
num_proc_data
,
keep_transcription_text
=
True
)
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
test_manifest
,
batch_size
=
args
.
batch_size
,
...
...
@@ -103,10 +104,7 @@ def evaluate():
vocab_list
=
vocab_list
,
language_model_path
=
args
.
lang_model_path
,
num_processes
=
args
.
num_proc_bsearch
)
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
infer_data
]
target_transcripts
=
[
transcript
for
_
,
transcript
in
infer_data
]
for
target
,
result
in
zip
(
target_transcripts
,
result_transcripts
):
error_sum
+=
error_rate_func
(
target
,
result
)
num_ins
+=
1
...
...
tools/tune.py
浏览文件 @
3f13797a
...
...
@@ -87,7 +87,8 @@ def tune():
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_proc_data
)
num_threads
=
args
.
num_proc_data
,
keep_transcription_text
=
True
)
audio_data
=
paddle
.
layer
.
data
(
name
=
"audio_spectrogram"
,
...
...
@@ -163,10 +164,7 @@ def tune():
for
i
in
xrange
(
len
(
infer_data
))
]
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
infer_data
]
target_transcripts
=
[
transcript
for
_
,
transcript
in
infer_data
]
num_ins
+=
len
(
target_transcripts
)
# grid search
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录