Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
疯人忠
PocketSphinx Speech Recognition
提交
6b4e4d79
P
PocketSphinx Speech Recognition
项目概览
疯人忠
/
PocketSphinx Speech Recognition
大约 1 年 前同步成功
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PocketSphinx Speech Recognition
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
6b4e4d79
编写于
10月 17, 2020
作者:
Z
Zhong-master
提交者:
GitHub
10月 17, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Update README.md
上级
62e19413
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
0 addition
and
287 deletion
+0
-287
README.md
README.md
+0
-287
未找到文件。
README.md
浏览文件 @
6b4e4d79
...
@@ -39,82 +39,6 @@
...
@@ -39,82 +39,6 @@
通过 pyaudio 和 scipy.fftpack 实现;
通过 pyaudio 和 scipy.fftpack 实现;
```
python
def recording(filename, time, threshold=7000):
    """Record audio from the microphone into a WAV file.

    If ``time`` > 0, records for exactly ``time`` seconds; otherwise keeps
    recording and stops automatically once roughly one second of
    mostly-quiet input (average FFT magnitude below ``threshold``) is seen.

    :param filename: output WAV file name
    :param time: recording duration in seconds; <= 0 enables automatic stop
    :param threshold: amplitude threshold used for auto-stop (default 7000)
    :return: None
    """
    CHUNK = 1024                # samples per buffer
    FORMAT = pyaudio.paInt16    # 16-bit signed samples
    CHANNELS = 1                # mono
    RATE = 16000                # sampling rate (Hz)
    RECORD_SECONDS = time
    WAVE_OUTPUT_FILENAME = filename

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
    # playsound('answered_01.wav')  # voice feedback, enable if needed
    print("录音中...")
    frames = []
    if time > 0:
        # Fixed-duration recording.
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))
    else:
        stopflag = 0   # loud chunks seen in the current window
        stopflag2 = 0  # quiet chunks seen in the current window
        while True:
            data = stream.read(CHUNK)
            rt_data = np.frombuffer(data, np.dtype('<i2'))
            fft_temp_data = fftpack.fft(rt_data, rt_data.size, overwrite_x=True)
            fft_data = np.abs(fft_temp_data)[0:fft_temp_data.size // 2 + 1]
            # Average spectral magnitude of this chunk (hoisted: the original
            # computed sum(fft_data) // len(fft_data) twice per iteration).
            level = sum(fft_data) // len(fft_data)
            print(level)
            # Microphone threshold, default 7000.
            if level > threshold:
                stopflag += 1
            else:
                stopflag2 += 1
            oneSecond = int(RATE / CHUNK)
            if stopflag2 + stopflag > oneSecond:
                # Stop when at least two thirds of the last second was quiet.
                if stopflag2 > oneSecond // 3 * 2:
                    break
                else:
                    stopflag2 = 0
                    stopflag = 0
            frames.append(data)
    print("* 录音结束")
    stream.stop_stream()
    stream.close()
    p.terminate()
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


if __name__ == '__main__':
    recording('text_word.wav', time=3)
    # Context manager instead of explicit close (original leaked on error).
    with wave.open('text_word.wav', "rb") as f:
        # getparams() returns all WAV format information at once:
        # nchannels, sampwidth (bytes/sample), framerate, nframes, ...
        params = f.getparams()
        nchannels, sampwidth, framerate, nframes = params[:4]
        # readframes() returns the samples as a raw byte string; equivalent to
        # str_data = f.readframes(f.getnframes()).
        str_data = f.readframes(nframes)
    # Interpret the bytes as 16-bit samples (2 bytes per sample).
    # Fixed: np.fromstring is deprecated/removed in modern NumPy.
    wave_data = np.frombuffer(str_data, dtype=np.short)
    print("采样点数目:" + str(len(wave_data)))
```
### 2、提取音频有效值( 音频预处理 ):
### 2、提取音频有效值( 音频预处理 ):
...
@@ -130,132 +54,6 @@ if __name__ == '__main__':
...
@@ -130,132 +54,6 @@ if __name__ == '__main__':
[
更新短时过零率/github
](
https://github.com/rocketeerli/Computer-VisionandAudio-Lab/tree/master/lab1
)
[
更新短时过零率/github
](
https://github.com/rocketeerli/Computer-VisionandAudio-Lab/tree/master/lab1
)
```
python
def sgn(data):
    """Binary sign function: 1 for non-negative input, 0 for negative input."""
    return 1 if data >= 0 else 0
def calEnergy(wave_data):
    """Compute short-time energy over non-overlapping 256-sample frames.

    Each frame's energy is the sum of squared samples; a trailing partial
    frame contributes one final (smaller) entry.

    :param wave_data: sequence of audio samples
    :return: list with one energy value per frame
    """
    energy = []
    frame_energy = 0  # running energy of the current frame (was `sum`, shadowing the builtin)
    for i, sample in enumerate(wave_data):
        frame_energy += int(sample) * int(sample)
        if (i + 1) % 256 == 0:
            energy.append(frame_energy)
            frame_energy = 0
        elif i == len(wave_data) - 1:
            # Trailing partial frame.
            energy.append(frame_energy)
    return energy
def calZeroCrossingRate(wave_data):
    """Compute the short-time zero-crossing rate over 256-sample frames.

    Sign changes between consecutive samples are counted inside each
    non-overlapping 256-sample frame (the first sample of each frame is
    skipped, so at most 255 transitions are counted) and normalized by 255.
    A trailing partial frame contributes one final entry.

    :param wave_data: sequence of audio samples
    :return: list with one zero-crossing rate (float) per frame
    """
    zeroCrossingRate = []
    crossings = 0  # sign changes in the current frame (was `sum`, shadowing the builtin)
    for i in range(len(wave_data)):
        if i % 256 == 0:
            # Frame boundary: do not compare against the previous frame's last sample.
            continue
        # sgn() maps samples to {0, 1}; |difference| is 1 exactly on a sign change.
        # Builtin abs() replaces np.abs — identical for these integer values.
        crossings = crossings + abs(sgn(wave_data[i]) - sgn(wave_data[i - 1]))
        if (i + 1) % 256 == 0:
            zeroCrossingRate.append(float(crossings) / 255)
            crossings = 0
        elif i == len(wave_data) - 1:
            # Trailing partial frame (still normalized by 255, as originally designed).
            zeroCrossingRate.append(float(crossings) / 255)
    return zeroCrossingRate
# 双门限法进行端点检测
def
endPointDetect
(
energy
,
zeroCrossingRate
):
sum
=
0
energyAverage
=
0
for
en
in
energy
:
sum
=
sum
+
en
energyAverage
=
sum
/
len
(
energy
)
sum
=
0
for
en
in
energy
[:
5
]:
sum
=
sum
+
en
ML
=
sum
/
4
MH
=
energyAverage
/
2
ML
=
(
ML
+
MH
)
/
4
sum
=
0
for
zcr
in
zeroCrossingRate
[:
5
]:
sum
=
float
(
sum
)
+
zcr
Zs
=
sum
/
4
# 过零率阈值
A
,
B
,
C
=
[],
[],
[]
flag
=
0
for
i
in
range
(
len
(
energy
)):
if
len
(
A
)
==
0
and
flag
==
0
and
energy
[
i
]
>
MH
:
A
.
append
(
i
)
flag
=
1
elif
flag
==
0
and
energy
[
i
]
>
MH
and
i
-
21
>
A
[
len
(
A
)
-
1
]:
A
.
append
(
i
)
flag
=
1
elif
flag
==
0
and
energy
[
i
]
>
MH
and
i
-
21
<=
A
[
len
(
A
)
-
1
]:
A
=
A
[:
len
(
A
)
-
1
]
flag
=
1
if
flag
==
1
and
energy
[
i
]
<
MH
:
A
.
append
(
i
)
flag
=
0
print
(
"较高能量阈值,计算后的浊音A:"
+
str
(
A
))
for
j
in
range
(
len
(
A
)):
i
=
A
[
j
]
if
j
%
2
==
1
:
while
i
<
len
(
energy
)
and
energy
[
i
]
>
ML
:
i
=
i
+
1
B
.
append
(
i
)
else
:
while
i
>
0
and
energy
[
i
]
>
ML
:
i
=
i
-
1
B
.
append
(
i
)
print
(
"较低能量阈值,增加一段语言B:"
+
str
(
B
))
# 利用过零率进行最后一步检测
for
j
in
range
(
len
(
B
)):
i
=
B
[
j
]
if
j
%
2
==
1
:
while
i
<
len
(
zeroCrossingRate
)
and
zeroCrossingRate
[
i
]
>=
3
*
Zs
:
i
=
i
+
1
C
.
append
(
i
)
else
:
while
i
>
0
and
zeroCrossingRate
[
i
]
>=
3
*
Zs
:
i
=
i
-
1
C
.
append
(
i
)
print
(
"过零率阈值,最终语音分段C:"
+
str
(
C
))
return
C
if __name__ == '__main__':
    # Frame-level short-time energy of the recording. NOTE(review): wave_data is
    # expected to be defined by the earlier recording/reading snippet — confirm
    # it is in scope when this block runs.
    energy = calEnergy(wave_data)
    with open("./energy/1_en.txt", "w") as f:
        for en in energy:
            f.write(str(en) + "\n")
    # Frame-level short-time zero-crossing rate, dumped alongside the energy.
    zeroCrossingRate = calZeroCrossingRate(wave_data)
    with open("./zeroCrossingRate/1_zero.txt", "w") as f:
        for zcr in zeroCrossingRate:
            f.write(str(zcr) + "\n")
    # Dual-threshold endpoint detection: N holds frame indices delimiting speech.
    N = endPointDetect(energy, zeroCrossingRate)
    # Write the detected segments (256 samples per frame) as raw PCM.
    # NOTE(review): assumes len(N) is even (start/end pairs) — N[i + 1] raises
    # IndexError for odd-length N; confirm endPointDetect always returns pairs.
    # NOTE(review): f.write(num) passes a numpy scalar to a binary file —
    # presumably relying on buffer conversion; verify it writes 2 bytes/sample.
    with open('text_word.pcm', "wb") as f:
        i = 0
        while i < len(N):
            for num in wave_data[N[i] * 256 : N[i + 1] * 256]:
                f.write(num)
            i = i + 2
    # Re-wrap the raw PCM as a standard WAV file.
    with open('text_word.pcm', 'rb') as pcmfile:
        pcmdata = pcmfile.read()
    with wave.open('text.wav', 'wb') as wavfile:
        wavfile.setparams((1, 2, 16000, 0, 'NONE', 'NONE'))
        wavfile.writeframes(pcmdata)
    # Params above: nchannels, sampwidth, framerate, nframes, comptype, compname.
    # The wave module only supports uncompressed data, so the last two are 'NONE'.
```
### 3、下载并安装PocketSphinx:
### 3、下载并安装PocketSphinx:
...
@@ -304,7 +102,6 @@ print(r.recognize_sphinx(audio, language='zh-cn')) # 输出识别到的中文词
...
@@ -304,7 +102,6 @@ print(r.recognize_sphinx(audio, language='zh-cn')) # 输出识别到的中文词
```
```
### 5、使用PocketSphinx进行简单的语音识别:
### 5、使用PocketSphinx进行简单的语音识别:
直接调用已安装好的 PocketSphinx API 即可,注意 SpeechRecognition 在导入时需要写成 speech_recognition 的形式,否则会报错;
直接调用已安装好的 PocketSphinx API 即可,注意 SpeechRecognition 在导入时需要写成 speech_recognition 的形式,否则会报错;
...
@@ -313,92 +110,8 @@ SpeechRecognition 库的具体用法可以参考以下文章:
...
@@ -313,92 +110,8 @@ SpeechRecognition 库的具体用法可以参考以下文章:
[
Python实现语音识别:SpeechRecognition
](
https://blog.csdn.net/alice_tl/article/details/89684369?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522160286569219724838500666%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fall.%2522%257D&request_id=160286569219724838500666&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~first_rank_v2~rank_v28-1-89684369.pc_first_rank_v2_rank_v28&utm_term=SpeechRecognition%E6%94%AF%E6%8C%81%E7%9A%84&spm=1018.2118.3001.4187
)
[
Python实现语音识别:SpeechRecognition
](
https://blog.csdn.net/alice_tl/article/details/89684369?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522160286569219724838500666%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fall.%2522%257D&request_id=160286569219724838500666&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~first_rank_v2~rank_v28-1-89684369.pc_first_rank_v2_rank_v28&utm_term=SpeechRecognition%E6%94%AF%E6%8C%81%E7%9A%84&spm=1018.2118.3001.4187
)
```
python
import speech_recognition as sr

if __name__ == '__main__':
    # Audio file produced by the earlier endpoint-detection step.
    PATH = 'text.wav'
    r = sr.Recognizer()
    with sr.AudioFile(PATH) as source:
        # Read the whole file into an AudioData instance.
        audio = r.record(source)
    try:
        # Recognize once and reuse the result (original called recognize_sphinx twice).
        result = r.recognize_sphinx(audio, language='zh-cn')
        print('你说了:' + result)
        # NOTE(review): time_end2/time_start2 were never defined in this snippet
        # (would raise NameError); set them around the recognition call before
        # re-enabling this timing print.
        # print('识别已结束,一共用时:', time_end2 - time_start2)
        # Fixed: `== '开门' or '西瓜开门'` was always true — a non-empty string is
        # truthy, so the `or` branch made the condition unconditional.
        if result in ('开门', '西瓜开门'):
            # playsound('answered_02.wav')  # voice feedback, enable if needed
            pass  # fixed: a comment alone is not a valid `if` body (SyntaxError)
    except sr.UnknownValueError:
        print('Sphinx could not understand audio')
        # playsound('error_01.wav')  # voice feedback, enable if needed
    except sr.RequestError as e:
        print('Sphinx error; {0}'.format(e))
        # playsound('error_02.wav')  # voice feedback, enable if needed
```
## 附:pyttsx3 文字转语音(识别后电子语音回馈 )
话不多说,直接附上代码:
```
python
import speech_recognition as sr
import pyttsx3

# The block below lists the TTS voices installed on this machine; run it once
# and paste your own voice id into setProperty('voice', value=...) further down.
# engine = pyttsx3.init()
# voices = engine.getProperty('voices')
# for voice in voices:
#     print('id:', voice.id)
#     # engine.setProperty('voice', voice.id)
#     # engine.say('The quick brown fox jumped over the lazy dog.')
#     engine.runAndWait()

# pyttsx3 initialization.
engine = pyttsx3.init()
# The voice below is machine-specific; replace it with one of your installed
# voices (see the listing snippet above) or this call will fail.
# Fixed: raw string — '\S', '\M', '\T' are invalid escape sequences in a
# plain literal (DeprecationWarning); the value itself is unchanged.
engine.setProperty('voice',
                   value=r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\MSTTS_V110_zhTW_YatingM')
engine.setProperty('rate', 180)    # speech rate
engine.setProperty('volume', 1.0)  # volume

PATH = 'text.wav'
r = sr.Recognizer()
# Call the PocketSphinx API via SpeechRecognition.
with sr.AudioFile(PATH) as source:
    # Read the audio file.
    audio = r.record(source)
try:
    # Recognize once and reuse the result (original called recognize_sphinx
    # three times, tripling the recognition cost).
    result = r.recognize_sphinx(audio, language='zh-cn')
    print('你说了:' + result)
    # Fixed: `== '开门' or '西瓜开门'` was always true (non-empty string is truthy).
    if result in ('开门', '西瓜开门'):
        engine.say('你说了' + result)
        engine.runAndWait()
except sr.UnknownValueError:
    print('Sphinx could not understand audio')
    engine.say('哎呀,我听不懂你在说什么呀,要不要再说一次啊!')
    engine.runAndWait()
except sr.RequestError as e:
    print('Sphinx error; {0}'.format(e))
    engine.say('哎呀,出错了')
    engine.runAndWait()
engine.stop()
```
注意:这里的识别表现出来的反应基本在 7s 左右,其实不是识别的速度慢,而是 pyttsx3 的语音回馈慢,自定义词的识别速度与您自定义词库的大小有关,一般自定义词库的识别速度在 1~2s 左右,当然,要想提高 pyttsx3 语音回馈的反应速度也有其他的解决方法,就是将您希望用到的回馈语音先保存下来,提取有效片段,在语音识别完成后利用 Playsound库进行播放也可以达到高速反应的目的,保存命令如下:
注意:这里的识别表现出来的反应基本在 7s 左右,其实不是识别的速度慢,而是 pyttsx3 的语音回馈慢,自定义词的识别速度与您自定义词库的大小有关,一般自定义词库的识别速度在 1~2s 左右,当然,要想提高 pyttsx3 语音回馈的反应速度也有其他的解决方法,就是将您希望用到的回馈语音先保存下来,提取有效片段,在语音识别完成后利用 Playsound库进行播放也可以达到高速反应的目的,保存命令如下:
```
python
# Pre-render a feedback phrase to a WAV file so it can later be played with
# playsound instead of waiting for pyttsx3's slow live synthesis.
engine = pyttsx3.init()
# Fixed: raw string — '\S', '\M', '\T' are invalid escape sequences in a
# plain literal (DeprecationWarning); the value itself is unchanged.
engine.setProperty('voice',
                   value=r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\MSTTS_V110_zhTW_YatingM')
engine.setProperty('rate', 180)    # speech rate
engine.setProperty('volume', 1.0)  # volume
engine.save_to_file('哦', 'answered_02.wav')
engine.runAndWait()
engine.stop()
```
## 至此,Python基于PocketSphinx实现简单语音识别项目结束
## 至此,Python基于PocketSphinx实现简单语音识别项目结束
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录