Recording is implemented with pyaudio and scipy.fftpack:
```python
import wave

import numpy as np
import pyaudio
from scipy import fftpack
# from playsound import playsound  # needed only if the voice-feedback lines are enabled


def recording(filename, time, threshold=7000):
    """
    If time is set, record for that many seconds; otherwise the end of the
    recording is detected with the threshold. Threshold-based detection has
    drawbacks -- run the code to see how it behaves in practice.
    :param filename: output file name
    :param time: recording duration in seconds; if 0, detect the end of speech automatically
    :param threshold: threshold used to decide that the recording has ended
    :return:
    """
    CHUNK = 1024                     # samples per buffer
    FORMAT = pyaudio.paInt16         # sample format
    CHANNELS = 1                     # number of channels
    RATE = 16000                     # sampling rate
    RECORD_SECONDS = time            # recording duration
    WAVE_OUTPUT_FILENAME = filename  # output audio file name
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    # playsound('answered_01.wav')  # voice feedback, enable if needed
    print("Recording...")
    frames = []
    if time > 0:
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)
    else:
        stopflag = 0
        stopflag2 = 0
        while True:
            data = stream.read(CHUNK)
            rt_data = np.frombuffer(data, np.dtype('<i2'))
            fft_temp_data = fftpack.fft(rt_data, rt_data.size, overwrite_x=True)
            fft_data = np.abs(fft_temp_data)[0:fft_temp_data.size // 2 + 1]
            print(sum(fft_data) // len(fft_data))  # current level, compared against the threshold
            # microphone threshold, default 7000
            if sum(fft_data) // len(fft_data) > threshold:
                stopflag += 1
            else:
                stopflag2 += 1
            oneSecond = int(RATE / CHUNK)
            if stopflag2 + stopflag > oneSecond:
                if stopflag2 > oneSecond // 3 * 2:
                    break
                else:
                    stopflag2 = 0
                    stopflag = 0
            frames.append(data)
    print("* Recording finished")
    stream.stop_stream()
    stream.close()
    p.terminate()
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


if __name__ == '__main__':
    recording('text_word.wav', time=3)
    f = wave.open('text_word.wav', "rb")
    # getparams() returns all of the WAV file's format information at once
    params = f.getparams()
    # nchannels: number of channels // sampwidth: bytes per sample // framerate: sampling rate // nframes: number of frames
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = f.readframes(nframes)  # readframes() reads the raw frames // str_data is a bytes string
    # the above can also be written as str_data = f.readframes(f.getnframes())
    # convert to a 16-bit integer array (each sample occupies two bytes)
    wave_data = np.frombuffer(str_data, dtype=np.short)
    print("Number of samples: " + str(len(wave_data)))  # should equal the number of frames
    f.close()
```
### 2. Extracting the Valid Audio Segment (Audio Preprocessing):
[Updated short-time zero-crossing rate (GitHub)](https://github.com/rocketeerli/Computer-VisionandAudio-Lab/tree/master/lab1)
```python
import wave

import numpy as np


def sgn(data):
    if data >= 0:
        return 1
    else:
        return 0


# Short-time energy, computed over frames of 256 samples
def calEnergy(wave_data):
    energy = []
    sum = 0
    for i in range(len(wave_data)):
        sum = sum + (int(wave_data[i]) * int(wave_data[i]))
        if (i + 1) % 256 == 0:
            energy.append(sum)
            sum = 0
        elif i == len(wave_data) - 1:
            energy.append(sum)
    return energy


# Short-time zero-crossing rate, computed over frames of 256 samples
def calZeroCrossingRate(wave_data):
    zeroCrossingRate = []
    sum = 0
    for i in range(len(wave_data)):
        if i % 256 == 0:
            continue
        sum = sum + np.abs(sgn(wave_data[i]) - sgn(wave_data[i - 1]))
        if (i + 1) % 256 == 0:
            zeroCrossingRate.append(float(sum) / 255)
            sum = 0
        elif i == len(wave_data) - 1:
            zeroCrossingRate.append(float(sum) / 255)
    return zeroCrossingRate


# Endpoint detection with the double-threshold method
def endPointDetect(energy, zeroCrossingRate):
    sum = 0
    energyAverage = 0
    for en in energy:
        sum = sum + en
    energyAverage = sum / len(energy)
    sum = 0
    for en in energy[:5]:
        sum = sum + en
    ML = sum / 4            # lower energy threshold, estimated from the first frames
    MH = energyAverage / 2  # higher energy threshold
    ML = (ML + MH) / 4
    sum = 0
    for zcr in zeroCrossingRate[:5]:
        sum = float(sum) + zcr
    Zs = sum / 4  # zero-crossing rate threshold
    A, B, C = [], [], []
    # Step 1: find voiced segments using the higher energy threshold MH
    flag = 0
    for i in range(len(energy)):
        if len(A) == 0 and flag == 0 and energy[i] > MH:
            A.append(i)
            flag = 1
        elif flag == 0 and energy[i] > MH and i - 21 > A[len(A) - 1]:
            A.append(i)
            flag = 1
        elif flag == 0 and energy[i] > MH and i - 21 <= A[len(A) - 1]:
            A = A[:len(A) - 1]
            flag = 1
        if flag == 1 and energy[i] < MH:
            A.append(i)
            flag = 0
    print("Higher energy threshold, voiced segments A: " + str(A))
    # Step 2: extend each segment outwards using the lower energy threshold ML
    for j in range(len(A)):
        i = A[j]
        if j % 2 == 1:
            while i < len(energy) and energy[i] > ML:
                i = i + 1
            B.append(i)
        else:
            while i > 0 and energy[i] > ML:
                i = i - 1
            B.append(i)
    print("Lower energy threshold, extended segments B: " + str(B))
    # Step 3: refine the boundaries using the zero-crossing rate
    for j in range(len(B)):
        i = B[j]
        if j % 2 == 1:
            while i < len(zeroCrossingRate) and zeroCrossingRate[i] >= 3 * Zs:
                i = i + 1
            C.append(i)
        else:
            while i > 0 and zeroCrossingRate[i] >= 3 * Zs:
                i = i - 1
            C.append(i)
    print("Zero-crossing rate threshold, final segments C: " + str(C))
    return C


if __name__ == '__main__':
    # wave_data is the sample array produced by the recording step in section 1
    energy = calEnergy(wave_data)
    with open("./energy/1_en.txt", "w") as f:  # the ./energy directory must already exist
        for en in energy:
            f.write(str(en) + "\n")
    zeroCrossingRate = calZeroCrossingRate(wave_data)
    with open("./zeroCrossingRate/1_zero.txt", "w") as f:  # the ./zeroCrossingRate directory must already exist
        for zcr in zeroCrossingRate:
            f.write(str(zcr) + "\n")
    N = endPointDetect(energy, zeroCrossingRate)
    # Write the detected segments (pairs of frame indices) to a raw PCM file
    with open('text_word.pcm', "wb") as f:
        i = 0
        while i < len(N):
            for num in wave_data[N[i] * 256: N[i + 1] * 256]:
                f.write(num)
            i = i + 2
    # Wrap the raw PCM data in a WAV container
    with open('text_word.pcm', 'rb') as pcmfile:
        pcmdata = pcmfile.read()
    with wave.open('text.wav', 'wb') as wavfile:
        wavfile.setparams((1, 2, 16000, 0, 'NONE', 'NONE'))
        wavfile.writeframes(pcmdata)
    # setparams arguments: nchannels, sampwidth, framerate, nframes, comptype, compname
    # the wave module only supports uncompressed data, so the last two fields can stay 'NONE'
```
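If this script is run on its own rather than as a continuation of the recording step, `wave_data` has to be loaded first. A minimal sketch, assuming the file `text_word.wav` produced in section 1 exists:

```python
import wave

import numpy as np

# Read the recorded file back and convert it to a 16-bit sample array,
# mirroring the read-back step at the end of section 1
with wave.open('text_word.wav', 'rb') as f:
    wave_data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.short)
```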
### 3. Downloading and Installing PocketSphinx:
### 5. Simple Speech Recognition with PocketSphinx:
Simply call the already-installed PocketSphinx API. Note that SpeechRecognition must be imported as speech_recognition, otherwise the import will fail;
for detailed usage of the SpeechRecognition library, see the following article:
[Speech recognition in Python with SpeechRecognition (CSDN)](https://blog.csdn.net/alice_tl/article/details/89684369?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522160286569219724838500666%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fall.%2522%257D&request_id=160286569219724838500666&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~first_rank_v2~rank_v28-1-89684369.pc_first_rank_v2_rank_v28&utm_term=SpeechRecognition%E6%94%AF%E6%8C%81%E7%9A%84&spm=1018.2118.3001.4187)
```python
import time

import speech_recognition as sr
# from playsound import playsound  # needed only if the voice-feedback lines are enabled

if __name__ == '__main__':
    PATH = 'text.wav'
    r = sr.Recognizer()
    with sr.AudioFile(PATH) as source:
        audio = r.record(source)
    try:
        time_start2 = time.time()
        result = r.recognize_sphinx(audio, language='zh-cn')
        time_end2 = time.time()
        print('You said: ' + result)
        print('Recognition finished, elapsed time:', time_end2 - time_start2)
        if result in ('开门', '西瓜开门'):  # check whether one of the keywords was recognized
            # playsound('answered_02.wav')  # voice feedback, enable if needed
            pass
    except sr.UnknownValueError:
        print('Sphinx could not understand audio')
        # playsound('error_01.wav')  # voice feedback, enable if needed
    except sr.RequestError as e:
        print('Sphinx error; {0}'.format(e))
        # playsound('error_02.wav')  # voice feedback, enable if needed
```
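Putting the recording and recognition steps together, a minimal end-to-end sketch might look like the following. It assumes the `recording()` function from section 1 is defined in the same script; the file name `command.wav` is only an example.

```python
import speech_recognition as sr

# Record a 3-second clip and recognize it directly with PocketSphinx
recording('command.wav', time=3)
r = sr.Recognizer()
with sr.AudioFile('command.wav') as source:
    audio = r.record(source)
print(r.recognize_sphinx(audio, language='zh-cn'))
```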
## Appendix: pyttsx3 Text-to-Speech (Synthesized Voice Feedback after Recognition)
Without further ado, here is the code:
```python
import pyttsx3
import speech_recognition as sr

# The following snippet lists the narrator voices installed on your machine;
# copy the id of the voice you want into value='' when configuring the narrator
# engine = pyttsx3.init()
# voices = engine.getProperty('voices')
# for voice in voices:
#     print('id:', voice.id)
#     # engine.setProperty('voice', voice.id)
#     # engine.say('The quick brown fox jumped over the lazy dog.')
# engine.runAndWait()

# Initialize pyttsx3
engine = pyttsx3.init()
# Basic settings
# The narrator voice below is one I installed myself; run the snippet above first
# and paste one of your own voice ids into value='', otherwise the code will fail
engine.setProperty('voice',
                   value=r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\MSTTS_V110_zhTW_YatingM')  # narrator voice
engine.setProperty('rate', 180)    # speaking rate
engine.setProperty('volume', 1.0)  # volume

PATH = 'text.wav'
r = sr.Recognizer()  # SpeechRecognition recognizer (PocketSphinx backend)
with sr.AudioFile(PATH) as source:  # read the audio file
    audio = r.record(source)
try:
    result = r.recognize_sphinx(audio, language='zh-cn')  # recognize against the language model
    print('You said: ' + result)
    if result in ('开门', '西瓜开门'):  # check whether one of the keywords was recognized
        engine.say('你说了' + result)
        engine.runAndWait()
except sr.UnknownValueError:
    print('Sphinx could not understand audio')
    engine.say('哎呀,我听不懂你在说什么呀,要不要再说一次啊!')
    engine.runAndWait()
except sr.RequestError as e:
    print('Sphinx error; {0}'.format(e))
    engine.say('哎呀,出错了')
    engine.runAndWait()
engine.stop()
```
Note: the response time observed here is roughly 7 s. This is not because recognition itself is slow; the delay comes from the pyttsx3 voice feedback. The recognition speed for custom words depends on the size of your custom dictionary and is typically around 1-2 s. To speed up the voice feedback, you can also save the feedback phrases you want to use ahead of time, extract the useful segments, and play them back with the playsound library after recognition finishes, which gives an equally fast response. The save command is as follows:
```python
import pyttsx3

engine = pyttsx3.init()
engine.setProperty('voice',
                   value=r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\MSTTS_V110_zhTW_YatingM')
engine.setProperty('rate', 180)
engine.setProperty('volume', 1.0)
engine.save_to_file('哦', 'answered_02.wav')  # synthesize the phrase and save it to a WAV file
engine.runAndWait()
engine.stop()
```
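For the playback half of this approach, a minimal sketch of playing the pre-generated clip after a successful recognition, assuming the playsound package is installed:

```python
from playsound import playsound

# Play the pre-generated feedback clip; much faster than synthesizing it on the fly
playsound('answered_02.wav')
```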
## This concludes the simple Python speech recognition project based on PocketSphinx