Floating point exception (#272) · Issue · PaddlePaddle / DeepSpeech

Floating point exception

Created by: johntyty912

I tried to write a script that records my voice and stops recording when I finish speaking, but it didn't work: when the script runs feature = data_generator.process_utterance(filename, "") in test.py, I get a floating point exception. However, when I run demo_server.py and demo_client.py, everything works perfectly. Please help! Update: I found that my speech_segment.samples contains only 0.0s. What is the problem? Update 2: I found that this happens because my record2.py cannot record anything. Any suggestions? Here's my code:

record2.py

from sys import byteorder
from array import array
from struct import pack

import pyaudio
import wave

# Amplitude boundary between "silence" and "sound"; is_silent() and trim()
# compare sample values against it.
# NOTE(review): samples are captured as 32-bit ints (see FORMAT), so 1e9 is
# close to half of full scale -- confirm that normal speech on the target
# microphone can actually exceed it.
THRESHOLD = 1000000000
# Number of frames requested from the input stream on every read.
CHUNK_SIZE = 1024
# PyAudio sample format constant used to open the stream and to look up the
# sample width written to the WAV header.
FORMAT = pyaudio.paInt32
# Sampling rate in Hz, used for the stream, the silence padding length and
# the WAV header.
RATE = 16000

def is_silent(snd_data):
    """Return True if the chunk's peak magnitude is below THRESHOLD.

    The magnitude (absolute value) of each sample is used, so a chunk whose
    loudest excursion is negative is judged the same way trim() judges it.
    An empty chunk is treated as silent instead of raising ValueError.

    Args:
        snd_data: iterable of integer samples.

    Returns:
        bool: True when every sample satisfies abs(sample) < THRESHOLD.
    """
    peak = 0
    for sample in snd_data:
        peak = max(peak, abs(sample))
    return peak < THRESHOLD

def normalize(snd_data):
    """Scale the samples so that the loudest one reaches full scale.

    Every sample is multiplied by the same gain (MAXIMUM / peak magnitude)
    and truncated toward zero.

    Args:
        snd_data: sequence of integer samples, e.g. an ``array('i')``. It is
            iterated twice, so it must not be a one-shot iterator.

    Returns:
        A new ``array('i')`` of the same length. Empty or all-zero input is
        returned as an unchanged copy, because there is no peak to scale to.
    """
    # Largest value a signed 32-bit sample can hold (2**31 - 1). Scaling a
    # positive peak to 2**31 would overflow array('i') on platforms where a
    # C int is 32 bits.
    MAXIMUM = 2147483647

    peak = 0
    for sample in snd_data:
        peak = max(peak, abs(sample))
    if peak == 0:
        # Nothing to amplify; dividing by the peak would raise
        # ZeroDivisionError.
        return array('i', snd_data)

    times = float(MAXIMUM) / peak
    return array('i', [int(sample * times) for sample in snd_data])

def trim(snd_data):
    """Strip the quiet samples from both ends of 'snd_data'.

    A sample counts as quiet while its magnitude does not exceed THRESHOLD.
    Everything between the first and the last louder sample (inclusive) is
    returned as a new array('i'); the result is empty when no sample is
    loud enough.
    """
    def _strip_leading(samples):
        # Copy 'samples' starting at the first one louder than THRESHOLD.
        kept = array('i')
        loud_seen = False
        for sample in samples:
            loud_seen = loud_seen or abs(sample) > THRESHOLD
            if loud_seen:
                kept.append(sample)
        return kept

    # Drop the leading quiet part.
    front_trimmed = _strip_leading(snd_data)

    # Drop the trailing quiet part by running the same pass over the
    # reversed data and flipping the result back.
    back_trimmed = _strip_leading(reversed(front_trimmed))
    back_trimmed.reverse()
    return back_trimmed

def add_silence(snd_data, seconds):
    """Return 'snd_data' with 'seconds' (float) of zero samples on each side.

    The padding length is int(seconds*RATE) samples; the result is a new
    array('i') and 'snd_data' itself is not modified.
    """
    pad_len = int(seconds*RATE)
    padded = array('i', [0]) * pad_len
    padded.extend(snd_data)
    padded.extend(array('i', [0]) * pad_len)
    return padded

def record():
    """
    Record a word or words from the microphone and return
    (sample_width, samples).

    Chunks of CHUNK_SIZE frames are read until sound has started (a chunk
    that is not silent according to is_silent) and more than 10 silent
    chunks have been seen afterwards. The loop does not time out: if no
    chunk is ever loud enough, it keeps listening.

    The captured audio is then normalized, trimmed of silence at the start
    and end, and padded with 0.5 seconds of blank sound to make sure VLC
    et al can play it without getting chopped off.

    Returns:
        tuple: (sample_width, samples) where sample_width is the size in
        bytes reported by PyAudio for FORMAT and samples is an array('i')
        of integer samples (FORMAT is a 32-bit format, not 16-bit shorts).
    """
    p = pyaudio.PyAudio()
    try:
        sample_width = p.get_sample_size(FORMAT)
        # NOTE(review): output=True also opens a playback stream that this
        # function never writes to -- confirm it is really needed.
        stream = p.open(format=FORMAT, channels=1, rate=RATE,
            input=True, output=True,
            frames_per_buffer=CHUNK_SIZE)
        try:
            num_silent = 0
            snd_started = False

            r = array('i')
            print('listening')
            while 1:
                # Reinterpret the raw bytes of one chunk as integers.
                snd_data = array('i', stream.read(CHUNK_SIZE))
                # Debug output: signed peak of the chunk just read.
                print(max(snd_data))
                if byteorder == 'big':
                    snd_data.byteswap()
                r.extend(snd_data)
                silent = is_silent(snd_data)

                if silent and snd_started:
                    num_silent += 1
                elif not silent and not snd_started:
                    snd_started = True

                if snd_started and num_silent > 10:
                    break
        finally:
            # Release the audio device even if reading fails.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r

def record_to_file(path):
    """Record from the microphone and write the result to 'path'.

    The samples returned by record() are packed as little-endian integers
    and written as a single-channel WAV file at RATE Hz.

    Args:
        path: destination filename passed to wave.open.
    """
    sample_width, data = record()
    # A repeat count ('<Ni') packs the same bytes as 'i' repeated N times
    # without building an N-character format string.
    data = pack('<%di' % len(data), *data)

    wf = wave.open(path, 'wb')
    try:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(data)
    finally:
        # Close the file even if a header or frame write fails.
        wf.close()

# Script entry point: when run directly, capture one utterance from the
# microphone and save it as demo.wav in the current directory.
if __name__ == '__main__':
    print("please speak a word into the microphone")
    record_to_file('demo.wav')
    print("done - result written to demo.wav")

test.py

from record2 import record_to_file

"""Server-end for the ASR demo."""
import os
import time
import random
import argparse
import functools
from time import gmtime, strftime
import SocketServer
import struct
import wave
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
from model_utils.model import DeepSpeech2Model
from data_utils.utility import read_manifest
from utils.utility import add_arguments, print_arguments

# Command-line configuration for the script.
# NOTE(review): the triple-quoted string near the top of this file comes
# after an import statement, so it is not the module docstring and __doc__
# is None here -- the parser gets no description.
parser = argparse.ArgumentParser(description=__doc__)
# add_arg(name, type, default, help) forwards to the project helper
# add_arguments with this parser pre-bound; presumably it registers a
# "--name" option -- verify against utils.utility.
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('host_port',        int,    8086,    "Server's IP port.")
add_arg('beam_size',        int,    300,    "Beam search width.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    1024,   "# of recurrent cells per layer.")
add_arg('alpha',            float,  2.6,   "Coef of LM for beam search.")
add_arg('beta',             float,  5.0,   "Coef of WC for beam search.")
add_arg('cutoff_prob',      float,  0.99,    "Cutoff probability for pruning.")
add_arg('cutoff_top_n',     int,    40,     "Cutoff number for pruning.")
add_arg('use_gru',          bool,   True,  "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   False,   "Use GPU or not.")
add_arg('share_rnn_weights',bool,   False,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('host_ip',          str,
        'localhost',
        "Server's IP address.")
add_arg('speech_save_dir',  str,
        'demo_cache',
        "Directory to save demo audios.")
add_arg('warmup_manifest',  str,
        'data/aishell/manifest.test',
        "Filepath of manifest to warm up.")
add_arg('mean_std_path',    str,
        'models/aishell/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'models/aishell/vocab.txt',
        "Filepath of vocabulary.")
add_arg('model_path',       str,
        'models/aishell/params.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path',  str,
        'models/lm/zh_giga.no_cna_cmn.prune01244.klm',
        "Filepath for language model.")
add_arg('decoding_method',  str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices = ['ctc_beam_search', 'ctc_greedy'])
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
# Parsing happens at import time, so the options are read from sys.argv as
# soon as this module is loaded.
args = parser.parse_args()

# prepare data generator
# Built from the vocabulary and mean/std files given on the command line;
# it is used below for its vocab_size/vocab_list and by file_to_transcript
# to turn an audio file into model input.
data_generator = DataGenerator(
    vocab_filepath=args.vocab_path,
    mean_std_filepath=args.mean_std_path,
    augmentation_config='{}',
    specgram_type=args.specgram_type,
    num_threads=1,
    keep_transcription_text=True)

# prepare ASR model
# The layer configuration comes from the command-line options and the
# weights from args.model_path.
ds2_model = DeepSpeech2Model(
    vocab_size=data_generator.vocab_size,
    num_conv_layers=args.num_conv_layers,
    num_rnn_layers=args.num_rnn_layers,
    rnn_layer_size=args.rnn_layer_size,
    use_gru=args.use_gru,
    pretrained_model_path=args.model_path,
    share_rnn_weights=args.share_rnn_weights)

# Vocabulary entries encoded as UTF-8 byte strings, as passed to the decoders.
vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

# The external scorer (language model) is only initialised for beam search
# decoding.
if args.decoding_method == "ctc_beam_search":
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              vocab_list)

# prepare ASR inference handler
def file_to_transcript(filename):
    """Transcribe the audio file 'filename' and return the decoded text.

    The file is converted to a model input with an empty transcript,
    pushed through the network as a batch of one, and decoded with the
    method selected by args.decoding_method. The first (only) entry of the
    decoded batch is returned.
    """
    utterance = data_generator.process_utterance(filename, "")
    batch_probs = ds2_model.infer_batch_probs(
        infer_data=[utterance],
        feeding_dict=data_generator.feeding)

    # Greedy decoding needs nothing beyond the probabilities and vocabulary.
    if args.decoding_method == "ctc_greedy":
        return ds2_model.decode_batch_greedy(
            probs_split=batch_probs,
            vocab_list=vocab_list)[0]

    # Any other method falls through to beam search.
    beam_options = dict(
        probs_split=batch_probs,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
        cutoff_top_n=args.cutoff_top_n,
        vocab_list=vocab_list,
        num_processes=1)
    return ds2_model.decode_batch_beam_search(**beam_options)[0]

# Initialise the PaddlePaddle runtime with a single trainer.
# NOTE(review): the model above is constructed before this call -- confirm
# that paddle.v2 does not require init() to run first.
paddle.init(use_gpu=args.use_gpu, trainer_count=1)

# Script entry point: record one utterance to demo.wav, then print its
# transcript. The parenthesised print form produces the same output as the
# print statement for a single argument and matches the print(...) calls in
# record2.py.
if __name__ == '__main__':
    record_to_file('demo.wav')
    print(file_to_transcript('demo.wav'))

and I got this:

*** Aborted at 1534487245 (unix time) try "date -d @1534487245" if you are using GNU date ***
PC: @                0x0 (unknown)
*** SIGFPE (@0x7f2329804bc3) received by PID 30608 (TID 0x7f232a8bc700) from PID 696273859; stack trace: ***
    @     0x7f232a4d2390 (unknown)
    @     0x7f2329804bc3 log10f
    @     0x7f231599225a (unknown)
    @     0x7f2315aa550c (unknown)
    @     0x7f2315aaed62 (unknown)
    @     0x7f2315aaf46e (unknown)
    @           0x4c15bf PyEval_EvalFrameEx
    @           0x4b9ab6 PyEval_EvalCodeEx
    @           0x4d54b9 (unknown)
    @           0x4a5371 PyObject_CallFunction
    @           0x41cdd7 _PyObject_GenericGetAttrWithDict
    @           0x4bc24b PyEval_EvalFrameEx
    @           0x4b9ab6 PyEval_EvalCodeEx
    @           0x4c16e7 PyEval_EvalFrameEx
    @           0x4b9ab6 PyEval_EvalCodeEx
    @           0x4c16e7 PyEval_EvalFrameEx
    @           0x4b9ab6 PyEval_EvalCodeEx
    @           0x4c1e6f PyEval_EvalFrameEx
    @           0x4b9ab6 PyEval_EvalCodeEx
    @           0x4c1e6f PyEval_EvalFrameEx
    @           0x4c136f PyEval_EvalFrameEx
    @           0x4b9ab6 PyEval_EvalCodeEx
    @           0x4eb30f (unknown)
    @           0x4e5422 PyRun_FileExFlags
    @           0x4e3cd6 PyRun_SimpleFileExFlags
    @           0x493ae2 Py_Main
    @     0x7f232a117830 __libc_start_main
    @           0x4933e9 _start
    @                0x0 (unknown)
Floating point exception (core dumped)

PaddlePaddle / DeepSpeech 1 年多 前同步成功

Floating point exception

PaddlePaddle / DeepSpeech
1 年多前同步成功