未验证 提交 ba812854 编写于 作者: Y YangZhou 提交者: GitHub

Merge pull request #1757 from zh794390558/ws

[speechx] change opt  convert2PCM32 to to_float32, fix shell script
# Examples for SpeechX
* ds2_ol - ds2 streaming test under `aishell-1` test dataset.
The entrypoint is `ds2_ol/aishell/run.sh`
* ds2_ol - ds2 streaming test under `aishell-1` test dataset.
The entrypoint is `ds2_ol/aishell/run.sh`
## How to run
......
# Deepspeech2 Streaming
# Deepspeech2 Streaming ASR
Please go to `aishell` to test it.
* aishell
Deepspeech2 Streaming Decoding under aishell dataset.
* websocket
Streaming ASR with websocket.
* aishell
Streaming Decoding under aishell dataset, for local WER test and so on.
## More
The following are for development and offline testing:
* nnet
* feat
......
......@@ -112,8 +112,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
fi
wfst=$data/wfst/
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
......@@ -122,18 +122,18 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
mv aishell_graph/* $wfst
popd
fi
fi
graph_dir=$wfst/
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$graph_dir/words.txt \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$graph_dir/TLG.fst --max_active=7500 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
......@@ -142,40 +142,21 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
cmvn=$data/cmvn.ark
if [ ! -f $data/split${nj}/1/${aishell_wav_scp} ]; then
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
./local/split_data.sh $data ${data}/${aishell_wav_scp} $aishell_wav_scp $nj
fi
wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi
graph_dir=$wfst/aishell_graph
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
recognizer_test_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \
--convert2PCM32=true \
--to_float32=true \
--streaming_chunk=30 \
--params_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$graph_dir/words.txt \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$graph_dir/TLG.fst --max_active=7500 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer
cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
fi
fi
\ No newline at end of file
......@@ -115,7 +115,7 @@ int main(int argc, char* argv[]) {
flag = feature_cache.Read(&features);
feats.push_back(features);
feature_rows += features.Dim() / feature_cache.Dim();
} while(flag == true && features.Dim() != 0);
} while (flag == true && features.Dim() != 0);
sample_offset += cur_chunk_size;
}
......
......@@ -14,9 +14,7 @@ fi
# input
mkdir -p data
data=$PWD/data
ckpt_dir=$data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
vocb_dir=$ckpt_dir/data/lang_char
# output
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
......@@ -34,4 +32,4 @@ export GLOG_logtostderr=1
# websocket client
websocket_client_main \
--wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
--wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
\ No newline at end of file
......@@ -19,12 +19,26 @@ ckpt_dir=$data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
vocb_dir=$ckpt_dir/data/lang_char/
# output
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
pushd $data
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip aishell_test.zip
popd
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi
if [ ! -f $ckpt_dir/data/mean_std.json ]; then
mkdir -p $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
popd
mkdir -p $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
popd
fi
export GLOG_logtostderr=1
......@@ -49,9 +63,9 @@ websocket_server_main \
--cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \
--streaming_chunk=0.1 \
--convert2PCM32=true \
--to_float32=true \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$data/wfst/words.txt \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$data/wfst/TLG.fst --max_active=7500 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2
......@@ -21,7 +21,7 @@
DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
DEFINE_bool(convert2PCM32, true, "audio convert to pcm32");
DEFINE_bool(to_float32, true, "audio convert to pcm32");
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
......@@ -52,7 +52,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
FeaturePipelineOptions opts;
opts.cmvn_file = FLAGS_cmvn_file;
opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
opts.convert2PCM32 = FLAGS_convert2PCM32;
opts.to_float32 = FLAGS_to_float32;
kaldi::FrameExtractionOptions frame_opts;
frame_opts.frame_length_ms = 20;
frame_opts.frame_shift_ms = 10;
......
......@@ -21,17 +21,18 @@ using kaldi::BaseFloat;
using kaldi::VectorBase;
using kaldi::Vector;
AudioCache::AudioCache(int buffer_size, bool convert2PCM32)
AudioCache::AudioCache(int buffer_size, bool to_float32)
: finished_(false),
capacity_(buffer_size),
capacity_(buffer_size), // unit: sample
size_(0),
offset_(0),
timeout_(1),
convert2PCM32_(convert2PCM32) {
timeout_(1), // ms
to_float32_(to_float32) {
ring_buffer_.resize(capacity_);
}
// Rescales a single sample from the int16 value range to float32 by
// multiplying it with 2^-15.
//
// val: one audio sample whose magnitude is in int16 units.
// Returns the scaled sample.
BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
    // sample type int16, int16->float32
    // 2^-15 spelled as a compile-time constant: same value as
    // 1. / std::pow(2.0, 15), without a library call for every sample.
    constexpr double kInt16ToFloatScale = 1.0 / 32768.0;
    return val * kInt16ToFloatScale;
}
......@@ -43,8 +44,7 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
for (size_t idx = 0; idx < waves.Dim(); ++idx) {
int32 buffer_idx = (idx + offset_ + size_) % ring_buffer_.size();
ring_buffer_[buffer_idx] = waves(idx);
if (convert2PCM32_)
ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
if (to_float32_) ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
}
size_ += waves.Dim();
}
......
......@@ -24,7 +24,7 @@ namespace ppspeech {
class AudioCache : public FrontendInterface {
public:
explicit AudioCache(int buffer_size = 1000 * kint16max,
bool convert2PCM32 = true);
bool to_float32 = true);
virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);
......@@ -50,14 +50,15 @@ class AudioCache : public FrontendInterface {
kaldi::BaseFloat Convert2PCM32(kaldi::BaseFloat val);
std::vector<kaldi::BaseFloat> ring_buffer_;
size_t offset_; // offset in ring_buffer_
size_t size_; // samples in ring_buffer_ now
size_t capacity_; // capacity of ring_buffer_
size_t offset_; // offset in ring_buffer_, begin of data
size_t size_; // samples in ring_buffer_, size of valid data
size_t capacity_; // capacity of ring_buffer_, full size of data buffer,
// unit: sample
bool finished_; // reach audio end
std::mutex mutex_;
std::condition_variable ready_feed_condition_;
kaldi::int32 timeout_; // millisecond
bool convert2PCM32_;
bool to_float32_;
DISALLOW_COPY_AND_ASSIGN(AudioCache);
};
......
......@@ -37,14 +37,17 @@ CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
}
// Hands the incoming waves/feats straight to the wrapped base extractor;
// nothing is computed at this stage.
void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
}
// Reads the next features from the wrapped base extractor into `feats` and
// then runs Compute() on them.
// Returns false (without calling Compute) when the upstream read fails or
// produces an empty vector; returns true otherwise.
bool CMVN::Read(kaldi::Vector<BaseFloat>* feats) {
    // compute feature
    const bool got_frames = base_extractor_->Read(feats) && feats->Dim() != 0;
    if (!got_frames) {
        return false;
    }

    // apply cmvn
    Compute(feats);
    return true;
}
......
......@@ -27,6 +27,7 @@ class DataCache : public FrontendInterface {
public:
// Constructs the cache with finished_ cleared (not finished yet).
explicit DataCache() { finished_ = false; }
// Accept waves/feats: stores `inputs` in data_ by assignment, overwriting
// whatever the cache held before.
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
data_ = inputs;
}
......
......@@ -15,23 +15,56 @@
// wrap the fbank feat of kaldi, todo (SmileGoat)
#include "kaldi/feat/feature-mfcc.h"
#include "kaldi/matrix/kaldi-vector.h"
namespace ppspeech {
class FbankExtractor : FrontendInterface {
// Options for the fbank front end: kaldi frame-extraction settings plus the
// streaming chunk length.
struct FbankOptions {
    kaldi::FrameExtractionOptions frame_opts;
    kaldi::BaseFloat streaming_chunk;  // second

    // Defaults: default-constructed frame options and a 0.1 s streaming chunk.
    // The constructor must carry the struct's own name, and the initializers
    // are listed in member declaration order.
    FbankOptions() : frame_opts(), streaming_chunk(0.1) {}

    // Registers "streaming-chunk" and then the frame-extraction options on
    // the given options interface.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("streaming-chunk",
                       &streaming_chunk,
                       "streaming chunk size, default: 0.1 sec");
        frame_opts.Register(opts);
    }
};
class Fbank : FrontendInterface {
public:
explicit FbankExtractor(const FbankOptions& opts,
share_ptr<FrontendInterface> pre_extractor);
virtual void AcceptWaveform(
const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
virtual size_t Dim() const = 0;
explicit Fbank(const FbankOptions& opts,
unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
remained_wav_.Resize(0);
}
private:
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& wave,
kaldi::Vector<kaldi::BaseFloat>* feat) const;
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
kaldi::Vector<kaldi::BaseFloat>* feats);
// kaldi::FeatureWindowFunction feature_window_funtion_;
// kaldi::BaseFloat hanning_window_energy_;
size_t dim_;
FbankOptions opts_;
std::unique_ptr<FrontendInterface> base_extractor_;
kaldi::Vector<kaldi::BaseFloat> remained_wav_;
int chunk_sample_size_;
DISALLOW_COPY_AND_ASSIGN(Fbank);
};
} // namespace ppspeech
\ No newline at end of file
......@@ -28,11 +28,13 @@ FeatureCache::FeatureCache(FeatureCacheOptions opts,
max_size_ = opts.max_size;
frame_chunk_stride_ = opts.frame_chunk_stride;
frame_chunk_size_ = opts.frame_chunk_size;
timeout_ = opts.timeout; // ms
base_extractor_ = std::move(base_extractor);
dim_ = base_extractor_->Dim();
}
void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
// read inputs
base_extractor_->Accept(inputs);
// feed current data
bool result = false;
......@@ -49,9 +51,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
while (cache_.empty() && base_extractor_->IsFinished() == false) {
// todo refactor: wait
// ready_read_condition_.wait(lock);
int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
// todo replace 1 with timeout_, 1 ms
if (elapsed > 1) {
int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000); // ms
if (elapsed > timeout_) {
return false;
}
usleep(100); // sleep 0.1 ms
......@@ -70,6 +71,8 @@ bool FeatureCache::Compute() {
Vector<BaseFloat> feature;
bool result = base_extractor_->Read(&feature);
if (result == false || feature.Dim() == 0) return false;
// join with remained
int32 joint_len = feature.Dim() + remained_feature_.Dim();
int32 num_chunk =
((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
......@@ -82,6 +85,7 @@ bool FeatureCache::Compute() {
for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
int32 start = chunk_idx * frame_chunk_stride_ * dim_;
Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
SubVector<BaseFloat> tmp(joint_feature.Data() + start,
frame_chunk_size_ * dim_);
......@@ -89,6 +93,7 @@ bool FeatureCache::Compute() {
std::unique_lock<std::mutex> lock(mutex_);
while (cache_.size() >= max_size_) {
// cache full, wait
ready_feed_condition_.wait(lock);
}
......
......@@ -23,8 +23,12 @@ struct FeatureCacheOptions {
int32 max_size;
int32 frame_chunk_size;
int32 frame_chunk_stride;
int32 timeout; // ms
FeatureCacheOptions()
: max_size(kint16max), frame_chunk_size(1), frame_chunk_stride(1) {}
: max_size(kint16max),
frame_chunk_size(1),
frame_chunk_stride(1),
timeout(1) {}
};
class FeatureCache : public FrontendInterface {
......@@ -64,14 +68,15 @@ class FeatureCache : public FrontendInterface {
bool Compute();
int32 dim_;
size_t max_size_;
int32 frame_chunk_size_;
int32 frame_chunk_stride_;
size_t max_size_; // cache capacity
int32 frame_chunk_size_; // window
int32 frame_chunk_stride_; // stride
std::unique_ptr<FrontendInterface> base_extractor_;
kaldi::int32 timeout_; // ms
kaldi::Vector<kaldi::BaseFloat> remained_feature_;
std::unique_ptr<FrontendInterface> base_extractor_;
std::queue<kaldi::Vector<BaseFloat>> cache_; // feature cache
std::mutex mutex_;
std::queue<kaldi::Vector<BaseFloat>> cache_;
std::condition_variable ready_feed_condition_;
std::condition_variable ready_read_condition_;
......
......@@ -20,7 +20,7 @@ using std::unique_ptr;
FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr<FrontendInterface> data_source(
new ppspeech::AudioCache(1000 * kint16max, opts.convert2PCM32));
new ppspeech::AudioCache(1000 * kint16max, opts.to_float32));
unique_ptr<FrontendInterface> linear_spectrogram(
new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
......
......@@ -27,12 +27,12 @@ namespace ppspeech {
struct FeaturePipelineOptions {
std::string cmvn_file;
bool convert2PCM32;
bool to_float32;
LinearSpectrogramOptions linear_spectrogram_opts;
FeatureCacheOptions feature_cache_opts;
FeaturePipelineOptions()
: cmvn_file(""),
convert2PCM32(false),
to_float32(false),
linear_spectrogram_opts(),
feature_cache_opts() {}
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册