Merge pull request #1757 from zh794390558/ws

[speechx] change opt convert2PCM32 to to_float32, fix shell script

Merge pull request #1757 from zh794390558/ws
[speechx] change opt convert2PCM32 to to_float32, fix shell script
ba812854 · YangZhou · GitHub · 3ad43431 · 972f2dd6 · ba812854
16 changed files
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
 # Examples for SpeechX

-* ds2_ol - ds2 streaming test under `aishell-1` test dataset.
-   The entrypoint is `ds2_ol/aishell/run.sh`
+* ds2_ol - ds2 streaming test under `aishell-1` test dataset.  
+The entrypoint is `ds2_ol/aishell/run.sh`


 ## How to run  

--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
-# Deepspeech2 Streaming
+# Deepspeech2 Streaming ASR

-Please go to `aishell` to test it.
-
-* aishell
-Deepspeech2 Streaming Decoding under aishell dataset.
 * websocket
 Streaming ASR with websocket.

+* aishell
+Streaming decoding on the aishell dataset, e.g. for local WER testing.
+
+## More
 The below is for developing and offline testing:
 * nnet
 * feat

--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -112,8 +112,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
 fi

+wfst=$data/wfst/
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    wfst=$data/wfst/
    mkdir -p $wfst
    if [ ! -f $wfst/aishell_graph.zip ]; then
        pushd $wfst
@@ -122,18 +122,18 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        mv aishell_graph/* $wfst
        popd
    fi
+fi

-    graph_dir=$wfst/
-
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    #  TLG decoder
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
    wfst-decoder-ol \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
-        --word_symbol_table=$graph_dir/words.txt \
+        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+        --graph_path=$wfst/TLG.fst --max_active=7500 \
        --acoustic_scale=1.2 \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

@@ -142,40 +142,21 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-
-    cmvn=$data/cmvn.ark
-    if [ ! -f $data/split${nj}/1/${aishell_wav_scp} ]; then
-        cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
-        ./local/split_data.sh $data ${data}/${aishell_wav_scp} $aishell_wav_scp $nj
-    fi
-
-    wfst=$data/wfst/
-    mkdir -p $wfst
-    if [ ! -f $wfst/aishell_graph.zip ]; then
-        pushd $wfst
-        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
-        unzip aishell_graph.zip
-        popd
-    fi
-
-    graph_dir=$wfst/aishell_graph
-
    #  TLG decoder
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
    recognizer_test_main \
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_1.jit.pdmodel \
-        --convert2PCM32=true \
+        --to_float32=true \
        --streaming_chunk=30 \
-        --params_path=$model_dir/avg_1.jit.pdiparams \
-        --word_symbol_table=$graph_dir/words.txt \
+        --param_path=$model_dir/avg_1.jit.pdiparams \
+        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+        --graph_path=$wfst/TLG.fst --max_active=7500 \
        --acoustic_scale=1.2 \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer

    cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer
    utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
-fi
-
+fi
\ No newline at end of file
--- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
@@ -115,7 +115,7 @@ int main(int argc, char* argv[]) {
                flag = feature_cache.Read(&features);
                feats.push_back(features);
                feature_rows += features.Dim() / feature_cache.Dim();
-            } while(flag == true && features.Dim() != 0);
+            } while (flag == true && features.Dim() != 0);
            sample_offset += cur_chunk_size;
        }


--- a/speechx/examples/ds2_ol/websocket/websocket_client.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@@ -14,9 +14,7 @@ fi
 # input
 mkdir -p data
 data=$PWD/data
-ckpt_dir=$data/model
-model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
-vocb_dir=$ckpt_dir/data/lang_char
+
 # output
 aishell_wav_scp=aishell_test.scp
 if [ ! -d $data/test ]; then
@@ -34,4 +32,4 @@ export GLOG_logtostderr=1

 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
\ No newline at end of file
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -19,12 +19,26 @@ ckpt_dir=$data/model
 model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
 vocb_dir=$ckpt_dir/data/lang_char/

+# output
+aishell_wav_scp=aishell_test.scp
+if [ ! -d $data/test ]; then
+    pushd $data
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+    unzip  aishell_test.zip
+    popd
+
+    realpath $data/test/*/*.wav > $data/wavlist
+    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+fi
+
+
 if [ ! -f $ckpt_dir/data/mean_std.json ]; then
-        mkdir -p $ckpt_dir
-        pushd $ckpt_dir
-        wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-        tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz 
-        popd
+    mkdir -p $ckpt_dir
+    pushd $ckpt_dir
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz 
+    popd
 fi

 export GLOG_logtostderr=1
@@ -49,9 +63,9 @@ websocket_server_main \
    --cmvn_file=$cmvn \
    --model_path=$model_dir/avg_1.jit.pdmodel \
    --streaming_chunk=0.1 \
-    --convert2PCM32=true \
+    --to_float32=true \
    --param_path=$model_dir/avg_1.jit.pdiparams \
-    --word_symbol_table=$data/wfst/words.txt \
+    --word_symbol_table=$wfst/words.txt \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --graph_path=$data/wfst/TLG.fst --max_active=7500 \
+    --graph_path=$wfst/TLG.fst --max_active=7500 \
    --acoustic_scale=1.2 
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -21,7 +21,7 @@

 DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
-DEFINE_bool(convert2PCM32, true, "audio convert to pcm32");
+DEFINE_bool(to_float32, true, "audio convert to pcm32");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
@@ -52,7 +52,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
    FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;
    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
-    opts.convert2PCM32 = FLAGS_convert2PCM32;
+    opts.to_float32 = FLAGS_to_float32;
    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.frame_length_ms = 20;
    frame_opts.frame_shift_ms = 10;

--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@@ -21,17 +21,18 @@ using kaldi::BaseFloat;
 using kaldi::VectorBase;
 using kaldi::Vector;

-AudioCache::AudioCache(int buffer_size, bool convert2PCM32)
+AudioCache::AudioCache(int buffer_size, bool to_float32)
    : finished_(false),
-      capacity_(buffer_size),
+      capacity_(buffer_size),  // unit: sample
      size_(0),
      offset_(0),
-      timeout_(1),
-      convert2PCM32_(convert2PCM32) {
+      timeout_(1),  // ms
+      to_float32_(to_float32) {
    ring_buffer_.resize(capacity_);
 }

 BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
+    // sample type int16, int16->float32
    return val * (1. / std::pow(2.0, 15));
 }

@@ -43,8 +44,7 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
    for (size_t idx = 0; idx < waves.Dim(); ++idx) {
        int32 buffer_idx = (idx + offset_ + size_) % ring_buffer_.size();
        ring_buffer_[buffer_idx] = waves(idx);
-        if (convert2PCM32_)
-            ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
+        if (to_float32_) ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
    }
    size_ += waves.Dim();
 }

--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@@ -24,7 +24,7 @@ namespace ppspeech {
 class AudioCache : public FrontendInterface {
  public:
    explicit AudioCache(int buffer_size = 1000 * kint16max,
-                        bool convert2PCM32 = true);
+                        bool to_float32 = true);

    virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);

@@ -50,14 +50,15 @@ class AudioCache : public FrontendInterface {
    kaldi::BaseFloat Convert2PCM32(kaldi::BaseFloat val);

    std::vector<kaldi::BaseFloat> ring_buffer_;
-    size_t offset_;    // offset in ring_buffer_
-    size_t size_;      // samples in ring_buffer_ now
-    size_t capacity_;  // capacity of ring_buffer_
+    size_t offset_;    // offset in ring_buffer_, begin of data
+    size_t size_;      // samples in ring_buffer_, size of valid data
+    size_t capacity_;  // capacity of ring_buffer_, full size of data buffer,
+                       // unit: sample
    bool finished_;    // reach audio end
    std::mutex mutex_;
    std::condition_variable ready_feed_condition_;
    kaldi::int32 timeout_;  // millisecond
-    bool convert2PCM32_;
+    bool to_float32_;

    DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };

--- a/speechx/speechx/frontend/audio/cmvn.cc
+++ b/speechx/speechx/frontend/audio/cmvn.cc
@@ -37,14 +37,17 @@ CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
 }

 void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    // feed waves/feats to compute feature
    base_extractor_->Accept(inputs);
    return;
 }

 bool CMVN::Read(kaldi::Vector<BaseFloat>* feats) {
+    // compute feature
    if (base_extractor_->Read(feats) == false || feats->Dim() == 0) {
        return false;
    }
+    // apply cmvn
    Compute(feats);
    return true;
 }

--- a/speechx/speechx/frontend/audio/data_cache.h
+++ b/speechx/speechx/frontend/audio/data_cache.h
@@ -27,6 +27,7 @@ class DataCache : public FrontendInterface {
  public:
    explicit DataCache() { finished_ = false; }

+    // accept waves/feats
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
        data_ = inputs;
    }

--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -15,23 +15,56 @@
 // wrap the fbank feat of kaldi, todo (SmileGoat)

 #include "kaldi/feat/feature-mfcc.h"
-
 #incldue "kaldi/matrix/kaldi-vector.h"

 namespace ppspeech {

-class FbankExtractor : FrontendInterface {
+// Options consumed by the Fbank frontend: the kaldi frame-extraction
+// settings plus the streaming chunk length.
+struct FbankOptions {
+    kaldi::FrameExtractionOptions frame_opts;
+    kaldi::BaseFloat streaming_chunk;  // second
+
+    // The constructor carries the struct's own name; members are listed in
+    // declaration order so the initializer list matches the actual
+    // initialization order.
+    FbankOptions() : frame_opts(), streaming_chunk(0.1) {}
+
+    // Registers "streaming-chunk" and then the frame-extraction options
+    // with the given option parser.
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("streaming-chunk",
+                       &streaming_chunk,
+                       "streaming chunk size, default: 0.1 sec");
+        frame_opts.Register(opts);
+    }
+};
+
+
+// Fbank frontend declaration. It owns an upstream FrontendInterface
+// (base_extractor_) and forwards SetFinished/IsFinished/Reset to it.
+// Accept/Read/Compute are only declared here.
+// Inheritance is public, like the other frontends (AudioCache, DataCache,
+// FeatureCache), so an instance can be held through a
+// std::unique_ptr<FrontendInterface>.
+class Fbank : public FrontendInterface {
   public:
-    explicit FbankExtractor(const FbankOptions& opts,
-                            share_ptr<FrontendInterface> pre_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
-    virtual size_t Dim() const = 0;
+    // Takes ownership of base_extractor; std:: is spelled out because a
+    // header cannot rely on a using-declaration from some .cc file.
+    explicit Fbank(const FbankOptions& opts,
+                   std::unique_ptr<FrontendInterface> base_extractor);
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // the dim_ is the dim of single frame feature
+    virtual size_t Dim() const { return dim_; }
+
+    // Forwards the end-of-input mark to the upstream extractor.
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+
+    // Reports the upstream extractor's finished state.
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    // Resets the upstream extractor and drops any buffered samples.
+    virtual void Reset() {
+        base_extractor_->Reset();
+        remained_wav_.Resize(0);
+    }

   private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& wave,
-                 kaldi::Vector<kaldi::BaseFloat>* feat) const;
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                 kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // kaldi::FeatureWindowFunction feature_window_funtion_;
+    // kaldi::BaseFloat hanning_window_energy_;
+    size_t dim_;
+    FbankOptions opts_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+    int chunk_sample_size_;
+    DISALLOW_COPY_AND_ASSIGN(Fbank);
 };

 }  // namespace ppspeech
\ No newline at end of file
--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@@ -28,11 +28,13 @@ FeatureCache::FeatureCache(FeatureCacheOptions opts,
    max_size_ = opts.max_size;
    frame_chunk_stride_ = opts.frame_chunk_stride;
    frame_chunk_size_ = opts.frame_chunk_size;
+    timeout_ = opts.timeout;  // ms
    base_extractor_ = std::move(base_extractor);
    dim_ = base_extractor_->Dim();
 }

 void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    // read inputs
    base_extractor_->Accept(inputs);
    // feed current data
    bool result = false;
@@ -49,9 +51,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    while (cache_.empty() && base_extractor_->IsFinished() == false) {
        // todo refactor: wait
        // ready_read_condition_.wait(lock);
-        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
-        // todo replace 1 with timeout_, 1 ms
-        if (elapsed > 1) {
+        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);  // ms
+        if (elapsed > timeout_) {
            return false;
        }
        usleep(100);  // sleep 0.1 ms
@@ -70,6 +71,8 @@ bool FeatureCache::Compute() {
    Vector<BaseFloat> feature;
    bool result = base_extractor_->Read(&feature);
    if (result == false || feature.Dim() == 0) return false;
+
+    // join with remained
    int32 joint_len = feature.Dim() + remained_feature_.Dim();
    int32 num_chunk =
        ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
@@ -82,6 +85,7 @@ bool FeatureCache::Compute() {

    for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
        int32 start = chunk_idx * frame_chunk_stride_ * dim_;
+
        Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
        SubVector<BaseFloat> tmp(joint_feature.Data() + start,
                                 frame_chunk_size_ * dim_);
@@ -89,6 +93,7 @@ bool FeatureCache::Compute() {

        std::unique_lock<std::mutex> lock(mutex_);
        while (cache_.size() >= max_size_) {
+            // cache full, wait
            ready_feed_condition_.wait(lock);
        }


--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@@ -23,8 +23,12 @@ struct FeatureCacheOptions {
    int32 max_size;
    int32 frame_chunk_size;
    int32 frame_chunk_stride;
+    int32 timeout;  // ms
    FeatureCacheOptions()
-        : max_size(kint16max), frame_chunk_size(1), frame_chunk_stride(1) {}
+        : max_size(kint16max),
+          frame_chunk_size(1),
+          frame_chunk_stride(1),
+          timeout(1) {}
 };

 class FeatureCache : public FrontendInterface {
@@ -64,14 +68,15 @@ class FeatureCache : public FrontendInterface {
    bool Compute();

    int32 dim_;
-    size_t max_size_;
-    int32 frame_chunk_size_;
-    int32 frame_chunk_stride_;
+    size_t max_size_;           // cache capacity
+    int32 frame_chunk_size_;    // window
+    int32 frame_chunk_stride_;  // stride
+    std::unique_ptr<FrontendInterface> base_extractor_;

+    kaldi::int32 timeout_;  // ms
    kaldi::Vector<kaldi::BaseFloat> remained_feature_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
+    std::queue<kaldi::Vector<BaseFloat>> cache_;  // feature cache
    std::mutex mutex_;
-    std::queue<kaldi::Vector<BaseFloat>> cache_;
    std::condition_variable ready_feed_condition_;
    std::condition_variable ready_read_condition_;


--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -20,7 +20,7 @@ using std::unique_ptr;

 FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
    unique_ptr<FrontendInterface> data_source(
-        new ppspeech::AudioCache(1000 * kint16max, opts.convert2PCM32));
+        new ppspeech::AudioCache(1000 * kint16max, opts.to_float32));

    unique_ptr<FrontendInterface> linear_spectrogram(
        new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,

--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -27,12 +27,12 @@ namespace ppspeech {

 struct FeaturePipelineOptions {
    std::string cmvn_file;
-    bool convert2PCM32;
+    bool to_float32;
    LinearSpectrogramOptions linear_spectrogram_opts;
    FeatureCacheOptions feature_cache_opts;
    FeaturePipelineOptions()
        : cmvn_file(""),
-          convert2PCM32(false),
+          to_float32(false),
          linear_spectrogram_opts(),
          feature_cache_opts() {}
 };