diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index b2bf1890a41c5d831d5fc777bbec7aea93e1eb0e..ae75956821189ac22c76218201d233360b587c16 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -81,8 +81,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.preemph_coeff = 0.0; opts.linear_spectrogram_opts.frame_opts = frame_opts; } - opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; - opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; + opts.dispenser_opts.frame_chunk_size = FLAGS_receptive_field_length; + opts.dispenser_opts.frame_chunk_stride = FLAGS_downsampling_rate; return opts; } diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index 0aec68faf5fc5f620a022f2c149d5a82fc290da8..ee7c05c4c14f8fb52820fb1b2b979ee85de464bd 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -8,6 +8,7 @@ add_library(frontend STATIC feature_cache.cc feature_pipeline.cc fbank.cc + dispenser.cc ) target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank) @@ -27,4 +28,4 @@ foreach(bin_name IN LISTS BINS) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog) -endforeach() \ No newline at end of file +endforeach() diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index 67683eebf6e2fb3f73b6a44f6c9ac682c6c5cda7..18024719b6c2bf85c28050dabc41a4fcacf7d085 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -64,10 +64,6 @@ int main(int argc, char* argv[]) { ppspeech::FeatureCacheOptions feat_cache_opts; // the feature cache output feature chunk by chunk. - // frame_chunk_size : num frame of a chunk. - // frame_chunk_stride: chunk sliding window stride. - feat_cache_opts.frame_chunk_stride = 1; - feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 943b74b89000a4a07f5bb161b45b154f4b687d31..cc7a5e17c8a16acbacb7581cb67f232d46260caa 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -66,10 +66,6 @@ int main(int argc, char* argv[]) { ppspeech::FeatureCacheOptions feat_cache_opts; // the feature cache output feature chunk by chunk. - // frame_chunk_size : num frame of a chunk. - // frame_chunk_stride: chunk sliding window stride. - feat_cache_opts.frame_chunk_stride = 1; - feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); LOG(INFO) << "feat dim: " << feature_cache.Dim(); diff --git a/speechx/speechx/frontend/audio/dispenser.cc b/speechx/speechx/frontend/audio/dispenser.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e8cdc6f6d5d453e88596d88daf7ad2342c44d4a --- /dev/null +++ b/speechx/speechx/frontend/audio/dispenser.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "frontend/audio/dispenser.h" + +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::unique_ptr; + +Dispenser::Dispenser(DispenserOptions opts, + unique_ptr base_extractor) { + frame_chunk_stride_ = opts.frame_chunk_stride; + frame_chunk_size_ = opts.frame_chunk_size; + base_extractor_ = std::move(base_extractor); + dim_ = base_extractor_->Dim(); +} + +void Dispenser::Accept(const kaldi::VectorBase& inputs) { + // read inputs + base_extractor_->Accept(inputs); +} + +// pop feature chunk +bool Dispenser::Read(kaldi::Vector* feats) { + feats->Resize(dim_ * frame_chunk_size_); + bool result = Compute(feats); + return result; +} + +// read all data from base_feature_extractor_ into cache_ +bool Dispenser::Compute(Vector* feats) { + // compute and feed + bool result = false; + while (feature_cache_.size() < frame_chunk_size_) { + Vector feature; + result = base_extractor_->Read(&feature); + if (result == false || feature.Dim() == 0) return false; + feature_cache_.push(feature); + } + + int32 counter = 0; + int32 cache_size = frame_chunk_size_ - frame_chunk_stride_; + int32 elem_dim = base_extractor_->Dim(); + while (counter < frame_chunk_size_) { + Vector& val = feature_cache_.front(); + int32 start = counter * elem_dim; + feats->Range(start, elem_dim).CopyFromVec(val); + if (frame_chunk_size_ - counter <= cache_size ) { + feature_cache_.push(val); + } + feature_cache_.pop(); + counter++; + } + + return result; +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/dispenser.h b/speechx/speechx/frontend/audio/dispenser.h new file mode 100644 index 0000000000000000000000000000000000000000..89d9c977b86c554c9361542f11884978249f2110 --- /dev/null +++ b/speechx/speechx/frontend/audio/dispenser.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/common.h" +#include "frontend/audio/frontend_itf.h" + +namespace ppspeech { + +struct DispenserOptions { + int32 frame_chunk_size; + int32 frame_chunk_stride; + + DispenserOptions() + : frame_chunk_size(1), + frame_chunk_stride(1) {} +}; + +class Dispenser : public FrontendInterface { + public: + explicit Dispenser( + DispenserOptions opts, + std::unique_ptr base_extractor = NULL); + + // Feed feats or waves + virtual void Accept(const kaldi::VectorBase& inputs); + + // feats size = num_frames * feat_dim + virtual bool Read(kaldi::Vector* feats); + + // feat dim + virtual size_t Dim() const { return dim_; } + + virtual void SetFinished() { + base_extractor_->SetFinished(); + } + + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + + virtual void Reset() { + base_extractor_->Reset(); + } + + private: + bool Compute(kaldi::Vector* feats); + + int32 dim_; + int32 frame_chunk_size_; // window + int32 frame_chunk_stride_; // stride + std::queue> feature_cache_; + std::unique_ptr base_extractor_; + DISALLOW_COPY_AND_ASSIGN(Dispenser); +}; + +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index 05283bb7e51e863759db8b728ce54d5448c90a88..930f29c5497efec451ff8c6bfa608b2046c4ccb8 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -26,8 +26,6 @@ using std::unique_ptr; FeatureCache::FeatureCache(FeatureCacheOptions opts, unique_ptr base_extractor) { max_size_ = opts.max_size; - frame_chunk_stride_ = opts.frame_chunk_stride; - frame_chunk_size_ = opts.frame_chunk_size; timeout_ = opts.timeout; // ms base_extractor_ = std::move(base_extractor); dim_ = base_extractor_->Dim(); @@ -74,24 +72,11 @@ bool FeatureCache::Compute() { bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) return false; - // join with remained - int32 joint_len = feature.Dim() + remained_feature_.Dim(); - Vector joint_feature(joint_len); - joint_feature.Range(0, remained_feature_.Dim()) - .CopyFromVec(remained_feature_); - joint_feature.Range(remained_feature_.Dim(), feature.Dim()) - .CopyFromVec(feature); - - // one by one, or stride with window - // controlled by frame_chunk_stride_ and frame_chunk_size_ - int32 num_chunk = - ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1; + int32 num_chunk = feature.Dim() / dim_ ; for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { - int32 start = chunk_idx * frame_chunk_stride_ * dim_; - - Vector feature_chunk(frame_chunk_size_ * dim_); - SubVector tmp(joint_feature.Data() + start, - frame_chunk_size_ * dim_); + int32 start = chunk_idx * dim_; + Vector feature_chunk(dim_); + SubVector tmp(feature.Data() + start, dim_); feature_chunk.CopyFromVec(tmp); std::unique_lock lock(mutex_); @@ -104,13 +89,6 @@ bool FeatureCache::Compute() { cache_.push(feature_chunk); ready_read_condition_.notify_one(); } - - // cache remained feats - int32 remained_feature_len = - joint_len - num_chunk * frame_chunk_stride_ * dim_; - remained_feature_.Resize(remained_feature_len); - remained_feature_.CopyFromVec(joint_feature.Range( - frame_chunk_stride_ * num_chunk * dim_, remained_feature_len)); return result; } diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index 0dc704bbff9c268652d311571d33218b902b01cc..4c016056a8379d88742334eee2e071528be39e15 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -21,13 +21,9 @@ namespace ppspeech { struct FeatureCacheOptions { int32 max_size; - int32 frame_chunk_size; - int32 frame_chunk_stride; int32 timeout; // ms FeatureCacheOptions() : max_size(kint16max), - frame_chunk_size(1), - frame_chunk_stride(1), timeout(1) {} }; @@ -80,7 +76,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - // DISALLOW_COPY_AND_ASSGIN(FeatureCache); + DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 087de0f0d14cc7c760389b4fb69ad216e0b77db9..026905f06451c65da0dfaaa2f8f7421311b0aa6f 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -35,8 +35,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); - base_extractor_.reset( + unique_ptr cache( new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn))); + + base_extractor_.reset( + new ppspeech::Dispenser(opts.dispenser_opts, std::move(cache))); } } // ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 6b9b4795e5431b20116a1e7cbed59415e8f1e0c7..9f86c634c4f1e270f1a9efb509acaca029345a41 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -23,6 +23,7 @@ #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +#include "frontend/audio/dispenser.h" namespace ppspeech { @@ -33,13 +34,16 @@ struct FeaturePipelineOptions { LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; + DispenserOptions dispenser_opts; + FeaturePipelineOptions() : cmvn_file(""), to_float32(false), // true, only for linear feature use_fbank(true), linear_spectrogram_opts(), fbank_opts(), - feature_cache_opts() {} + feature_cache_opts(), + dispenser_opts() {} }; class FeaturePipeline : public FrontendInterface {