diff --git a/audio/CMakeLists.txt b/audio/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..281148ade17d9264793f2ca0d3bdc54b8afd2fd0
--- /dev/null
+++ b/audio/CMakeLists.txt
@@ -0,0 +1,37 @@
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(paddleaudio VERSION 0.1)
+
+# Directory that holds this project's CMake helper modules
+set(paddleaudio_cmake_dir ${PROJECT_SOURCE_DIR}/cmake)
+
+# Let include() find the helper modules in cmake/ and cmake/external/
+list(APPEND CMAKE_MODULE_PATH ${paddleaudio_cmake_dir}/external)
+list(APPEND CMAKE_MODULE_PATH ${paddleaudio_cmake_dir})
+include(FetchContent)
+include(ExternalProject)
+
+# FetchContent populates its content under <CMAKE_SOURCE_DIR>/fc_patch, with progress output enabled
+set(FETCHCONTENT_QUIET off)
+get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
+set(FETCHCONTENT_BASE_DIR ${fc_patch})
+# NOTE(review): -O0 and -g are appended unconditionally, so every build type is an unoptimized debug build
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g")
+
+# see third_party/kaldi/base/kaldi-types.h
+# compile kaldi without openfst
+add_definitions("-DCOMPILE_WITHOUT_OPENFST")
+
+include(openblas)
+include(pybind)
+#set(fc_patch /paddle/mnt/zhouyang/wks2/PaddleSpeech/paddleaudio/fc_patch)
+#set(OpenBLAS_INSTALL_PREFIX ${fc_patch}/openblas-install)
+#link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
+#include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
+# NOTE(review): the python3.7m include path below is hard-coded to one system layout
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/third_party/kaldi)
+#include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/pybind11/include)
+include_directories(/usr/include/python3.7m)
+add_subdirectory(third_party)
+add_subdirectory(kaldi_frontend)
diff --git a/audio/kaldi_frontend/CMakeLists.txt b/audio/kaldi_frontend/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..627416023eda4fae4c303348f473da25d7c9b4e8
--- /dev/null
+++ b/audio/kaldi_frontend/CMakeLists.txt
@@ -0,0 +1,20 @@
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+)
+# Library holding the fbank wrapper and the functions exposed to Python
+add_library(kaldi_feature
+    kaldi_feature.cc
+    kaldi_feature_wrapper.cc
+)
+target_link_libraries(kaldi_feature kaldi-fbank)
+# Python extension module compiled from the same two sources as kaldi_feature
+pybind11_add_module(kaldi_featurepy kaldi_feature.cc kaldi_feature_wrapper.cc)
+target_link_libraries(kaldi_featurepy PRIVATE kaldi_feature)
+# Executable built from kaldi_feature_main.cc; NOTE(review): the python3.7m library name is hard-coded
+set(bin_name kaldi_feature_main)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${bin_name} PUBLIC kaldi_feature python3.7m)
diff --git a/audio/kaldi_frontend/feature_common.h b/audio/kaldi_frontend/feature_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..900effc2823214dcdaad2fdfd428c3c75b20b9b9
--- /dev/null
+++ b/audio/kaldi_frontend/feature_common.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "feat/feature-window.h"
+#include
+#include
+
+namespace paddleaudio {
+
+namespace py = pybind11;
+// Streaming wrapper around a feature computer F; samples left over after the last frame are cached between calls.
+template
+class StreamingFeatureTpl {
+  public:
+    typedef typename F::Options Options;
+    StreamingFeatureTpl(const Options& opts);
+    bool ComputeFeature(const kaldi::VectorBase& wav,
+                        kaldi::Vector* feats);  // feats is resized to num_frames * Dim() values
+    void Reset() {
+        remained_wav_.Resize(0);  // forget the samples cached by earlier calls
+    }
+    // Size of one frame's feature vector, as reported by the computer.
+    int Dim() {
+        return computer_.Dim();
+    }
+
+  private:
+    bool Compute(const kaldi::Vector& waves,
+                 kaldi::Vector* feats);
+    Options opts_;
+    kaldi::FeatureWindowFunction window_function_;  // declared before computer_, so constructed first
+    kaldi::Vector remained_wav_;  // tail samples kept for the next ComputeFeature call
+    F computer_;
+};
+
+} // namespace paddleaudio
+
+#include "kaldi_frontend/feature_common_inl.h"  // definitions of the template members
+
diff --git a/audio/kaldi_frontend/feature_common_inl.h b/audio/kaldi_frontend/feature_common_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..db45b26a25c11f720c0e1e7891e6541234c330f0
--- /dev/null
+++ b/audio/kaldi_frontend/feature_common_inl.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "base/kaldi-common.h" + +namespace paddleaudio { + +template +StreamingFeatureTpl::StreamingFeatureTpl( + const Options& opts) + : opts_(opts), computer_(opts), + window_function_(opts.frame_opts) { + //window_function_(computer_.GetFrameOptions()) { the opt set to zero +} + +template +bool StreamingFeatureTpl::ComputeFeature(const kaldi::VectorBase& wav, + kaldi::Vector* feats) { + // append remaned waves + kaldi::int32 wav_len = wav.Dim(); + if (wav_len == 0) return false; + kaldi::int32 left_len = remained_wav_.Dim(); + kaldi::Vector waves(left_len + wav_len); + waves.Range(0, left_len).CopyFromVec(remained_wav_); + waves.Range(left_len, wav_len).CopyFromVec(wav); + + // cache remaned waves + kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); + kaldi::int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts); + kaldi::int32 frame_shift = frame_opts.WindowShift(); + kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames; + remained_wav_.Resize(left_samples); + remained_wav_.CopyFromVec( + waves.Range(frame_shift * num_frames, left_samples)); + + // compute speech feature + Compute(waves, feats); + return true; +} + +// Compute feat +template +bool StreamingFeatureTpl::Compute( + const kaldi::Vector& waves, + kaldi::Vector* feats) { + kaldi::BaseFloat vtln_warp = 1.0; + const kaldi::FrameExtractionOptions& frame_opts = + computer_.GetFrameOptions(); + kaldi::int32 num_samples = waves.Dim(); + kaldi::int32 frame_length = frame_opts.WindowSize(); + kaldi::int32 sample_rate = frame_opts.samp_freq; + if (num_samples < frame_length) { + return false; + } + + kaldi::int32 num_frames = kaldi::NumFrames(num_samples, frame_opts); + feats->Resize(num_frames * Dim()); + + kaldi::Vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + for (kaldi::int32 frame = 0; frame < num_frames; frame++) { + kaldi::BaseFloat raw_log_energy = 0.0; + kaldi::ExtractWindow(0, + waves, + frame, + frame_opts, + 
window_function_, + &window, + need_raw_log_energy ? &raw_log_energy : NULL); + + kaldi::Vector this_feature(computer_.Dim(), + kaldi::kUndefined); + computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature); + kaldi::SubVector output_row( + feats->Data() + frame * Dim(), Dim()); + output_row.CopyFromVec(this_feature); + } + return true; +} + +} // namespace paddleaudio diff --git a/audio/kaldi_frontend/kaldi_feature.cc b/audio/kaldi_frontend/kaldi_feature.cc new file mode 100644 index 0000000000000000000000000000000000000000..21e71d8d0eb90240436a018443cea3f307b7d071 --- /dev/null +++ b/audio/kaldi_frontend/kaldi_feature.cc @@ -0,0 +1,143 @@ + +#include +#include + +#include "kaldi_feature_wrapper.h" + +namespace py=pybind11; + +bool InitFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. Hamming window + bool round_to_power_of_two, + float blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power) { + kaldi::FbankOptions opts; + opts.frame_opts.samp_freq = samp_freq; // frame opts + opts.frame_opts.frame_shift_ms = frame_shift_ms; + opts.frame_opts.frame_length_ms = frame_length_ms; + opts.frame_opts.dither = dither; + opts.frame_opts.preemph_coeff = preemph_coeff; + opts.frame_opts.remove_dc_offset = remove_dc_offset; + opts.frame_opts.window_type = window_type; + opts.frame_opts.round_to_power_of_two = round_to_power_of_two; + opts.frame_opts.blackman_coeff = blackman_coeff; + opts.frame_opts.snip_edges = snip_edges; + opts.frame_opts.allow_downsample = allow_downsample; + opts.frame_opts.allow_upsample 
= allow_upsample; + opts.frame_opts.max_feature_vectors = max_feature_vectors; + + opts.mel_opts.num_bins = num_bins; // mel opts + opts.mel_opts.low_freq = low_freq; + opts.mel_opts.high_freq = high_freq; + opts.mel_opts.vtln_low = vtln_low; + opts.mel_opts.vtln_high = vtln_high; + opts.mel_opts.debug_mel = debug_mel; + opts.mel_opts.htk_mode = htk_mode; + + opts.use_energy = use_energy; // fbank opts + opts.energy_floor = energy_floor; + opts.raw_energy = raw_energy; + opts.htk_compat = htk_compat; + opts.use_log_fbank = use_log_fbank; + opts.use_power = use_power; + paddleaudio::KaldiFeatureWrapper::GetInstance()->InitFbank(opts); + return true; +} + +py::array_t ComputeFbankStreaming(const py::array_t& wav) { + return paddleaudio::KaldiFeatureWrapper::GetInstance()->ComputeFbank(wav); +} + +py::array_t ComputeFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. Hamming window + bool round_to_power_of_two, + float blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power, + const py::array_t& wav) { + InitFbank(samp_freq, // frame opts + frame_shift_ms, + frame_length_ms, + dither, + preemph_coeff, + remove_dc_offset, + window_type, // e.g. 
Hamming window + round_to_power_of_two, + blackman_coeff, + snip_edges, + allow_downsample, + allow_upsample, + max_feature_vectors, + num_bins, // mel opts + low_freq, + high_freq, + vtln_low, + vtln_high, + debug_mel, + htk_mode, + use_energy, // fbank opts + energy_floor, + raw_energy, + htk_compat, + use_log_fbank, + use_power); + py::array_t result = ComputeFbankStreaming(wav); + paddleaudio::KaldiFeatureWrapper::GetInstance()->ResetFbank(); + return result; +} + + +void ResetFbank() { + paddleaudio::KaldiFeatureWrapper::GetInstance()->ResetFbank(); +} + +PYBIND11_MODULE(kaldi_featurepy, m) { + m.doc() = "kaldi_feature example"; + m.def("InitFbank", &InitFbank, "init fbank"); + m.def("ResetFbank", &ResetFbank, "reset fbank"); + m.def("ComputeFbank", &ComputeFbank, "compute fbank"); + m.def("ComputeFbankStreaming", &ComputeFbankStreaming, "compute fbank streaming"); +} diff --git a/audio/kaldi_frontend/kaldi_feature.h b/audio/kaldi_frontend/kaldi_feature.h new file mode 100644 index 0000000000000000000000000000000000000000..b24416b18be0524ee3b7180de7c61072fe458c6e --- /dev/null +++ b/audio/kaldi_frontend/kaldi_feature.h @@ -0,0 +1,71 @@ +#include +#include + +#include "kaldi_feature_wrapper.h" + +namespace py=pybind11; + +bool InitFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. 
Hamming window + bool round_to_power_of_two, + float blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power); + +py::array_t ComputeFbank( + float samp_freq, // frame opts + float frame_shift_ms, + float frame_length_ms, + float dither, + float preemph_coeff, + bool remove_dc_offset, + std::string window_type, // e.g. Hamming window + bool round_to_power_of_two, + kaldi::BaseFloat blackman_coeff, + bool snip_edges, + bool allow_downsample, + bool allow_upsample, + int max_feature_vectors, + int num_bins, // mel opts + float low_freq, + float high_freq, + float vtln_low, + float vtln_high, + bool debug_mel, + bool htk_mode, + bool use_energy, // fbank opts + float energy_floor, + bool raw_energy, + bool htk_compat, + bool use_log_fbank, + bool use_power, + const py::array_t& wav); + +py::array_t ComputeFbankStreaming(const py::array_t& wav); + +void ResetFbank(); + +py::array_t ComputeFbankStreaming(const py::array_t& wav); + +py::array_t TestFun(const py::array_t& wav); diff --git a/audio/kaldi_frontend/kaldi_feature_wrapper.cc b/audio/kaldi_frontend/kaldi_feature_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..a965fd6a2acb292001e2d8ca6ee2c9252179aa4c --- /dev/null +++ b/audio/kaldi_frontend/kaldi_feature_wrapper.cc @@ -0,0 +1,57 @@ +#include "kaldi_feature_wrapper.h" + +namespace paddleaudio { + +KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() { + static KaldiFeatureWrapper instance; + return &instance; +} + +bool KaldiFeatureWrapper::InitFbank(kaldi::FbankOptions opts) { + fbank_.reset(new Fbank(opts)); + return true; +} + +py::array_t KaldiFeatureWrapper::ComputeFbank(const py::array_t wav) { + 
py::buffer_info info = wav.request(); + kaldi::Vector input_wav(info.size); + double* wav_ptr = (double*)info.ptr; + for (int idx = 0; idx < info.size; ++idx) { + input_wav(idx) = *wav_ptr; + wav_ptr++; + } + + + kaldi::Vector feats; + bool flag = fbank_->ComputeFeature(input_wav, &feats); + if (flag == false || feats.Dim() == 0) return py::array_t(); + auto result = py::array_t(feats.Dim()); + py::buffer_info xs = result.request(); + for (int idx = 0; idx < 10; ++idx) { + float val = feats(idx); + std::cout << val << " "; + } + std::cout << std::endl; + double* res_ptr = (double*)xs.ptr; + for (int idx = 0; idx < feats.Dim(); ++idx) { + *res_ptr = feats(idx); + res_ptr++; + } + + return result.reshape({ feats.Dim() / Dim(), Dim()}); +/* + py::buffer_info info = wav.request(); + std::cout << info.size << std::endl; + auto result = py::array_t(info.size); + //kaldi::Vector input_wav(info.size); + kaldi::Vector input_wav(info.size); + py::buffer_info info_re = result.request(); + + memcpy(input_wav.Data(), (double*)info.ptr, wav.nbytes()); + memcpy((double*)info_re.ptr, input_wav.Data(), input_wav.Dim()* sizeof(double)); + return result; +*/ +} + + +} // namespace paddleaudio diff --git a/audio/kaldi_frontend/kaldi_feature_wrapper.h b/audio/kaldi_frontend/kaldi_feature_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..561b5813243e30ce1cfcf2541757fd243aff573b --- /dev/null +++ b/audio/kaldi_frontend/kaldi_feature_wrapper.h @@ -0,0 +1,28 @@ + +#include "base/kaldi-common.h" +#include "kaldi_frontend/feature_common.h" +#include "feat/feature-fbank.h" + +#pragma once + +namespace paddleaudio { + +typedef StreamingFeatureTpl Fbank; + +class KaldiFeatureWrapper { + public: + static KaldiFeatureWrapper* GetInstance(); + bool InitFbank(kaldi::FbankOptions opts); + py::array_t ComputeFbank(const py::array_t wav); + int Dim() { + return fbank_->Dim(); + } + void ResetFbank() { + fbank_->Reset(); + } + + private: + std::unique_ptr fbank_; +}; + +} 
+// namespace paddleaudio
\ No newline at end of file
diff --git a/audio/third_party/CMakeLists.txt b/audio/third_party/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..089ce541ce050caafe6981b9d87ae89daced7882
--- /dev/null
+++ b/audio/third_party/CMakeLists.txt
@@ -0,0 +1,8 @@
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+${CMAKE_CURRENT_SOURCE_DIR}/kaldi
+)
+add_subdirectory(kaldi)
diff --git a/audio/third_party/kaldi/CMakeLists.txt b/audio/third_party/kaldi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..908b2c041acf4f52ebf0b0e5b2dc55b41da9dbf5
--- /dev/null
+++ b/audio/third_party/kaldi/CMakeLists.txt
@@ -0,0 +1,66 @@
+project(kaldi)
+# One library target per Kaldi component: base, util, mfcc, fbank, feat-common and matrix
+
+add_library(kaldi-base
+    base/io-funcs.cc
+    base/kaldi-error.cc
+    base/kaldi-math.cc
+    base/kaldi-utils.cc
+    base/timer.cc
+)
+
+add_library(kaldi-util
+    util/kaldi-holder.cc
+    util/kaldi-io.cc
+    util/kaldi-semaphore.cc
+    util/kaldi-table.cc
+    util/kaldi-thread.cc
+    util/parse-options.cc
+    util/simple-io-funcs.cc
+    util/simple-options.cc
+    util/text-utils.cc
+)
+target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
+
+add_library(kaldi-mfcc
+    feat/feature-mfcc.cc
+)
+target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
+
+add_library(kaldi-fbank
+    feat/feature-fbank.cc
+)
+target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
+
+add_library(kaldi-feat-common
+    feat/wave-reader.cc
+    feat/signal.cc
+    feat/feature-functions.cc
+    feat/feature-window.cc
+    feat/resample.cc
+    feat/mel-computations.cc
+    feat/cmvn.cc
+)
+target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
+
+add_library(kaldi-matrix
+    matrix/compressed-matrix.cc
+    matrix/kaldi-matrix.cc
+    matrix/kaldi-vector.cc
+    matrix/matrix-functions.cc
+    matrix/optimization.cc
+    matrix/packed-matrix.cc
+    matrix/qr.cc
+    matrix/sparse-matrix.cc
+    matrix/sp-matrix.cc
+    matrix/srfft.cc
+    matrix/tp-matrix.cc
+)
+target_link_libraries(kaldi-matrix gfortran kaldi-base libopenblas.a)
+# NOTE(review): gfortran and libopenblas.a are linked by bare name above - presumably supplied by the openblas module; confirm
+
+
+#add_subdirectory(base)
+#add_subdirectory(util)
+#add_subdirectory(feat)
+#add_subdirectory(matrix)
diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h
index 4fa8f224bbab1010b69930c704fbb4d5a6151409..23ef4c4c57bc072a365d81bea1eee82dc2c36dcf 100644
--- a/speechx/speechx/kaldi/base/kaldi-types.h
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
@@ -42,6 +42,8 @@ typedef float BaseFloat;
 
 // for discussion on what to do if you need compile kaldi
 // without OpenFST, see the bottom of this this file
+#ifndef COMPILE_WITHOUT_OPENFST
+
 #include
 
 namespace kaldi {
@@ -55,9 +57,10 @@ namespace kaldi {
   typedef double double64;
 } // end namespace kaldi
 
+#else
 // In a theoretical case you decide compile Kaldi without the OpenFST
 // comment the previous namespace statement and uncomment the following
-/*
+
 namespace kaldi {
   typedef int8_t int8;
   typedef int16_t int16;
@@ -71,6 +74,6 @@ namespace kaldi {
   typedef float float32;
   typedef double double64;
 } // end namespace kaldi
-*/
+#endif
 
 #endif // KALDI_BASE_KALDI_TYPES_H_