diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 474642624e0a138764ec3eae5a5ced1e4cd57076..c3a17f49167fc31ff3fcebd1ff8d17a6a3f3d2a0 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -42,6 +42,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ + --debug True \ --ngpu ${ngpu} \ --config ${config_path} \ --decode_cfg ${decode_config_path} \ diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 2e067ab6b70a7a1b689321097e66a4b70e58322e..67ef2e53ca90ec47db29af92dcf6d9d9108279b4 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -16,6 +16,8 @@ import os import sys from pathlib import Path +import distutils.util +import numpy as np import paddle import soundfile from yacs.config import CfgNode @@ -74,6 +76,8 @@ class U2Infer(): # fbank feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + if self.args.debug: + np.savetxt("feat.transform.txt", feat) ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) @@ -125,6 +129,11 @@ if __name__ == "__main__": "--result_file", type=str, help="path of save the asr result") parser.add_argument( "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--debug", + type=distutils.util.strtobool, + default=False, + help="for debug.") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/speechx/.clang-format b/speechx/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..af946a4a90447acdb94b66143ce3ee47fd6e4043 --- /dev/null +++ b/speechx/.clang-format @@ -0,0 +1,29 @@ +# This file is used by clang-format to autoformat paddle source code +# +# The clang-format is part of llvm toolchain. 
+# It need to install llvm and clang to format source code style. +# +# The basic usage is, +# clang-format -i -style=file PATH/TO/SOURCE/CODE +# +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. +# The -i means inplace change. +# +# The document of clang-format is +# http://clang.llvm.org/docs/ClangFormat.html +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google +IndentWidth: 4 +TabWidth: 4 +ContinuationIndentWidth: 4 +MaxEmptyLinesToKeep: 2 +AccessModifierOffset: -2 # The private/protected/public has no indent in class +Standard: Cpp11 +AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false +... + diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 8307d992021acf7ecfe0b81972c68e69165265e2..17e64c04a303172cd516f507c78b23932dff7ff2 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -31,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall ############################################################################### # Option Configurations ############################################################################### -# option configurations option(TEST_DEBUG "option for debug" OFF) +option(USE_PROFILING "enable c++ profling" OFF) +option(USING_U2 "compile u2 model." ON) +option(USING_DS2 "compile with ds2 model." ON) + +option(USING_GPU "u2 compute on GPU." 
OFF) ############################################################################### # Include third party @@ -85,6 +89,41 @@ add_dependencies(openfst gflags glog) include(paddleinference) +# paddle core.so +find_package(Threads REQUIRED) +find_package(PythonLibs REQUIRED) +find_package(Python3 REQUIRED) +find_package(pybind11 CONFIG) + +message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}") +message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") +message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") + +# paddle include and link option +execute_process( + COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')" + OUTPUT_VARIABLE PADDLE_LINK_FLAGS + RESULT_VARIABLE SUCESS) + +message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) +string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) + +# paddle compile option +execute_process( + COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_compile_flags()), end='')" + OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) +message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) +string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) + + +# for LD_LIBRARY_PATH +# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) +execute_process( + COMMAND python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')" + OUTPUT_VARIABLE PADDLE_LIB_DIRS) +message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) + + ############################################################################### # Add local library ############################################################################### diff --git a/speechx/README.md b/speechx/README.md index cd1cd62c154c28e007d5b20c8db77fa712f1e071..cc7b13e6a4d882404a8129539b862ad55fc36ae3 100644 --- 
a/speechx/README.md +++ b/speechx/README.md @@ -3,11 +3,14 @@ ## Environment We develop under: +* python - 3.7 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7` * os - Ubuntu 16.04.7 LTS * gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 +> Please use `tools/venv.sh` to create a python `venv`, then `source venv/bin/activate` to build speechx. + > We make sure all things work fun under docker, and recommend using it to develop and deploy. * [How to Install Docker](https://docs.docker.com/engine/install/) @@ -24,13 +27,16 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). +2. Create python environment. -2. Build `speechx` and `examples`. +``` +bash tools/venv.sh +``` -> Do not source venv. +3. Build `speechx` and `examples`. ``` -pushd /path/to/speechx +source venv/bin/activate ./build.sh ``` diff --git a/speechx/cmake/gflags.cmake b/speechx/cmake/gflags.cmake index 66ae47f70980dc06368ac0e4dd269a2c84ba3db0..36bebc8779fbd89636fce45d71d75ca201d29c0f 100644 --- a/speechx/cmake/gflags.cmake +++ b/speechx/cmake/gflags.cmake @@ -2,10 +2,9 @@ include(FetchContent) FetchContent_Declare( gflags - URL https://github.com/gflags/gflags/archive/v2.2.1.zip - URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a + URL https://github.com/gflags/gflags/archive/v2.2.2.zip + URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 ) - FetchContent_MakeAvailable(gflags) # openfst need diff --git a/speechx/cmake/gtest.cmake b/speechx/cmake/gtest.cmake index 7fe397fcb081c47f2c9b0711a50ebfcd8b4b462a..1ea8ed0b763049b09e773ee406bc0075d7021526 100644 --- a/speechx/cmake/gtest.cmake +++ b/speechx/cmake/gtest.cmake @@ -1,8 +1,8 @@ include(FetchContent) FetchContent_Declare( gtest - URL 
https://github.com/google/googletest/archive/release-1.10.0.zip - URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 + URL https://github.com/google/googletest/archive/release-1.11.0.zip + URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a ) FetchContent_MakeAvailable(gtest) diff --git a/speechx/examples/codelab/feat/.gitignore b/speechx/examples/codelab/feat/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..bbd86a25b018bc611bf6ae52cbb6afa5f60bce62 --- /dev/null +++ b/speechx/examples/codelab/feat/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh index 3b89d01e9ec0bfea521388957ef0d73521b5db30..9d22917439cab4963e16903d30fce6e991f13e76 100644 --- a/speechx/examples/codelab/feat/path.sh +++ b/speechx/examples/codelab/feat/path.sh @@ -1,12 +1,12 @@ # This contains the locations of binarys build required for running the examples. SPEECHX_ROOT=$PWD/../../../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh index 1fa37f981b4942c449d4a779b0de8d0de71b9c4a..66bd8ae203ff7b1e33c101c418a220749924bd80 100755 --- a/speechx/examples/codelab/feat/run.sh +++ b/speechx/examples/codelab/feat/run.sh @@ -54,4 +54,10 @@ compute_linear_spectrogram_main \ --cmvn_file=$exp_dir/cmvn.ark echo "compute linear spectrogram feature." 
+compute_fbank_main \ + --num_bins 161 \ + --wav_rspecifier=scp:$data_dir/wav.scp \ + --feature_wspecifier=ark,t:$exp_dir/fbank.ark \ + --cmvn_file=$exp_dir/cmvn.ark +echo "compute fbank feature." diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh index 7d395d648348651f4a20694b35c3105d9112d392..11c8aef8b6de5e4b8f5e598c56fac51018c064c5 100644 --- a/speechx/examples/codelab/nnet/path.sh +++ b/speechx/examples/codelab/nnet/path.sh @@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/u2nnet/.gitignore b/speechx/examples/codelab/u2nnet/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6fe69bcdb3bd877db3c8fecce1d90cd90d0f900 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/.gitignore @@ -0,0 +1,3 @@ +data +exp +*log diff --git a/speechx/examples/codelab/u2nnet/README.md b/speechx/examples/codelab/u2nnet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..772a58f0ea22ecf6a635ee76707b9a74527be5ef --- /dev/null +++ b/speechx/examples/codelab/u2nnet/README.md @@ -0,0 +1,3 @@ +# U2 Streaming NNet Test + +Used for the u2 streaming nnet inference test. diff --git a/speechx/examples/codelab/u2nnet/path.sh b/speechx/examples/codelab/u2nnet/path.sh new file mode 100644 index 0000000000000000000000000000000000000000..564e9fed147e04d4655909ad0137e1160c38ade8 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/path.sh @@ -0,0 +1,19 @@ +# This contains the locations of the built binaries required for running the examples. 
+ +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } + +export LC_ALL=C + +SPEECHX_BIN=$SPEECHX_BUILD/nnet +export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN + +PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2nnet/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..b309bc6f2bf660e4a6c828447a0d2a00b73dab36 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/run.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +# 1. compile (only when the build dir defined in path.sh is missing) +if [ ! -d "${SPEECHX_BUILD}" ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# 2. download model +if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! -f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false +echo "convert json cmvn to kaldi ark." 
+ +compute_fbank_main \ + --num_bins 80 \ + --wav_rspecifier=scp:$data/wav.scp \ + --cmvn_file=$exp/cmvn.ark \ + --feature_wspecifier=ark,t:$exp/fbank.ark +echo "compute fbank feature." + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_prob_wspecifier=ark,t:$exp/probs.ark diff --git a/speechx/examples/codelab/u2nnet/valgrind.sh b/speechx/examples/codelab/u2nnet/valgrind.sh new file mode 100755 index 0000000000000000000000000000000000000000..a5aab6637d1e25aed5b6825e9277fad556644a48 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/valgrind.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# this script is for memory check, so please run ./run.sh first. + +set +x +set -e + +. ./path.sh + +if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then + echo "please install valgrind in the speechx tools dir.\n" + exit 1 +fi + +ckpt_dir=./data/model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ + +valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \ + ds2_model_test_main \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdparams diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce01a8fc7f7fe499445bb4e352d2dd96025afb14 --- /dev/null +++ b/speechx/examples/u2pp_ol/README.md @@ -0,0 +1,5 @@ +# U2/U2++ Streaming ASR + +## Examples + +* `wenetspeech` - Streaming Decoding using wenetspeech u2/u2++ model. Using aishell test data for testing. 
diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index dfb14885326051a237f8f44b63f9319d7f296ffc..90fc96a18387d26ae3c95cd7ac4650de26cfa561 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc index b0616a7de3f1cad4aca75d9511c1cc5a6e6d5e9a..c891827a14e3b9a9e28dc244a5e8403cbabed919 100644 --- a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc +++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc @@ -17,7 +17,7 @@ int main(int argc, char* argv[]) { // Initialize Google’s logging library. google::InitGoogleLogging(argv[0]); - + google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; LOG(INFO) << "Found " << 10 << " cookies"; diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 283466dc1c4dc871947b51a9108a5328eb2959ba..7d99e8571091961e3839d2932d963f58a7bc09f0 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -195,8 +195,11 @@ void model_forward_test() { } int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; model_forward_test(); return 0; diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 0383c3ea0d64ee595d0a3b5795725366565b8f5e..1df935112e641abe6deaafb8f57b2cb635075c00 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -18,7 +18,6 @@ set(BINS tlg_decoder_main ) -message(STATUS "xxxxxxxxxx: " ${DEPS}) foreach(bin_name IN LISTS BINS) add_executable(${bin_name} 
${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index e4e5c2afb985f56cfc45fff3d7ed25f28b52a961..445f470f939b1d9f61b8acea932daea3ca1b69c1 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -53,8 +53,11 @@ using std::vector; // test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; CHECK(FLAGS_result_wspecifier != ""); CHECK(FLAGS_feature_rspecifier != ""); diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc index 0e249cc6b93ced6b80e9c7b522ad8bd3819a19e7..e0acbe77be92f646047de04a3af90ea927df589d 100644 --- a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc +++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc @@ -30,8 +30,11 @@ using std::vector; // test decoder by feeding nnet posterior probability int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader likelihood_reader( FLAGS_nnet_prob_respecifier); diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 232513539b157266b6299a3f099268796ea03b5a..0502664627a13c479348bbc766909f4db34d49b7 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -23,8 +23,11 @@ DEFINE_double(streaming_chunk, 0.36, 
"streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); ppspeech::Recognizer recognizer(resource); diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc index 93f84da3f562c3aa14f06450a12755b2feb8dd77..b633022a37b67f7f7657dab35b44894addc2ff86 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/tlg_decoder_main.cc @@ -55,8 +55,11 @@ using std::vector; // test TLG decoder by feeding speech feature. int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index 8ae63256a47991a0d7ed8c23cde11f417034ba68..050d78bea7dcd52253ed8f4c60b7b601901a9afe 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -1,5 +1,3 @@ -project(frontend) - add_library(frontend STATIC cmvn.cc db_norm.cc diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc index 0def14660d06f1db9edf4e764e7fb73511293349..93bad6886f2734e5f326dbefd7d88a0f5ee2822b 100644 --- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc +++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc @@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)"); using namespace boost::json; // from int main(int argc, 
char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; LOG(INFO) << "cmvn josn path: " << FLAGS_json_file; diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index f7a42315f2d094cfb41384d87c4504bb138649d3..93a6d40722a753053019f02218e6a5670d8637a1 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -32,13 +32,21 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(num_bins, 161, "fbank num bins"); +DEFINE_int32(sample_rate, 16000, "sampe rate: 16k, 8k."); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + CHECK(FLAGS_wav_rspecifier.size() > 0); + CHECK(FLAGS_feature_wspecifier.size() > 0); kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); + kaldi::SequentialTableReader wav_info_reader( + FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); int32 num_done = 0, num_err = 0; @@ -54,6 +62,10 @@ int main(int argc, char* argv[]) { opt.frame_opts.frame_shift_ms = 10; opt.mel_opts.num_bins = FLAGS_num_bins; opt.frame_opts.dither = 0.0; + LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms; + LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins; + LOG(INFO) << "dither: " << opt.frame_opts.dither; std::unique_ptr fbank( new ppspeech::Fbank(opt, std::move(data_source))); @@ -61,53 +73,73 @@ int main(int argc, char* argv[]) { std::unique_ptr 
cmvn( new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank))); - ppspeech::FeatureCacheOptions feat_cache_opts; // the feature cache output feature chunk by chunk. + ppspeech::FeatureCacheOptions feat_cache_opts; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); - int sample_rate = 16000; + float streaming_chunk = FLAGS_streaming_chunk; - int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; - LOG(INFO) << "chunk size (s): " << streaming_chunk; + int chunk_sample_size = streaming_chunk * FLAGS_sample_rate; + LOG(INFO) << "sr: " << FLAGS_sample_rate; + LOG(INFO) << "chunk size (sec): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); + for (; !wav_reader.Done() && !wav_info_reader.Done(); wav_reader.Next(), wav_info_reader.Next()) { + const std::string& utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); - LOG(INFO) << "process utt: " << utt; + const std::string& utt2 = wav_info_reader.Key(); + const kaldi::WaveInfo& wave_info = wav_info_reader.Value(); + + CHECK(utt == utt2) << "wav reader and wav info reader using diff rspecifier!!!"; + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "samples: " << wave_info.SampleCount(); + LOG(INFO) << "dur: " << wave_info.Duration() << " sec"; + CHECK(wave_info.SampFreq() == FLAGS_sample_rate) << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); + + // load first channel wav int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); + + // compute feat chunk by chunk int tot_samples = waveform.Dim(); - LOG(INFO) << "wav len (sample): " << tot_samples; - int sample_offset = 0; std::vector> feats; int feature_rows = 0; while (sample_offset < tot_samples) { + // cur chunk size int cur_chunk_size = 
std::min(chunk_sample_size, tot_samples - sample_offset); + // get chunk wav kaldi::Vector wav_chunk(cur_chunk_size); for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = waveform(sample_offset + i); } - kaldi::Vector features; + // compute feat feature_cache.Accept(wav_chunk); + + // send finish signal if (cur_chunk_size < chunk_sample_size) { feature_cache.SetFinished(); } + + // read feat + kaldi::Vector features; bool flag = true; do { flag = feature_cache.Read(&features); - feats.push_back(features); - feature_rows += features.Dim() / feature_cache.Dim(); + if (flag && features.Dim() != 0) { + feats.push_back(features); + feature_rows += features.Dim() / feature_cache.Dim(); + } } while (flag == true && features.Dim() != 0); + + // forward offset sample_offset += cur_chunk_size; } @@ -125,14 +157,19 @@ int main(int argc, char* argv[]) { ++cur_idx; } } + LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols(); feat_writer.Write(utt, features); + + // reset frontend pipeline state feature_cache.Reset(); if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 
0 : 1); } diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 162c3529d04a7def16a58ea7d77cc7bf22374d99..889f5663df4b909eb074292aab5e3726b0046ef6 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -31,8 +31,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index 565bba3eb9e60d43729a86c84960eefeb853f58d..2a1812fdfc5c4a2644b5206d2783cbae03868b8b 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -1,12 +1,40 @@ -project(nnet) +set(srcs decodable.cc) -add_library(nnet STATIC - decodable.cc - ds2_nnet.cc -) +if(USING_DS2) + list(APPEND srcs ds2_nnet.cc) +endif() + +if(USING_U2) + list(APPEND srcs u2_nnet.cc) +endif() + +add_library(nnet STATIC ${srcs}) target_link_libraries(nnet absl::strings) -set(bin_name ds2_nnet_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) \ No newline at end of file +if(USING_U2) + target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) + 
target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() + + +if(USING_DS2) + set(bin_name ds2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_link_libraries(${bin_name} ${DEPS}) +endif() + +# test bin +if(USING_U2) + set(bin_name u2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 465f64a9486ba713a1d9983af0ff266c7e07a60f..7780e5ae63eb97f8f21bf1dba37c8a53a107f926 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -30,6 +30,7 @@ Decodable::Decodable(const std::shared_ptr& nnet, frames_ready_(0), acoustic_scale_(acoustic_scale) {} +// for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { nnet_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); @@ -41,6 +42,7 @@ void Decodable::Acceptlikelihood(const Matrix& likelihood) { // return the size of frame have computed. 
int32 Decodable::NumFramesReady() const { return frames_ready_; } + // frame idx is from 0 to frame_ready_ -1; bool Decodable::IsLastFrame(int32 frame) { bool flag = EnsureFrameHaveComputed(frame); @@ -72,26 +74,38 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + // read feats Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { + // no feat or frontend_ not init. return false; } - int32 nnet_dim = 0; - Vector inferences; - nnet_->FeedForward(features, frontend_->Dim(), &inferences, &nnet_dim); - nnet_cache_.Resize(inferences.Dim() / nnet_dim, nnet_dim); - nnet_cache_.CopyRowsFromVec(inferences); + // forward feats + int32 vocab_dim = 0; + Vector probs; + nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); + + // cache nnet outupts + nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); + nnet_cache_.CopyRowsFromVec(probs); + + // update state frame_offset_ = frames_ready_; frames_ready_ += nnet_cache_.NumRows(); return true; } +// read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { - std::vector result; - if (EnsureFrameHaveComputed(frame) == false) return false; - likelihood->resize(nnet_cache_.NumCols()); - for (int32 idx = 0; idx < nnet_cache_.NumCols(); ++idx) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + int vocab_size = nnet_cache_.NumCols(); + likelihood->resize(vocab_size); + + for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; } diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 9555fea792ddb3afed9e9dc0db838c041a9c876b..241d04198aabf12887c179e4f80f840f14440ecd 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -27,35 +27,54 @@ class Decodable : public kaldi::DecodableInterface { explicit Decodable(const std::shared_ptr& nnet, const std::shared_ptr& 
frontend, kaldi::BaseFloat acoustic_scale = 1.0); + // void Init(DecodableOpts config); + + // nnet logprob output virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); + virtual bool IsLastFrame(int32 frame); + + // nnet output dim, e.g. vocab size virtual int32 NumIndices() const; - // not logprob + + // nnet prob output virtual bool FrameLikelihood(int32 frame, std::vector* likelihood); + virtual int32 NumFramesReady() const; + // for offline test void Acceptlikelihood(const kaldi::Matrix& likelihood); + void Reset(); + bool IsInputFinished() const { return frontend_->IsFinished(); } + bool EnsureFrameHaveComputed(int32 frame); + int32 TokenId2NnetId(int32 token_id); private: bool AdvanceChunk(); + std::shared_ptr frontend_; std::shared_ptr nnet_; + + // nnet outputs' cache kaldi::Matrix nnet_cache_; + // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame // eg: 35 frame features output 8 frame inferences int32 frame_offset_; int32 frames_ready_; + // todo: feature frame mismatch with nnet inference frame // so use subsampled_frame int32 current_log_post_subsampled_offset_; int32 num_chunk_computed_; + kaldi::BaseFloat acoustic_scale_; }; diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index e2904208260cf1900ab1d1ca77e4897556b9718b..943d7e5f293e9f37f1188207d2ffa7251d328915 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -13,8 +13,7 @@ // limitations under the License. 
#include "nnet/ds2_nnet.h" -#include "base/flags.h" -#include "base/log.h" +#include "base/common.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" @@ -49,8 +48,11 @@ using kaldi::Matrix; using std::vector; int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); @@ -146,7 +148,7 @@ int main(int argc, char* argv[]) { } kaldi::Matrix result(prob_vec.size(), prob_vec[0].Dim()); - for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { + for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) { result(row_idx, col_idx) = prob_vec[row_idx](col_idx); } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc new file mode 100644 index 0000000000000000000000000000000000000000..67ef0952ae5343c3ee574e86352e43942eafb563 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -0,0 +1,706 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "nnet/u2_nnet.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif // end USE_PROFILING + +namespace ppspeech { + +int U2NnetBase::num_frames_for_chunk(bool start) const { + int num_needed_frames = 0; // num feat frames + bool first = !start; // start == false is first + + if (chunk_size_ > 0) { + // streaming mode + if (first) { + // first chunk + // 1 decoder frame need `context` feat frames + int context = this->context(); + num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context; + } else { + // after first chunk, we need stride this num frames. + num_needed_frames = chunk_size_ * subsampling_rate_; + } + } else { + // non-streaming mode. feed all feats once. + num_needed_frames = std::numeric_limits::max(); + } + + return num_needed_frames; +} + +// cache the last (context - subsampling_rate) feat frames for next chunk +void U2NnetBase::CacheFeature(const std::vector& chunk_feats, + int32 feat_dim) { + // chunk_feats is a flat nframes*feat_dim buffer; frame k starts at k*feat_dim + const int chunk_size = chunk_feats.size() / feat_dim; + const int cached_feat_size = this->context() - subsampling_rate_; + if (chunk_size >= cached_feat_size) { + cached_feats_.resize(cached_feat_size); + for (int i = 0; i < cached_feat_size; ++i) { + auto start = + chunk_feats.begin() + (chunk_size - cached_feat_size + i) * feat_dim; + auto end = start + feat_dim; + cached_feats_[i] = std::vector(start, end); + } + } +} + +void U2NnetBase::ForwardEncoderChunk( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) { + ctc_probs->clear(); + // int num_frames = cached_feats_.size() + chunk_feats.size(); + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "foward encoder chunk: " << num_frames << " frames"; + VLOG(3) << "context: " << this->context() << " frames"; + + if (num_frames >= this->context()) { + this->ForwardEncoderChunkImpl( + chunk_feats, feat_dim, ctc_probs, vocab_dim); + VLOG(3) <<
"after forward chunk"; + this->CacheFeature(chunk_feats, feat_dim); + } +} + + +void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { + paddle::jit::utils::InitKernelSignatureMap(); + +#ifdef USE_GPU + dev_ = phi::GPUPlace(); +#else + dev_ = phi::CPUPlace(); +#endif + paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_); + model_ = std::make_shared(std::move(model)); + + subsampling_rate_ = model_->Attribute("subsampling_rate"); + right_context_ = model_->Attribute("right_context"); + sos_ = model_->Attribute("sos_symbol"); + eos_ = model_->Attribute("eos_symbol"); + is_bidecoder_ = model_->Attribute("is_bidirectional_decoder"); + + forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); + forward_attention_decoder_ = model_->Function("forward_attention_decoder"); + ctc_activation_ = model_->Function("ctc_activation"); + CHECK(forward_encoder_chunk_.IsValid()); + CHECK(forward_attention_decoder_.IsValid()); + CHECK(ctc_activation_.IsValid()); + + LOG(INFO) << "Paddle Model Info: "; + LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; + LOG(INFO) << "\tright context " << right_context_; + LOG(INFO) << "\tsos " << sos_; + LOG(INFO) << "\teos " << eos_; + LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; + + Warmup(); +} + +void U2Nnet::Warmup() { +#ifdef USE_PROFILING + RecordEvent event("warmup", TracerEventType::UserDefined, 1); +#endif + + { +#ifdef USE_PROFILING + RecordEvent event( + "warmup-encoder-ctc", TracerEventType::UserDefined, 1); +#endif + int feat_dim = 80; + int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate + + // (receptive_field - downsample_rate) + paddle::Tensor feats = paddle::full( + {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32); + paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32); + paddle::Tensor att_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + paddle::Tensor cnn_cache = + paddle::zeros({0, 0, 0, 0}, 
paddle::DataType::FLOAT32); + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache, cnn_cache}; + std::vector outputs = forward_encoder_chunk_(inputs); + + auto chunk_out = outputs[0]; + inputs = std::move(std::vector({chunk_out})); + outputs = ctc_activation_(inputs); + } + + { +#ifdef USE_PROFILING + RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1); +#endif + auto hyps = + paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace()); + auto hyps_lens = + paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace()); + auto encoder_out = paddle::ones( + {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace()); + + std::vector inputs{ + hyps, hyps_lens, encoder_out}; + + std::vector outputs = + forward_attention_decoder_(inputs); + } + + Reset(); +} + +U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) { + LoadModel(opts_.model_path); +} + +// shallow copy +U2Nnet::U2Nnet(const U2Nnet& other) { + // copy meta + right_context_ = other.right_context_; + subsampling_rate_ = other.subsampling_rate_; + sos_ = other.sos_; + eos_ = other.eos_; + is_bidecoder_ = other.is_bidecoder_; + chunk_size_ = other.chunk_size_; + num_left_chunks_ = other.num_left_chunks_; + + forward_encoder_chunk_ = other.forward_encoder_chunk_; + forward_attention_decoder_ = other.forward_attention_decoder_; + ctc_activation_ = other.ctc_activation_; + + // offset_ = other.offset_; // TODO: not used in nnets + + // copy model ptr + model_ = other.model_; + + // ignore inner states +} + +std::shared_ptr U2Nnet::Copy() const { + auto asr_model = std::make_shared(*this); + // reset inner state for new decoding + asr_model->Reset(); + return asr_model; +} + +void U2Nnet::Reset() { + // offset_ = 0; + // cached_feats_.clear(); // TODO: not used in nnets + + att_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + cnn_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + + 
encoder_outs_.clear(); +} + +// Debug API +void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { + // encoder_out (T,D) + encoder_outs_.clear(); + encoder_outs_.push_back(encoder_out); +} + + +void U2Nnet::FeedForward(const kaldi::Vector& features, + int32 feature_dim, + kaldi::Vector* inferences, + int32* inference_dim) { + std::vector chunk_feats(features.Data(), + features.Data() + features.Dim()); + std::vector ctc_probs; + ForwardEncoderChunkImpl( + chunk_feats, feature_dim, &ctc_probs, inference_dim); + inferences->Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(inferences->Data(), + ctc_probs.data(), + ctc_probs.size() * sizeof(kaldi::BaseFloat)); +} + + +void U2Nnet::ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* out_prob, + int32* vocab_dim) { +#ifdef USE_PROFILING + RecordEvent event( + "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); +#endif + + // 1. splice cached_feature, and chunk_feats + // First dimension is B, which is 1. 
+ // int num_frames = cached_feats_.size() + chunk_feats.size(); + + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "num_frames: " << num_frames; + VLOG(3) << "feat_dim: " << feat_dim; + + // feats (B=1,T,D) + paddle::Tensor feats = + paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); + float* feats_ptr = feats.mutable_data(); + + // for (size_t i = 0; i < cached_feats_.size(); ++i) { + // float* row = feats_ptr + i * feat_dim; + // std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float)); + // } + + // for (size_t i = 0; i < chunk_feats.size(); ++i) { + // float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim; + // std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float)); + // } + + // not cache feature in nnet + CHECK(cached_feats_.size() == 0); + // CHECK_EQ(std::is_same::value, true); + std::memcpy(feats_ptr, + chunk_feats.data(), + chunk_feats.size() * sizeof(kaldi::BaseFloat)); + + VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1] + << ", " << feats.shape()[2]; + +#ifdef TEST_DEBUG + { + std::stringstream path("feat", std::ios_base::app | std::ios_base::out); + path << offset_; + std::ofstream feat_fobj(path.str().c_str(), std::ios::out); + CHECK(feat_fobj.is_open()); + // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " " + // << feats.shape()[2] << "\n"; + for (int i = 0; i < feats.numel(); i++) { + feat_fobj << std::setprecision(18) << feats_ptr[i] << " "; + if ((i + 1) % feat_dim == 0) { + feat_fobj << "\n"; + } + } + feat_fobj << "\n"; + } +#endif + +// Encoder chunk forward (GPU build: copy feats and both caches to the device) +#ifdef USE_GPU + feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false); + att_cache_ = att_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false); + cnn_cache_ = cnn_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false); +#endif + + int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16 + // must be scalar, but paddle do not have scalar.
+ paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32); + // freeze `required_cache_size` in graph, so not specific it in function + // call. + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; + VLOG(3) << "inputs size: " << inputs.size(); + CHECK(inputs.size() == 4); + std::vector outputs = forward_encoder_chunk_(inputs); + VLOG(3) << "outputs size: " << outputs.size(); + CHECK(outputs.size() == 3); + +#ifdef USE_GPU + paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); + att_cache_ = outputs[1].copy_to(paddle::CPUPlace()); + cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace()); +#else + paddle::Tensor chunk_out = outputs[0]; + att_cache_ = outputs[1]; + cnn_cache_ = outputs[2]; +#endif + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits", + std::ios_base::app | std::ios_base::out); + auto i = offset_ - chunk_out.shape()[1]; + path << std::max(i, 0L); + std::ofstream logits_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_fobj.is_open()); + logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1] + << " " << chunk_out.shape()[2] << "\n"; + const float* chunk_out_ptr = chunk_out.data(); + logits_fobj << chunk_out_ptr << std::endl; + for (int i = 0; i < chunk_out.numel(); i++) { + logits_fobj << chunk_out_ptr[i] << " "; + } + logits_fobj << "\n"; + } +#endif // end TEST_DEBUG + + // current offset in decoder frame + // not used in nnet + offset_ += chunk_out.shape()[1]; + + // collects encoder outs. 
+ VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + encoder_outs_.push_back(chunk_out); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#ifdef USE_GPU + +#error "Not implementation." + +#else + // compute ctc_activation == log_softmax + inputs.clear(); + outputs.clear(); + inputs.push_back(chunk_out); + CHECK(inputs.size() == 1); + outputs = ctc_activation_(inputs); + CHECK(outputs.size() == 1); + paddle::Tensor ctc_log_probs = outputs[0]; + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logprob", + std::ios_base::app | std::ios_base::out); + path << offset_ - chunk_out.shape()[1]; + + std::ofstream logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(logprob_fobj.is_open()); + logprob_fobj << ctc_log_probs.shape()[0] << " " + << ctc_log_probs.shape()[1] << " " + << ctc_log_probs.shape()[2] << "\n"; + const float* logprob_ptr = ctc_log_probs.data(); + for (int i = 0; i < ctc_log_probs.numel(); i++) { + logprob_fobj << logprob_ptr[i] << " "; + if ((i + 1) % ctc_log_probs.shape()[2] == 0) { + logprob_fobj << "\n"; + } + } + logprob_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#endif // end USE_GPU + + // Copy to output, (B=1,T,D) + std::vector ctc_log_probs_shape = ctc_log_probs.shape(); + CHECK(ctc_log_probs_shape.size() == 3); + int B = ctc_log_probs_shape[0]; + CHECK(B == 1); + int T = ctc_log_probs_shape[1]; + 
int D = ctc_log_probs_shape[2]; + *vocab_dim = D; + + float* ctc_log_probs_ptr = ctc_log_probs.data(); + + // // vector> + // out_prob->resize(T); + // for (int i = 0; i < T; i++) { + // (*out_prob)[i].resize(D); + // float* dst_ptr = (*out_prob)[i].data(); + // float* src_ptr = ctc_log_probs_ptr + (i * D); + // std::memcpy(dst_ptr, src_ptr, D * sizeof(float)); + // } + // CHECK(std::is_same::value); + out_prob->resize(T * D); + std::memcpy( + out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list_ctc", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + + return; +} + +float U2Nnet::ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos) { + // sum `hyp` path scores in `prob` + // prob (1, Umax, V) + // hyp (U,) + float score = 0.0f; + std::vector dims = prob.shape(); + CHECK(dims.size() == 3); + VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; + CHECK(dims[0] == 1); + int vocab_dim = static_cast(dims[2]); + + const float* prob_ptr = prob.data(); + for (size_t i = 0; i < hyp.size(); ++i) { + const float* row = prob_ptr + i * vocab_dim; + score += row[hyp[i]]; + } + const float* row = prob_ptr + hyp.size() * vocab_dim; + score += row[eos]; + return score; +} + + +void U2Nnet::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { 
+#ifdef USE_PROFILING + RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); +#endif + + CHECK(rescoring_score != nullptr); + + int num_hyps = hyps.size(); + rescoring_score->resize(num_hyps, 0.0f); + + if (num_hyps == 0) return; + VLOG(2) << "num hyps: " << num_hyps; + + if (encoder_outs_.size() == 0) { + // no encoder outs + std::cerr << "encoder_outs_.size() is zero. Please check it." + << std::endl; + return; + } + + // prepare input + paddle::Tensor hyps_lens = + paddle::zeros({num_hyps}, paddle::DataType::INT64); + int64_t* hyps_len_ptr = hyps_lens.mutable_data(); + int max_hyps_len = 0; + for (size_t i = 0; i < num_hyps; ++i) { + int len = hyps[i].size() + 1; // eos + max_hyps_len = std::max(max_hyps_len, len); + hyps_len_ptr[i] = static_cast(len); + } + + paddle::Tensor hyps_tensor = + paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); + int64_t* hyps_ptr = hyps_tensor.mutable_data(); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + int64_t* row = hyps_ptr + max_hyps_len * i; + row[0] = sos_; + for (size_t j = 0; j < hyp.size(); ++j) { + row[j + 1] = hyp[j]; + } + } + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_concat", + std::ios_base::app | std::ios_base::out); + for (int j = 0; j < encoder_outs_.size(); j++) { + path << j; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[j].shape()[0] << " " + << encoder_outs_[j].shape()[1] << " " + << encoder_outs_[j].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[j].data(); + for (int i = 0; i < encoder_outs_[j].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } + } +#endif // end TEST_DEBUG + + // forward attention decoder by hyps and correspoinding encoder_outs_ + paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1); + VLOG(2) << "encoder_outs_ size: " << 
encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out0", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_outs_[0].data(); + + size_t size = encoder_outs_[0].numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_out.shape()[0] << " " + << encoder_out.shape()[1] << " " + << encoder_out.shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_out.data(); + + size_t size = encoder_out.numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + std::vector inputs{ + hyps_tensor, hyps_lens, encoder_out}; + std::vector outputs = forward_attention_decoder_(inputs); + CHECK(outputs.size() == 2); + + // (B, Umax, V) + paddle::Tensor probs = outputs[0]; + std::vector probs_shape = probs.shape(); + CHECK(probs_shape.size() == 3); + CHECK(probs_shape[0] == num_hyps); + CHECK(probs_shape[1] == max_hyps_len); + +#ifdef TEST_DEBUG + { + std::stringstream path("decoder_logprob", + std::ios_base::app | std::ios_base::out); + std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(dec_logprob_fobj.is_open()); + + dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " " + << probs.shape()[2] << "\n"; + const float* dec_logprob_ptr = probs.data(); + + size_t size = probs.numel(); + for (int i = 0; i < size; i++) { + dec_logprob_fobj << dec_logprob_ptr[i] 
<< "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_lens", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_len_fobj.is_open()); + + const int64_t* hyps_lens_ptr = hyps_lens.data(); + + size_t size = hyps_lens.numel(); + for (int i = 0; i < size; i++) { + hyps_len_fobj << hyps_lens_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_tensor", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_tensor_fobj.is_open()); + + const int64_t* hyps_tensor_ptr = hyps_tensor.data(); + + size_t size = hyps_tensor.numel(); + for (int i = 0; i < size; i++) { + hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + paddle::Tensor r_probs = outputs[1]; + std::vector r_probs_shape = r_probs.shape(); + if (is_bidecoder_ && reverse_weight > 0) { + CHECK(r_probs_shape.size() == 3); + CHECK(r_probs_shape[0] == num_hyps); + CHECK(r_probs_shape[1] == max_hyps_len); + } else { + // dump r_probs + CHECK(r_probs_shape.size() == 1); + CHECK(r_probs_shape[0] == 1) << r_probs_shape[0]; + } + + // compute rescoring score + using IntArray = paddle::experimental::IntArray; + std::vector probs_v = + paddle::experimental::split_with_num(probs, num_hyps, 0); + VLOG(2) << "split prob: " << probs_v.size() << " " + << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0] + << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2]; + CHECK(static_cast(probs_v.size()) == num_hyps) + << ": is " << probs_v.size() << " expect: " << num_hyps; + + std::vector r_probs_v; + if (is_bidecoder_ && reverse_weight > 0) { + r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0); + CHECK(static_cast(r_probs_v.size()) == num_hyps) + << "r_probs_v size: is " << r_probs_v.size() + << " expect: " << num_hyps; + } + + 
for (int i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + + // left-to-right decoder score + float score = 0.0f; + score = ComputePathScore(probs_v[i], hyp, eos_); + + // right-to-left decoder score + float r_score = 0.0f; + if (is_bidecoder_ && reverse_weight > 0) { + std::vector r_hyp(hyp.size()); + std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); + r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_); + } + + // combinded left-to-right and right-to-lfet score + (*rescoring_score)[i] = + score * (1 - reverse_weight) + r_score * reverse_weight; + VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score + << " reverse_weight: " << reverse_weight; + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h new file mode 100644 index 0000000000000000000000000000000000000000..ddc85b45fd94699eeb6bbab3d424300764976071 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.h @@ -0,0 +1,157 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "base/common.h" +#include "kaldi/matrix/kaldi-matrix.h" + +#include "kaldi/util/options-itf.h" +#include "nnet/nnet_itf.h" + +#include "paddle/extension.h" +#include "paddle/jit/all.h" +#include "paddle/phi/api/all.h" + +namespace ppspeech { + +struct U2ModelOptions { + std::string model_path; + int thread_num; + bool use_gpu; + U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("model-path", &model_path, "model file path"); + opts->Register("thread-num", &thread_num, "thread num"); + opts->Register("use-gpu", &use_gpu, "if use gpu"); + } +}; + + +class U2NnetBase : public NnetInterface { + public: + virtual int context() const { return right_context_ + 1; } + virtual int right_context() const { return right_context_; } + virtual int subsampling_rate() const { return subsampling_rate_; } + virtual int eos() const { return eos_; } + virtual int sos() const { return sos_; } + virtual int is_bidecoder() const { return is_bidecoder_; } + // current offset in decoder frame + virtual int offset() const { return offset_; } + virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } + virtual void set_num_left_chunks(int num_left_chunks) { + num_left_chunks_ = num_left_chunks; + } + // start: false, it is the start chunk of one sentence, else true + virtual int num_frames_for_chunk(bool start) const; + + virtual std::shared_ptr Copy() const = 0; + + virtual void ForwardEncoderChunk( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim); + + virtual void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) = 0; + + protected: + virtual void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) = 0; + + virtual void CacheFeature(const std::vector& chunk_feats, + int32 feat_dim); + + protected: + 
// model specification + int right_context_{0}; + int subsampling_rate_{1}; + + int sos_{0}; + int eos_{0}; + + bool is_bidecoder_{false}; + + int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming + // case. Otherwise, none streaming case + int num_left_chunks_{-1}; // -1 means all left chunks + + // asr decoder state + int offset_{0}; // current offset in encoder output time stamp. Used by + // position embedding. + std::vector> cached_feats_{}; // features cache +}; + + +class U2Nnet : public U2NnetBase { + public: + U2Nnet(const U2ModelOptions& opts); + U2Nnet(const U2Nnet& other); + + void FeedForward(const kaldi::Vector& features, + int32 feature_dim, + kaldi::Vector* inferences, + int32* inference_dim) override; + + void Reset() override; + + void Dim(); + + void LoadModel(const std::string& model_path_w_prefix); + void Warmup(); + + std::shared_ptr model() const { return model_; } + + std::shared_ptr Copy() const override; + + void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) override; + + float ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos); + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override; + + // debug + void FeedEncoderOuts(paddle::Tensor& encoder_out); + + private: + U2ModelOptions opts_; + + phi::Place dev_; + std::shared_ptr model_{nullptr}; + std::vector encoder_outs_; + // transformer/conformer attention cache + paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + // conformer-only conv_module cache + paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + + paddle::jit::Function forward_encoder_chunk_; + paddle::jit::Function forward_attention_decoder_; + paddle::jit::Function ctc_activation_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc 
b/speechx/speechx/nnet/u2_nnet_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a1a5e02d75ee22d4f8786b05e3ee2885e5c46c2 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "nnet/u2_nnet.h" +#include "base/common.h" +#include "frontend/audio/assembler.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); + +DEFINE_string(model_path, "", "paddle nnet model"); + +DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk"); +DEFINE_int32(receptive_field_length, + 7, + "receptive field of two CNN(kernel=3) downsampling module."); +DEFINE_int32(downsampling_rate, + 4, + "two CNN(kernel=3) module downsampling rate."); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + + CHECK(FLAGS_feature_rspecifier.size() > 0); + CHECK(FLAGS_nnet_prob_wspecifier.size() > 0); + 
CHECK(FLAGS_model_path.size() > 0); + LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier; + LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; + LOG(INFO) << "model path: " << FLAGS_model_path; + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); + + ppspeech::U2ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + + int32 chunk_size = + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate + + FLAGS_receptive_field_length; + int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + kaldi::Timer timer; + + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + // // pad feats + // int32 padding_len = 0; + // if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { + // padding_len = + // chunk_stride - (feature.NumRows() - chunk_size) % + // chunk_stride; + // feature.Resize(feature.NumRows() + padding_len, + // feature.NumCols(), + // kaldi::kCopyData); + // } + + int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; + int32 frame_idx = 0; + std::vector> prob_vec; + int32 ori_feature_len = feature.NumRows(); + + for (int chunk_idx = 0; chunk_idx < 
num_chunks; ++chunk_idx) { + kaldi::Vector feature_chunk(chunk_size * + feat_dim); + + int32 feature_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + feature_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (feature_chunk_size < receptive_field_length) { + LOG(WARNING) << "utt: " << utt << " skip last " + << feature_chunk_size << " frames, expect is " + << receptive_field_length; + break; + } + + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < feature_chunk_size; ++row_id) { // copy only rows present in `feature`; never index past its last row + kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); + + feature_chunk_row.CopyFromVec(feat_row); + ++start; + } + + // feat to frontend pipeline cache + raw_data->Accept(feature_chunk); + + // send data finish signal + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + + // get nnet outputs + vector prob; + while (decodable->FrameLikelihood(frame_idx, &prob)) { + kaldi::Vector vec_tmp(prob.size()); + std::memcpy(vec_tmp.Data(), + prob.data(), + sizeof(kaldi::BaseFloat) * prob.size()); + prob_vec.push_back(vec_tmp); + frame_idx++; + } + } + + // after process one utt, then reset decoder state. + decodable->Reset(); + + if (prob_vec.size() == 0) { + // the TokenWriter can not write empty string.
+ ++num_err; + LOG(WARNING) << " the nnet prob of " << utt << " is empty"; + continue; + } + + // writer nnet output + kaldi::MatrixIndexT nrow = prob_vec.size(); + kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); + LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; + kaldi::Matrix result(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + result(row_idx, col_idx) = prob_vec[row_idx](col_idx); + } + } + nnet_out_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + LOG(INFO) << " cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt index 98b2f38b43a87e9b548c34d96a1f601e957e0045..71b33daa92945540e7949b5f6bd6bba859d9bc3e 100644 --- a/speechx/speechx/protocol/CMakeLists.txt +++ b/speechx/speechx/protocol/CMakeLists.txt @@ -1,3 +1 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - add_subdirectory(websocket) diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index c3454c399eb36355b756bfa2744361e957655a72..0f73fd24ca3a6d30107bb19f41d8f0d3d67b2967 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -1,4 +1,4 @@ -project(websocket) +# project(websocket) add_library(websocket STATIC websocket_server.cc diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt index 95e8657443c54bf5e08cc885276ce8fda4548736..c1e875be1c8731c46b076f5d7199031470d06a2f 100644 --- a/speechx/speechx/utils/CMakeLists.txt +++ b/speechx/speechx/utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(utils file_utils.cc + math.cc ) \ No newline at end of file diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 
7c31929561f8c4c2f1f986344b175f1d67d49b50..5087ac60bc49b54e5ecb66cbc07a61e16cc0a459 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -38,11 +38,11 @@ float LogSumExp(float x, float y) { template struct ValGreaterComp { bool operator()(const std::pair& lhs, - const std::pair& rhs) const { + const std::pair& rhs) const { return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); } -} +}; template void TopK(const std::vector& data, diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh new file mode 100755 index 0000000000000000000000000000000000000000..3952988c68498507b7aa1d6f2758ad0a2ce63def --- /dev/null +++ b/speechx/tools/venv.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -ex + +PYTHON=python3.7 +test -d venv || virtualenv -p ${PYTHON} venv