Commit 290c23b9 authored by Hui Zhang

add u2 nnet, u2 nnet main, codelab, and can compile

Parent e1fc57de
......@@ -42,6 +42,7 @@ for type in attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_wav.py \
--debug True \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
......
......@@ -16,6 +16,8 @@ import os
import sys
from pathlib import Path
import distutils.util
import numpy as np
import paddle
import soundfile
from yacs.config import CfgNode
......@@ -74,6 +76,8 @@ class U2Infer():
# fbank
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
if self.args.debug:
np.savetxt("feat.transform.txt", feat)
ilen = paddle.to_tensor(feat.shape[0])
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
......@@ -125,6 +129,11 @@ if __name__ == "__main__":
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
parser.add_argument(
"--debug",
type=distutils.util.strtobool,
default=False,
help="for debug.")
args = parser.parse_args()
config = CfgNode(new_allowed=True)
......
# This file is used by clang-format to autoformat paddle source code
#
# The clang-format is part of llvm toolchain.
# It needs llvm and clang installed to format source code.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located
# in one of the parent directories.
# The -i flag means in-place change.
#
# The document of clang-format is
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # The private/protected/public has no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
......@@ -31,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall
###############################################################################
# Option Configurations
###############################################################################
option(TEST_DEBUG "option for debug" OFF)
option(USE_PROFILING "enable c++ profiling" OFF)
option(USING_U2 "compile with u2 model." ON)
option(USING_DS2 "compile with ds2 model." ON)
option(USING_GPU "u2 compute on GPU." OFF)
###############################################################################
# Include third party
......@@ -85,6 +89,41 @@ add_dependencies(openfst gflags glog)
include(paddleinference)
# paddle core.so
find_package(Threads REQUIRED)
find_package(PythonLibs REQUIRED)
find_package(Python3 REQUIRED)
find_package(pybind11 CONFIG)
message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}")
message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}")
message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}")
# paddle include and link option
execute_process(
COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')"
OUTPUT_VARIABLE PADDLE_LINK_FLAGS
RESULT_VARIABLE SUCCESS)
message(STATUS "PADDLE_LINK_FLAGS=${PADDLE_LINK_FLAGS}")
string(STRIP "${PADDLE_LINK_FLAGS}" PADDLE_LINK_FLAGS)
# paddle compile option
execute_process(
COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_compile_flags()), end='')"
OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS)
message(STATUS "PADDLE_COMPILE_FLAGS=${PADDLE_COMPILE_FLAGS}")
string(STRIP "${PADDLE_COMPILE_FLAGS}" PADDLE_COMPILE_FLAGS)
# for LD_LIBRARY_PATH
# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/)
# paddle.sysconfig.get_lib() returns a single directory string, so do not
# ':'.join() it (that would interleave colons between characters).
execute_process(
COMMAND python -c "import paddle; print(paddle.sysconfig.get_lib(), end='')"
OUTPUT_VARIABLE PADDLE_LIB_DIRS)
message(STATUS "PADDLE_LIB_DIRS=${PADDLE_LIB_DIRS}")
###############################################################################
# Add local library
###############################################################################
......
......@@ -3,11 +3,14 @@
## Environment
We develop under:
* python - 3.7
* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
* os - Ubuntu 16.04.7 LTS
* gcc/g++/gfortran - 8.2.0
* cmake - 3.16.0
> Please use `tools/env.sh` to create a python `venv`, then `source venv/bin/activate` to build speechx.
> We make sure everything works fine under docker, and recommend using it to develop and deploy.
* [How to Install Docker](https://docs.docker.com/engine/install/)
......@@ -24,13 +27,16 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam
* More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
2. Create python environment.
```
bash tools/venv.sh
```
> Do not source venv.
3. Build `speechx` and `examples`.
```
pushd /path/to/speechx
source venv/bin/activate
./build.sh
```
......
......@@ -2,10 +2,9 @@ include(FetchContent)
FetchContent_Declare(
gflags
URL https://github.com/gflags/gflags/archive/v2.2.2.zip
URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5
)
FetchContent_MakeAvailable(gflags)
# openfst need
......
include(FetchContent)
FetchContent_Declare(
gtest
URL https://github.com/google/googletest/archive/release-1.11.0.zip
URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a
)
FetchContent_MakeAvailable(gtest)
......
# This contains the locations of the binaries built for running the examples.
SPEECHX_ROOT=$PWD/../../../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project was built successfully"; }
[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project was built successfully"; }
export LC_ALL=C
......
......@@ -54,4 +54,10 @@ compute_linear_spectrogram_main \
--cmvn_file=$exp_dir/cmvn.ark
echo "compute linear spectrogram feature."
compute_fbank_main \
--num_bins 161 \
--wav_rspecifier=scp:$data_dir/wav.scp \
--feature_wspecifier=ark,t:$exp_dir/fbank.ark \
--cmvn_file=$exp_dir/cmvn.ark
echo "compute fbank feature."
......@@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project was built successfully"; }
[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project was built successfully"; }
export LC_ALL=C
......
# Deepspeech2 Streaming NNet Test
Used for ds2 streaming nnet inference testing.
# This contains the locations of the binaries built for running the examples.
unset GREP_OPTIONS
SPEECHX_ROOT=$PWD/../../../
SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project was built successfully"; }
export LC_ALL=C
SPEECHX_BIN=$SPEECHX_BUILD/nnet
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
# paddle.sysconfig.get_lib() returns a single directory string, so no join is needed.
PADDLE_LIB_PATH=$(python -c "import paddle; print(paddle.sysconfig.get_lib(), end='')")
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
#!/bin/bash
set -x
set -e
. path.sh
# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
pushd ${SPEECHX_ROOT}
bash build.sh
popd
fi
# 2. download model
if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then
mkdir -p data/model
pushd data/model
wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
popd
fi
# produce wav scp
if [ ! -f data/wav.scp ]; then
mkdir -p data
pushd data
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
echo "utt1 " $PWD/zh.wav > wav.scp
popd
fi
data=data
exp=exp
mkdir -p $exp
ckpt_dir=./data/model
model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
cmvn_json2kaldi_main \
--json_file $model_dir/mean_std.json \
--cmvn_write_path $exp/cmvn.ark \
--binary=false
echo "convert json cmvn to kaldi ark."
compute_fbank_main \
--num_bins 80 \
--wav_rspecifier=scp:$data/wav.scp \
--cmvn_file=$exp/cmvn.ark \
--feature_wspecifier=ark,t:$exp/fbank.ark
echo "compute fbank feature."
u2_nnet_main \
--model_path=$model_dir/export.jit \
--feature_rspecifier=ark,t:$exp/fbank.ark \
--nnet_decoder_chunk=16 \
--receptive_field_length=7 \
--downsampling_rate=4 \
--acoustic_scale=1.0 \
--nnet_prob_wspecifier=ark,t:$exp/probs.ark
#!/bin/bash
# this script is for memory check, so please run ./run.sh first.
set +x
set -e
. ./path.sh
if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
echo "please install valgrind in the speechx tools dir.\n"
exit 1
fi
ckpt_dir=./data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
ds2_model_test_main \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdparams
# U2/U2++ Streaming ASR
## Examples
* `wenetspeech` - streaming decoding with the wenetspeech u2/u2++ model, using aishell test data for testing.
......@@ -34,6 +34,7 @@
#include <stdexcept>
#include <string>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
......
......@@ -17,7 +17,7 @@
int main(int argc, char* argv[]) {
// Initialize Google’s logging library.
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
LOG(INFO) << "Found " << 10 << " cookies";
......
......@@ -195,8 +195,11 @@ void model_forward_test() {
}
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
model_forward_test();
return 0;
......
......@@ -18,7 +18,6 @@ set(BINS
tlg_decoder_main
)
message(STATUS "xxxxxxxxxx: " ${DEPS})
foreach(bin_name IN LISTS BINS)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
......
......@@ -53,8 +53,11 @@ using std::vector;
// test ds2 online decoder by feeding speech feature
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
CHECK(FLAGS_result_wspecifier != "");
CHECK(FLAGS_feature_rspecifier != "");
......
......@@ -30,8 +30,11 @@ using std::vector;
// test decoder by feeding nnet posterior probability
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
FLAGS_nnet_prob_respecifier);
......
......@@ -23,8 +23,11 @@ DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(sample_rate, 16000, "sample rate");
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure();
ppspeech::Recognizer recognizer(resource);
......
......@@ -55,8 +55,11 @@ using std::vector;
// test TLG decoder by feeding speech feature.
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
kaldi::SequentialBaseFloatMatrixReader feature_reader(
FLAGS_feature_rspecifier);
......
project(frontend)
add_library(frontend STATIC
cmvn.cc
db_norm.cc
......
......@@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
using namespace boost::json; // from <boost/json.hpp>
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
......
......@@ -32,13 +32,21 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(num_bins, 161, "fbank num bins");
DEFINE_int32(sample_rate, 16000, "sample rate: 16k, 8k.");
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
CHECK(FLAGS_wav_rspecifier.size() > 0);
CHECK(FLAGS_feature_wspecifier.size() > 0);
kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
FLAGS_wav_rspecifier);
kaldi::SequentialTableReader<kaldi::WaveInfoHolder> wav_info_reader(
FLAGS_wav_rspecifier);
kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
int32 num_done = 0, num_err = 0;
......@@ -54,6 +62,10 @@ int main(int argc, char* argv[]) {
opt.frame_opts.frame_shift_ms = 10;
opt.mel_opts.num_bins = FLAGS_num_bins;
opt.frame_opts.dither = 0.0;
LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms;
LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms;
LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins;
LOG(INFO) << "dither: " << opt.frame_opts.dither;
std::unique_ptr<ppspeech::FrontendInterface> fbank(
new ppspeech::Fbank(opt, std::move(data_source)));
......@@ -61,53 +73,73 @@ int main(int argc, char* argv[]) {
std::unique_ptr<ppspeech::FrontendInterface> cmvn(
new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank)));
// the feature cache outputs features chunk by chunk.
ppspeech::FeatureCacheOptions feat_cache_opts;
ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
LOG(INFO) << "fbank: " << true;
LOG(INFO) << "feat dim: " << feature_cache.Dim();
float streaming_chunk = FLAGS_streaming_chunk;
int chunk_sample_size = streaming_chunk * FLAGS_sample_rate;
LOG(INFO) << "sr: " << FLAGS_sample_rate;
LOG(INFO) << "chunk size (sec): " << streaming_chunk;
LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
for (; !wav_reader.Done() && !wav_info_reader.Done();
     wav_reader.Next(), wav_info_reader.Next()) {
    const std::string& utt = wav_reader.Key();
    const kaldi::WaveData& wave_data = wav_reader.Value();
const std::string& utt2 = wav_info_reader.Key();
const kaldi::WaveInfo& wave_info = wav_info_reader.Value();
CHECK(utt == utt2)
    << "wav reader and wav info reader use different rspecifiers!";
LOG(INFO) << "utt: " << utt;
LOG(INFO) << "samples: " << wave_info.SampleCount();
LOG(INFO) << "dur: " << wave_info.Duration() << " sec";
CHECK(wave_info.SampFreq() == FLAGS_sample_rate)
    << "expect " << FLAGS_sample_rate << ", got " << wave_info.SampFreq();
// load first channel wav
int32 this_channel = 0;
kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
this_channel);
// compute feat chunk by chunk
int tot_samples = waveform.Dim();
LOG(INFO) << "wav len (sample): " << tot_samples;
int sample_offset = 0;
std::vector<kaldi::Vector<BaseFloat>> feats;
int feature_rows = 0;
while (sample_offset < tot_samples) {
// cur chunk size
int cur_chunk_size =
std::min(chunk_sample_size, tot_samples - sample_offset);
// get chunk wav
kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
for (int i = 0; i < cur_chunk_size; ++i) {
wav_chunk(i) = waveform(sample_offset + i);
}
kaldi::Vector<BaseFloat> features;
// compute feat
feature_cache.Accept(wav_chunk);
// send finish signal
if (cur_chunk_size < chunk_sample_size) {
feature_cache.SetFinished();
}
// read feat
kaldi::Vector<BaseFloat> features;
bool flag = true;
do {
flag = feature_cache.Read(&features);
if (flag && features.Dim() != 0) {
feats.push_back(features);
feature_rows += features.Dim() / feature_cache.Dim();
}
} while (flag == true && features.Dim() != 0);
// forward offset
sample_offset += cur_chunk_size;
}
......@@ -125,14 +157,19 @@ int main(int argc, char* argv[]) {
++cur_idx;
}
}
LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols();
feat_writer.Write(utt, features);
// reset frontend pipeline state
feature_cache.Reset();
if (num_done % 50 == 0 && num_done != 0)
VLOG(2) << "Processed " << num_done << " utterances";
num_done++;
}
KALDI_LOG << "Done " << num_done << " utterances, " << num_err
LOG(INFO) << "Done " << num_done << " utterances, " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
}
......@@ -31,8 +31,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
FLAGS_wav_rspecifier);
......
project(nnet)
set(srcs decodable.cc)
if(USING_DS2)
list(APPEND srcs ds2_nnet.cc)
endif()
if(USING_U2)
list(APPEND srcs u2_nnet.cc)
endif()
add_library(nnet STATIC ${srcs})
target_link_libraries(nnet absl::strings)
if(USING_U2)
target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS})
target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
# target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endif()
if(USING_DS2)
set(bin_name ds2_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
target_link_libraries(${bin_name} ${DEPS})
endif()
# test bin
if(USING_U2)
set(bin_name u2_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endif()
......@@ -30,6 +30,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
frames_ready_(0),
acoustic_scale_(acoustic_scale) {}
// for debug
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
nnet_cache_ = likelihood;
frames_ready_ += likelihood.NumRows();
......@@ -41,6 +42,7 @@ void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
// return the number of frames computed so far.
int32 Decodable::NumFramesReady() const { return frames_ready_; }
// frame idx ranges from 0 to frames_ready_ - 1.
bool Decodable::IsLastFrame(int32 frame) {
bool flag = EnsureFrameHaveComputed(frame);
......@@ -72,26 +74,38 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) {
}
bool Decodable::AdvanceChunk() {
// read feats
Vector<BaseFloat> features;
if (frontend_ == NULL || frontend_->Read(&features) == false) {
// no feat or frontend_ not init.
return false;
}
// forward feats
int32 vocab_dim = 0;
Vector<BaseFloat> probs;
nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim);
// cache nnet outputs
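// probs is a flattened (T * vocab_dim) vector; cache it as a (T, vocab_dim) matrix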
nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
nnet_cache_.CopyRowsFromVec(probs);
// update state
frame_offset_ = frames_ready_;
frames_ready_ += nnet_cache_.NumRows();
return true;
}
// read one frame likelihood
bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
if (EnsureFrameHaveComputed(frame) == false) {
return false;
}
int vocab_size = nnet_cache_.NumCols();
likelihood->resize(vocab_size);
for (int32 idx = 0; idx < vocab_size; ++idx) {
(*likelihood)[idx] =
nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_;
}
......
......@@ -27,35 +27,54 @@ class Decodable : public kaldi::DecodableInterface {
explicit Decodable(const std::shared_ptr<NnetInterface>& nnet,
const std::shared_ptr<FrontendInterface>& frontend,
kaldi::BaseFloat acoustic_scale = 1.0);
// void Init(DecodableOpts config);
// nnet logprob output
virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);
virtual bool IsLastFrame(int32 frame);
// nnet output dim, e.g. vocab size
virtual int32 NumIndices() const;
// nnet prob (not logprob) output
virtual bool FrameLikelihood(int32 frame,
std::vector<kaldi::BaseFloat>* likelihood);
virtual int32 NumFramesReady() const;
// for offline test
void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
void Reset();
bool IsInputFinished() const { return frontend_->IsFinished(); }
bool EnsureFrameHaveComputed(int32 frame);
int32 TokenId2NnetId(int32 token_id);
private:
bool AdvanceChunk();
std::shared_ptr<FrontendInterface> frontend_;
std::shared_ptr<NnetInterface> nnet_;
// nnet outputs' cache
kaldi::Matrix<kaldi::BaseFloat> nnet_cache_;
// the frame is a nnet prob frame rather than an audio feature frame:
// the nnet subsamples the feature frames,
// e.g. 35 feature frames yield 8 inference frames
int32 frame_offset_;
int32 frames_ready_;
// TODO: feature frames mismatch nnet inference frames,
// so use the subsampled frame count
int32 current_log_post_subsampled_offset_;
int32 num_chunk_computed_;
kaldi::BaseFloat acoustic_scale_;
};
......
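The subsampling bookkeeping described above can be made concrete with a small standalone sketch (a hypothetical helper, not part of this commit; it assumes context 7 and subsampling rate 4, the values used by the u2 model elsewhere in this change):

```cpp
#include <iostream>

// number of nnet (subsampled) frames produced from num_feat_frames feature
// frames, given the encoder context and subsampling rate
int NumNnetFrames(int num_feat_frames, int context, int subsampling_rate) {
    if (num_feat_frames < context) return 0;
    return (num_feat_frames - context) / subsampling_rate + 1;
}

int main() {
    // matches the comment above: 35 feature frames -> 8 inference frames
    std::cout << NumNnetFrames(35, /*context=*/7, /*subsampling_rate=*/4)
              << std::endl;  // prints 8
    return 0;
}
```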
......@@ -13,8 +13,7 @@
// limitations under the License.
#include "nnet/ds2_nnet.h"
#include "base/flags.h"
#include "base/log.h"
#include "base/common.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/data_cache.h"
#include "kaldi/util/table-types.h"
......@@ -49,8 +48,11 @@ using kaldi::Matrix;
using std::vector;
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
kaldi::SequentialBaseFloatMatrixReader feature_reader(
FLAGS_feature_rspecifier);
......@@ -146,7 +148,7 @@ int main(int argc, char* argv[]) {
}
kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
prob_vec[0].Dim());
for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "nnet/u2_nnet.h"
#ifdef USE_PROFILING
#include "paddle/fluid/platform/profiler.h"
using paddle::platform::RecordEvent;
using paddle::platform::TracerEventType;
#endif // end USE_PROFILING
namespace ppspeech {
int U2NnetBase::num_frames_for_chunk(bool start) const {
int num_needed_frames = 0; // num feat frames
bool first = !start;  // start == false means this is the first chunk
if (chunk_size_ > 0) {
// streaming mode
if (first) {
// first chunk
// 1 decoder frame need `context` feat frames
int context = this->context();
num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context;
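// e.g. chunk_size 16, subsampling 4, context 7: (16 - 1) * 4 + 7 = 67 feat frames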
} else {
// after first chunk, we need stride this num frames.
num_needed_frames = chunk_size_ * subsampling_rate_;
}
} else {
// non-streaming mode. feed all feats once.
num_needed_frames = std::numeric_limits<int>::max();
}
return num_needed_frames;
}
// cache feats for next chunk
void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim) {
// chunk_feats is nframes*feat_dim
const int chunk_size = chunk_feats.size() / feat_dim;
const int cached_feat_size = this->context() - subsampling_rate_;
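// e.g. context 7, subsampling rate 4: cache the last 7 - 4 = 3 feature frames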
if (chunk_size >= cached_feat_size) {
cached_feats_.resize(cached_feat_size);
for (int i = 0; i < cached_feat_size; ++i) {
auto start =
chunk_feats.begin() + chunk_size - cached_feat_size + i;
auto end = start + feat_dim;
cached_feats_[i] = std::vector<float>(start, end);
}
}
}
void U2NnetBase::ForwardEncoderChunk(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) {
ctc_probs->clear();
// int num_frames = cached_feats_.size() + chunk_feats.size();
int num_frames = chunk_feats.size() / feat_dim;
VLOG(3) << "foward encoder chunk: " << num_frames << " frames";
VLOG(3) << "context: " << this->context() << " frames";
if (num_frames >= this->context()) {
this->ForwardEncoderChunkImpl(
chunk_feats, feat_dim, ctc_probs, vocab_dim);
VLOG(3) << "after forward chunk";
this->CacheFeature(chunk_feats, feat_dim);
}
}
void U2Nnet::LoadModel(const std::string& model_path_w_prefix) {
paddle::jit::utils::InitKernelSignatureMap();
#ifdef USE_GPU
dev_ = phi::GPUPlace();
#else
dev_ = phi::CPUPlace();
#endif
paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_);
model_ = std::make_shared<paddle::jit::Layer>(std::move(model));
subsampling_rate_ = model_->Attribute<int>("subsampling_rate");
right_context_ = model_->Attribute<int>("right_context");
sos_ = model_->Attribute<int>("sos_symbol");
eos_ = model_->Attribute<int>("eos_symbol");
is_bidecoder_ = model_->Attribute<int>("is_bidirectional_decoder");
forward_encoder_chunk_ = model_->Function("forward_encoder_chunk");
forward_attention_decoder_ = model_->Function("forward_attention_decoder");
ctc_activation_ = model_->Function("ctc_activation");
CHECK(forward_encoder_chunk_.IsValid());
CHECK(forward_attention_decoder_.IsValid());
CHECK(ctc_activation_.IsValid());
LOG(INFO) << "Paddle Model Info: ";
LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_;
LOG(INFO) << "\tright context " << right_context_;
LOG(INFO) << "\tsos " << sos_;
LOG(INFO) << "\teos " << eos_;
LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl;
Warmup();
}
void U2Nnet::Warmup() {
#ifdef USE_PROFILING
RecordEvent event("warmup", TracerEventType::UserDefined, 1);
#endif
{
#ifdef USE_PROFILING
RecordEvent event(
"warmup-encoder-ctc", TracerEventType::UserDefined, 1);
#endif
int feat_dim = 80;
int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate +
// (receptive_field - downsample_rate)
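// = 67 feat frames, i.e. (chunk_size - 1) * rate + receptive_field = (16 - 1) * 4 + 7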
paddle::Tensor feats = paddle::full(
{1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32);
paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32);
paddle::Tensor att_cache =
paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
paddle::Tensor cnn_cache =
paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
std::vector<paddle::Tensor> inputs = {
feats, offset, /*required_cache_size, */ att_cache, cnn_cache};
std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
auto chunk_out = outputs[0];
inputs = std::move(std::vector<paddle::Tensor>({chunk_out}));
outputs = ctc_activation_(inputs);
}
{
#ifdef USE_PROFILING
RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1);
#endif
auto hyps =
paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace());
auto hyps_lens =
paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace());
auto encoder_out = paddle::ones(
{1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace());
std::vector<paddle::experimental::Tensor> inputs{
hyps, hyps_lens, encoder_out};
std::vector<paddle::experimental::Tensor> outputs =
forward_attention_decoder_(inputs);
}
Reset();
}
U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) {
LoadModel(opts_.model_path);
}
// shallow copy
U2Nnet::U2Nnet(const U2Nnet& other) {
// copy meta
right_context_ = other.right_context_;
subsampling_rate_ = other.subsampling_rate_;
sos_ = other.sos_;
eos_ = other.eos_;
is_bidecoder_ = other.is_bidecoder_;
chunk_size_ = other.chunk_size_;
num_left_chunks_ = other.num_left_chunks_;
forward_encoder_chunk_ = other.forward_encoder_chunk_;
forward_attention_decoder_ = other.forward_attention_decoder_;
ctc_activation_ = other.ctc_activation_;
// offset_ = other.offset_; // TODO: not used in nnets
// copy model ptr
model_ = other.model_;
// ignore inner states
}
std::shared_ptr<NnetInterface> U2Nnet::Copy() const {
auto asr_model = std::make_shared<U2Nnet>(*this);
// reset inner state for new decoding
asr_model->Reset();
return asr_model;
}
void U2Nnet::Reset() {
// offset_ = 0;
// cached_feats_.clear(); // TODO: not used in nnets
att_cache_ =
std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32));
cnn_cache_ =
std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32));
encoder_outs_.clear();
}
// Debug API
void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
// encoder_out (T,D)
encoder_outs_.clear();
encoder_outs_.push_back(encoder_out);
}
void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<BaseFloat>* inferences,
int32* inference_dim) {
std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
features.Data() + features.Dim());
std::vector<kaldi::BaseFloat> ctc_probs;
ForwardEncoderChunkImpl(
chunk_feats, feature_dim, &ctc_probs, inference_dim);
inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
std::memcpy(inferences->Data(),
ctc_probs.data(),
ctc_probs.size() * sizeof(kaldi::BaseFloat));
}
void U2Nnet::ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
std::vector<kaldi::BaseFloat>* out_prob,
int32* vocab_dim) {
#ifdef USE_PROFILING
RecordEvent event(
"ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);
#endif
// 1. splice cached_feature, and chunk_feats
// First dimension is B, which is 1.
// int num_frames = cached_feats_.size() + chunk_feats.size();
int num_frames = chunk_feats.size() / feat_dim;
VLOG(3) << "num_frames: " << num_frames;
VLOG(3) << "feat_dim: " << feat_dim;
// feats (B=1,T,D)
paddle::Tensor feats =
paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32);
float* feats_ptr = feats.mutable_data<float>();
// for (size_t i = 0; i < cached_feats_.size(); ++i) {
// float* row = feats_ptr + i * feat_dim;
// std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float));
// }
// for (size_t i = 0; i < chunk_feats.size(); ++i) {
// float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim;
// std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float));
// }
// not cache feature in nnet
CHECK(cached_feats_.size() == 0);
// CHECK_EQ(std::is_same<float, kaldi::BaseFloat>::value, true);
std::memcpy(feats_ptr,
chunk_feats.data(),
chunk_feats.size() * sizeof(kaldi::BaseFloat));
VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1]
<< ", " << feats.shape()[2];
#ifdef TEST_DEBUG
{
std::stringstream path("feat", std::ios_base::app | std::ios_base::out);
path << offset_;
std::ofstream feat_fobj(path.str().c_str(), std::ios::out);
CHECK(feat_fobj.is_open());
// feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " "
// << feats.shape()[2] << "\n";
for (int i = 0; i < feats.numel(); i++) {
feat_fobj << std::setprecision(18) << feats_ptr[i] << " ";
if ((i + 1) % feat_dim == 0) {
feat_fobj << "\n";
}
}
feat_fobj << "\n";
}
#endif
// Encoder chunk forward
#ifdef USE_GPU
feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false);
att_cache_ = att_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false);
cnn_cache_ = cnn_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false);
#endif
int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16
// must be a scalar, but paddle does not have a scalar type.
paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32);
// `required_cache_size` is frozen into the graph, so it is not passed in the
// function call.
std::vector<paddle::Tensor> inputs = {
feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_};
VLOG(3) << "inputs size: " << inputs.size();
CHECK(inputs.size() == 4);
std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
VLOG(3) << "outputs size: " << outputs.size();
CHECK(outputs.size() == 3);
#ifdef USE_GPU
paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace());
att_cache_ = outputs[1].copy_to(paddle::CPUPlace());
cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace());
#else
paddle::Tensor chunk_out = outputs[0];
att_cache_ = outputs[1];
cnn_cache_ = outputs[2];
#endif
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_logits",
std::ios_base::app | std::ios_base::out);
auto i = offset_ - chunk_out.shape()[1];
path << std::max(i, 0L);
std::ofstream logits_fobj(path.str().c_str(), std::ios::out);
CHECK(logits_fobj.is_open());
logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1]
<< " " << chunk_out.shape()[2] << "\n";
const float* chunk_out_ptr = chunk_out.data<float>();
logits_fobj << chunk_out_ptr << std::endl;
for (int i = 0; i < chunk_out.numel(); i++) {
logits_fobj << chunk_out_ptr[i] << " ";
}
logits_fobj << "\n";
}
#endif // end TEST_DEBUG
// current offset in decoder frame
// not used in nnet
offset_ += chunk_out.shape()[1];
// collects encoder outs.
VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
encoder_outs_.push_back(chunk_out);
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_logits_list",
std::ios_base::app | std::ios_base::out);
path << offset_ - encoder_outs_[0].shape()[1];
std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
CHECK(logits_out_fobj.is_open());
logits_out_fobj << encoder_outs_[0].shape()[0] << " "
<< encoder_outs_[0].shape()[1] << " "
<< encoder_outs_[0].shape()[2] << "\n";
const float* encoder_outs_ptr = encoder_outs_[0].data<float>();
logits_out_fobj << encoder_outs_ptr << std::endl;
for (int i = 0; i < encoder_outs_[0].numel(); i++) {
logits_out_fobj << encoder_outs_ptr[i] << " ";
}
logits_out_fobj << "\n";
}
#endif // end TEST_DEBUG
#ifdef USE_GPU
#error "Not implementation."
#else
// compute ctc_activation == log_softmax
inputs.clear();
outputs.clear();
inputs.push_back(chunk_out);
CHECK(inputs.size() == 1);
outputs = ctc_activation_(inputs);
CHECK(outputs.size() == 1);
paddle::Tensor ctc_log_probs = outputs[0];
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_logprob",
std::ios_base::app | std::ios_base::out);
path << offset_ - chunk_out.shape()[1];
std::ofstream logprob_fobj(path.str().c_str(), std::ios::out);
CHECK(logprob_fobj.is_open());
logprob_fobj << ctc_log_probs.shape()[0] << " "
<< ctc_log_probs.shape()[1] << " "
<< ctc_log_probs.shape()[2] << "\n";
const float* logprob_ptr = ctc_log_probs.data<float>();
for (int i = 0; i < ctc_log_probs.numel(); i++) {
logprob_fobj << logprob_ptr[i] << " ";
if ((i + 1) % ctc_log_probs.shape()[2] == 0) {
logprob_fobj << "\n";
}
}
logprob_fobj << "\n";
}
#endif // end TEST_DEBUG
#endif // end USE_GPU
// Copy to output, (B=1,T,D)
std::vector<int64_t> ctc_log_probs_shape = ctc_log_probs.shape();
CHECK(ctc_log_probs_shape.size() == 3);
int B = ctc_log_probs_shape[0];
CHECK(B == 1);
int T = ctc_log_probs_shape[1];
int D = ctc_log_probs_shape[2];
*vocab_dim = D;
float* ctc_log_probs_ptr = ctc_log_probs.data<float>();
// // vector<vector<float>>
// out_prob->resize(T);
// for (int i = 0; i < T; i++) {
// (*out_prob)[i].resize(D);
// float* dst_ptr = (*out_prob)[i].data();
// float* src_ptr = ctc_log_probs_ptr + (i * D);
// std::memcpy(dst_ptr, src_ptr, D * sizeof(float));
// }
// CHECK(std::is_same<float, kaldi::BaseFloat>::value);
out_prob->resize(T * D);
std::memcpy(
out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat));
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_logits_list_ctc",
std::ios_base::app | std::ios_base::out);
path << offset_ - encoder_outs_[0].shape()[1];
std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
CHECK(logits_out_fobj.is_open());
logits_out_fobj << encoder_outs_[0].shape()[0] << " "
<< encoder_outs_[0].shape()[1] << " "
<< encoder_outs_[0].shape()[2] << "\n";
const float* encoder_outs_ptr = encoder_outs_[0].data<float>();
logits_out_fobj << encoder_outs_ptr << std::endl;
for (int i = 0; i < encoder_outs_[0].numel(); i++) {
logits_out_fobj << encoder_outs_ptr[i] << " ";
}
logits_out_fobj << "\n";
}
#endif // end TEST_DEBUG
return;
}
float U2Nnet::ComputePathScore(const paddle::Tensor& prob,
const std::vector<int>& hyp,
int eos) {
// sum `hyp` path scores in `prob`
// prob (1, Umax, V)
// hyp (U,)
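// i.e. score = sum_i prob[0, i, hyp[i]] + prob[0, len(hyp), eos]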
float score = 0.0f;
std::vector<int64_t> dims = prob.shape();
CHECK(dims.size() == 3);
VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2];
CHECK(dims[0] == 1);
int vocab_dim = static_cast<int>(dims[2]);
const float* prob_ptr = prob.data<float>();
for (size_t i = 0; i < hyp.size(); ++i) {
const float* row = prob_ptr + i * vocab_dim;
score += row[hyp[i]];
}
const float* row = prob_ptr + hyp.size() * vocab_dim;
score += row[eos];
return score;
}
void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) {
#ifdef USE_PROFILING
RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1);
#endif
CHECK(rescoring_score != nullptr);
int num_hyps = hyps.size();
rescoring_score->resize(num_hyps, 0.0f);
if (num_hyps == 0) return;
VLOG(2) << "num hyps: " << num_hyps;
if (encoder_outs_.size() == 0) {
// no encoder outs
std::cerr << "encoder_outs_.size() is zero. Please check it."
<< std::endl;
return;
}
// prepare input
paddle::Tensor hyps_lens =
paddle::zeros({num_hyps}, paddle::DataType::INT64);
int64_t* hyps_len_ptr = hyps_lens.mutable_data<int64_t>();
int max_hyps_len = 0;
for (size_t i = 0; i < num_hyps; ++i) {
int len = hyps[i].size() + 1; // eos
max_hyps_len = std::max(max_hyps_len, len);
hyps_len_ptr[i] = static_cast<int64_t>(len);
}
paddle::Tensor hyps_tensor =
paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64);
int64_t* hyps_ptr = hyps_tensor.mutable_data<int64_t>();
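// each row becomes [sos, hyp tokens ...], right-padded with eos_ up to max_hyps_len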
for (size_t i = 0; i < num_hyps; ++i) {
const std::vector<int>& hyp = hyps[i];
int64_t* row = hyps_ptr + max_hyps_len * i;
row[0] = sos_;
for (size_t j = 0; j < hyp.size(); ++j) {
row[j + 1] = hyp[j];
}
}
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_logits_concat",
std::ios_base::app | std::ios_base::out);
for (int j = 0; j < encoder_outs_.size(); j++) {
path << j;
std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
CHECK(logits_out_fobj.is_open());
logits_out_fobj << encoder_outs_[j].shape()[0] << " "
<< encoder_outs_[j].shape()[1] << " "
<< encoder_outs_[j].shape()[2] << "\n";
const float* encoder_outs_ptr = encoder_outs_[j].data<float>();
for (int i = 0; i < encoder_outs_[j].numel(); i++) {
logits_out_fobj << encoder_outs_ptr[i] << " ";
}
logits_out_fobj << "\n";
}
}
#endif // end TEST_DEBUG
// forward attention decoder by hyps and corresponding encoder_outs_
paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1);
VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_out0",
std::ios_base::app | std::ios_base::out);
std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out);
CHECK(encoder_out_fobj.is_open());
encoder_out_fobj << encoder_outs_[0].shape()[0] << " "
<< encoder_outs_[0].shape()[1] << " "
<< encoder_outs_[0].shape()[2] << "\n";
const float* enc_logprob_ptr = encoder_outs_[0].data<float>();
size_t size = encoder_outs_[0].numel();
for (int i = 0; i < size; i++) {
encoder_out_fobj << enc_logprob_ptr[i] << "\n";
}
}
#endif // end TEST_DEBUG
#ifdef TEST_DEBUG
{
std::stringstream path("encoder_out",
std::ios_base::app | std::ios_base::out);
std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out);
CHECK(encoder_out_fobj.is_open());
encoder_out_fobj << encoder_out.shape()[0] << " "
<< encoder_out.shape()[1] << " "
<< encoder_out.shape()[2] << "\n";
const float* enc_logprob_ptr = encoder_out.data<float>();
size_t size = encoder_out.numel();
for (int i = 0; i < size; i++) {
encoder_out_fobj << enc_logprob_ptr[i] << "\n";
}
}
#endif // end TEST_DEBUG
std::vector<paddle::experimental::Tensor> inputs{
hyps_tensor, hyps_lens, encoder_out};
std::vector<paddle::Tensor> outputs = forward_attention_decoder_(inputs);
CHECK(outputs.size() == 2);
// (B, Umax, V)
paddle::Tensor probs = outputs[0];
std::vector<int64_t> probs_shape = probs.shape();
CHECK(probs_shape.size() == 3);
CHECK(probs_shape[0] == num_hyps);
CHECK(probs_shape[1] == max_hyps_len);
#ifdef TEST_DEBUG
{
std::stringstream path("decoder_logprob",
std::ios_base::app | std::ios_base::out);
std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out);
CHECK(dec_logprob_fobj.is_open());
dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " "
<< probs.shape()[2] << "\n";
const float* dec_logprob_ptr = probs.data<float>();
size_t size = probs.numel();
for (int i = 0; i < size; i++) {
dec_logprob_fobj << dec_logprob_ptr[i] << "\n";
}
}
#endif // end TEST_DEBUG
#ifdef TEST_DEBUG
{
std::stringstream path("hyps_lens",
std::ios_base::app | std::ios_base::out);
std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out);
CHECK(hyps_len_fobj.is_open());
const int64_t* hyps_lens_ptr = hyps_lens.data<int64_t>();
size_t size = hyps_lens.numel();
for (int i = 0; i < size; i++) {
hyps_len_fobj << hyps_lens_ptr[i] << "\n";
}
}
#endif // end TEST_DEBUG
#ifdef TEST_DEBUG
{
std::stringstream path("hyps_tensor",
std::ios_base::app | std::ios_base::out);
std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out);
CHECK(hyps_tensor_fobj.is_open());
const int64_t* hyps_tensor_ptr = hyps_tensor.data<int64_t>();
size_t size = hyps_tensor.numel();
for (int i = 0; i < size; i++) {
hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n";
}
}
#endif // end TEST_DEBUG
paddle::Tensor r_probs = outputs[1];
std::vector<int64_t> r_probs_shape = r_probs.shape();
if (is_bidecoder_ && reverse_weight > 0) {
CHECK(r_probs_shape.size() == 3);
CHECK(r_probs_shape[0] == num_hyps);
CHECK(r_probs_shape[1] == max_hyps_len);
} else {
// dump r_probs
CHECK(r_probs_shape.size() == 1);
CHECK(r_probs_shape[0] == 1) << r_probs_shape[0];
}
// compute rescoring score
using IntArray = paddle::experimental::IntArray;
std::vector<paddle::Tensor> probs_v =
paddle::experimental::split_with_num(probs, num_hyps, 0);
VLOG(2) << "split prob: " << probs_v.size() << " "
<< probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0]
<< ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2];
CHECK(static_cast<int>(probs_v.size()) == num_hyps)
<< ": is " << probs_v.size() << " expect: " << num_hyps;
std::vector<paddle::Tensor> r_probs_v;
if (is_bidecoder_ && reverse_weight > 0) {
r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0);
CHECK(static_cast<int>(r_probs_v.size()) == num_hyps)
<< "r_probs_v size: is " << r_probs_v.size()
<< " expect: " << num_hyps;
}
for (int i = 0; i < num_hyps; ++i) {
const std::vector<int>& hyp = hyps[i];
// left-to-right decoder score
float score = 0.0f;
score = ComputePathScore(probs_v[i], hyp, eos_);
// right-to-left decoder score
float r_score = 0.0f;
if (is_bidecoder_ && reverse_weight > 0) {
std::vector<int> r_hyp(hyp.size());
std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin());
r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_);
}
// combined left-to-right and right-to-left score
(*rescoring_score)[i] =
score * (1 - reverse_weight) + r_score * reverse_weight;
VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score
<< " reverse_weight: " << reverse_weight;
}
}
} // namespace ppspeech
\ No newline at end of file
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/common.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
#include "nnet/nnet_itf.h"
#include "paddle/extension.h"
#include "paddle/jit/all.h"
#include "paddle/phi/api/all.h"
namespace ppspeech {
struct U2ModelOptions {
std::string model_path;
int thread_num;
bool use_gpu;
U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {}
void Register(kaldi::OptionsItf* opts) {
opts->Register("model-path", &model_path, "model file path");
opts->Register("thread-num", &thread_num, "thread num");
opts->Register("use-gpu", &use_gpu, "if use gpu");
}
};
class U2NnetBase : public NnetInterface {
public:
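// receptive field in feature frames: the current frame plus right_context_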
virtual int context() const { return right_context_ + 1; }
virtual int right_context() const { return right_context_; }
virtual int subsampling_rate() const { return subsampling_rate_; }
virtual int eos() const { return eos_; }
virtual int sos() const { return sos_; }
virtual int is_bidecoder() const { return is_bidecoder_; }
// current offset in decoder frame
virtual int offset() const { return offset_; }
virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; }
virtual void set_num_left_chunks(int num_left_chunks) {
num_left_chunks_ = num_left_chunks;
}
// start: false means this is the first chunk of a sentence; true otherwise
virtual int num_frames_for_chunk(bool start) const;
virtual std::shared_ptr<NnetInterface> Copy() const = 0;
virtual void ForwardEncoderChunk(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim);
virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) = 0;
protected:
virtual void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) = 0;
virtual void CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim);
protected:
// model specification
int right_context_{0};
int subsampling_rate_{1};
int sos_{0};
int eos_{0};
bool is_bidecoder_{false};
int chunk_size_{16};  // num of decoder frames. If chunk_size > 0, streaming
                      // case; otherwise, non-streaming case
int num_left_chunks_{-1}; // -1 means all left chunks
// asr decoder state
int offset_{0}; // current offset in encoder output time stamp. Used by
// position embedding.
std::vector<std::vector<float>> cached_feats_{}; // features cache
};
class U2Nnet : public U2NnetBase {
public:
U2Nnet(const U2ModelOptions& opts);
U2Nnet(const U2Nnet& other);
void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences,
int32* inference_dim) override;
void Reset() override;
void Dim();
void LoadModel(const std::string& model_path_w_prefix);
void Warmup();
std::shared_ptr<paddle::jit::Layer> model() const { return model_; }
std::shared_ptr<NnetInterface> Copy() const override;
void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) override;
float ComputePathScore(const paddle::Tensor& prob,
const std::vector<int>& hyp,
int eos);
void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) override;
// debug
void FeedEncoderOuts(paddle::Tensor& encoder_out);
private:
U2ModelOptions opts_;
phi::Place dev_;
std::shared_ptr<paddle::jit::Layer> model_{nullptr};
std::vector<paddle::Tensor> encoder_outs_;
// transformer/conformer attention cache
paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
// conformer-only conv_module cache
paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
paddle::jit::Function forward_encoder_chunk_;
paddle::jit::Function forward_attention_decoder_;
paddle::jit::Function ctc_activation_;
};
} // namespace ppspeech
\ No newline at end of file
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "nnet/u2_nnet.h"
#include "base/common.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
DEFINE_string(nnet_prob_wspecifier, "", "nnet prob wspecifier");
DEFINE_string(model_path, "", "paddle nnet model");
DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk");
DEFINE_int32(receptive_field_length,
             7,
             "receptive field of the two CNN(kernel=3) downsampling modules.");
DEFINE_int32(downsampling_rate,
             4,
             "downsampling rate of the two CNN(kernel=3) modules.");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
int32 num_done = 0, num_err = 0;
CHECK(FLAGS_feature_rspecifier.size() > 0);
CHECK(FLAGS_nnet_prob_wspecifier.size() > 0);
CHECK(FLAGS_model_path.size() > 0);
LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier;
LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier;
LOG(INFO) << "model path: " << FLAGS_model_path;
kaldi::SequentialBaseFloatMatrixReader feature_reader(
FLAGS_feature_rspecifier);
kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
ppspeech::U2ModelOptions model_opts;
model_opts.model_path = FLAGS_model_path;
int32 chunk_size =
(FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate +
FLAGS_receptive_field_length;
int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
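// e.g. with the defaults: chunk_size = (16 - 1) * 4 + 7 = 67 feat frames;
// chunk_stride = 4 * 16 = 64 feat frames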
int32 receptive_field_length = FLAGS_receptive_field_length;
LOG(INFO) << "chunk size (frame): " << chunk_size;
LOG(INFO) << "chunk stride (frame): " << chunk_stride;
LOG(INFO) << "receptive field (frame): " << receptive_field_length;
std::shared_ptr<ppspeech::U2Nnet> nnet(new ppspeech::U2Nnet(model_opts));
std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
std::shared_ptr<ppspeech::Decodable> decodable(
new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
kaldi::Timer timer;
for (; !feature_reader.Done(); feature_reader.Next()) {
string utt = feature_reader.Key();
kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
int nframes = feature.NumRows();
int feat_dim = feature.NumCols();
raw_data->SetDim(feat_dim);
LOG(INFO) << "utt: " << utt;
LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim;
// // pad feats
// int32 padding_len = 0;
// if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
// padding_len =
// chunk_stride - (feature.NumRows() - chunk_size) %
// chunk_stride;
// feature.Resize(feature.NumRows() + padding_len,
// feature.NumCols(),
// kaldi::kCopyData);
// }
int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
int32 frame_idx = 0;
std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
int32 ori_feature_len = feature.NumRows();
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
feat_dim);
int32 feature_chunk_size = 0;
if (ori_feature_len > chunk_idx * chunk_stride) {
feature_chunk_size = std::min(
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (feature_chunk_size < receptive_field_length) {
LOG(WARNING) << "utt: " << utt << " skip last "
<< feature_chunk_size << " frames, expect is "
<< receptive_field_length;
break;
}
int32 start = chunk_idx * chunk_stride;
for (int row_id = 0; row_id < chunk_size; ++row_id) {
kaldi::SubVector<kaldi::BaseFloat> feat_row(feature, start);
kaldi::SubVector<kaldi::BaseFloat> feature_chunk_row(
feature_chunk.Data() + row_id * feat_dim, feat_dim);
feature_chunk_row.CopyFromVec(feat_row);
++start;
}
// feat to frontend pipeline cache
raw_data->Accept(feature_chunk);
// send data finish signal
if (chunk_idx == num_chunks - 1) {
raw_data->SetFinished();
}
// get nnet outputs
vector<kaldi::BaseFloat> prob;
while (decodable->FrameLikelihood(frame_idx, &prob)) {
kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
std::memcpy(vec_tmp.Data(),
prob.data(),
sizeof(kaldi::BaseFloat) * prob.size());
prob_vec.push_back(vec_tmp);
frame_idx++;
}
}
// after process one utt, then reset decoder state.
decodable->Reset();
if (prob_vec.size() == 0) {
// the TokenWriter can not write empty string.
++num_err;
LOG(WARNING) << " the nnet prob of " << utt << " is empty";
continue;
}
// write nnet output
kaldi::MatrixIndexT nrow = prob_vec.size();
kaldi::MatrixIndexT ncol = prob_vec[0].Dim();
LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol;
kaldi::Matrix<kaldi::BaseFloat> result(nrow, ncol);
for (int32 row_idx = 0; row_idx < nrow; ++row_idx) {
for (int32 col_idx = 0; col_idx < ncol; ++col_idx) {
result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
}
}
nnet_out_writer.Write(utt, result);
++num_done;
}
double elapsed = timer.Elapsed();
LOG(INFO) << " cost:" << elapsed << " sec";
LOG(INFO) << "Done " << num_done << " utterances, " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
}
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_subdirectory(websocket)
# project(websocket)
add_library(websocket STATIC
websocket_server.cc
......
add_library(utils
file_utils.cc
math.cc
)
\ No newline at end of file
......@@ -38,11 +38,11 @@ float LogSumExp(float x, float y) {
template <typename T>
struct ValGreaterComp {
bool operator()(const std::pair<T, int32_t>& lhs,
const std::pair<T, int32_t>& rhs) const {
return lhs.first > rhs.first ||
(lhs.first == rhs.first && lhs.second < rhs.second);
}
};
template <typename T>
void TopK(const std::vector<T>& data,
......
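For reference, a minimal sketch of how `ValGreaterComp` can drive a top-k selection (the helper below is illustrative only, with an assumed name and signature, not the `TopK` declared above):

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// illustrative: pair each value with its index, then partial-sort with
// ValGreaterComp (value descending, ties broken by the smaller index)
template <typename T>
std::vector<std::pair<T, int32_t>> TopKPairs(const std::vector<T>& data,
                                             size_t k) {
    std::vector<std::pair<T, int32_t>> pairs;
    pairs.reserve(data.size());
    for (int32_t i = 0; i < static_cast<int32_t>(data.size()); ++i) {
        pairs.emplace_back(data[i], i);
    }
    k = std::min(k, pairs.size());
    std::partial_sort(
        pairs.begin(), pairs.begin() + k, pairs.end(), ValGreaterComp<T>());
    pairs.resize(k);
    return pairs;
}
```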
#!/bin/bash
set -ex
PYTHON=python3.7
test -d venv || virtualenv -p ${PYTHON} venv