linear-spectrogram-wo-db-norm-ol.cc 5.7 KB
Newer Older
Y
Yang Zhou 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// todo refactor, repalce with gtest

#include "base/flags.h"
#include "base/log.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"

#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/normalizer.h"

DEFINE_string(wav_rspecifier, "", "test wav scp path");
DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");

int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
    google::InitGoogleLogging(argv[0]);

    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
        FLAGS_wav_rspecifier);
    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);

    int32 num_done = 0, num_err = 0;

H
format  
Hui Zhang 已提交
45
    // feature pipeline: wave cache --> hanning window
H
Hui Zhang 已提交
46
    // -->linear_spectrogram --> global cmvn -> feat cache
Y
Yang Zhou 已提交
47 48 49 50 51 52 53 54 55 56 57 58

    std::unique_ptr<ppspeech::FrontendInterface> data_source(
        new ppspeech::AudioCache(3600 * 1600, true));

    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.frame_opts.dither = 0.0;
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
    opt.frame_opts.preemph_coeff = 0.0;
59
    LOG(INFO) << "linear feature: " << true;
Y
Yang Zhou 已提交
60 61 62 63 64 65 66 67 68
    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;

    std::unique_ptr<ppspeech::FrontendInterface> linear_spectrogram(
        new ppspeech::LinearSpectrogram(opt, std::move(data_source)));

    std::unique_ptr<ppspeech::FrontendInterface> cmvn(
        new ppspeech::CMVN(FLAGS_cmvn_file, std::move(linear_spectrogram)));

Y
Yang Zhou 已提交
69
    ppspeech::FeatureCacheOptions feat_cache_opts;
70 71 72 73 74
    // the feature cache output feature chunk by chunk.
    // frame_chunk_size : num frame of a chunk.
    // frame_chunk_stride: chunk sliding window stride.
    feat_cache_opts.frame_chunk_stride = 1;
    feat_cache_opts.frame_chunk_size = 1;
Y
Yang Zhou 已提交
75
    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
Y
Yang Zhou 已提交
76 77 78 79 80
    LOG(INFO) << "feat dim: " << feature_cache.Dim();

    int sample_rate = 16000;
    float streaming_chunk = FLAGS_streaming_chunk;
    int chunk_sample_size = streaming_chunk * sample_rate;
81
    LOG(INFO) << "sample rate: " << sample_rate;
Y
Yang Zhou 已提交
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
    LOG(INFO) << "chunk size (s): " << streaming_chunk;
    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;


    for (; !wav_reader.Done(); wav_reader.Next()) {
        std::string utt = wav_reader.Key();
        const kaldi::WaveData& wave_data = wav_reader.Value();
        LOG(INFO) << "process utt: " << utt;

        int32 this_channel = 0;
        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
                                                    this_channel);
        int tot_samples = waveform.Dim();
        LOG(INFO) << "wav len (sample): " << tot_samples;

        int sample_offset = 0;
        std::vector<kaldi::Vector<BaseFloat>> feats;
        int feature_rows = 0;
        while (sample_offset < tot_samples) {
            int cur_chunk_size =
                std::min(chunk_sample_size, tot_samples - sample_offset);

            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
            for (int i = 0; i < cur_chunk_size; ++i) {
                wav_chunk(i) = waveform(sample_offset + i);
            }

            kaldi::Vector<BaseFloat> features;
            feature_cache.Accept(wav_chunk);
            if (cur_chunk_size < chunk_sample_size) {
                feature_cache.SetFinished();
            }
114 115 116 117 118
            bool flag = true;
            do {
                flag = feature_cache.Read(&features);
                feats.push_back(features);
                feature_rows += features.Dim() / feature_cache.Dim();
H
Hui Zhang 已提交
119
            } while (flag == true && features.Dim() != 0);
Y
Yang Zhou 已提交
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
            sample_offset += cur_chunk_size;
        }

        int cur_idx = 0;
        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
                                                 feature_cache.Dim());
        for (auto feat : feats) {
            int num_rows = feat.Dim() / feature_cache.Dim();
            for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
                for (size_t col_idx = 0; col_idx < feature_cache.Dim();
                     ++col_idx) {
                    features(cur_idx, col_idx) =
                        feat(row_idx * feature_cache.Dim() + col_idx);
                }
                ++cur_idx;
            }
        }
        feat_writer.Write(utt, features);
        feature_cache.Reset();

        if (num_done % 50 == 0 && num_done != 0)
            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
        num_done++;
    }
    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
              << " with errors.";
    return (num_done != 0 ? 0 : 1);
}