Merge pull request #1616 from zh794390558/spx

[speechx] more comment of code

Merge pull request #1616 from zh794390558/spx
[speechx] more comment of code
b75268c5 · YangZhou · GitHub · 2ea578e8 · 84d712d4 · b75268c5
22 changed file
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@@ -20,12 +20,12 @@ of each audio file in the data set.
 """
 import argparse
 import codecs
-import distutils.util
 import io
 import json
 import os
 from multiprocessing.pool import Pool
+import distutils.util
 import soundfile
 from utils.utility import download

--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -16,8 +16,8 @@ import os
 import librosa
 import numpy as np
 from config import DEFAULT_TABLE
 from logs import LOGGER
 from paddlespeech.cli import VectorExecutor
 vector_executor = VectorExecutor()

--- a/demos/audio_searching/src/operations/load.py
+++ b/demos/audio_searching/src/operations/load.py
@@ -26,9 +26,8 @@ def get_audios(path):
    """
    supported_formats = [".wav", ".mp3", ".ogg", ".flac", ".m4a"]
    return [
-        item
+        item for sublist in [[os.path.join(dir, file) for file in files]
-        for sublist in [[os.path.join(dir, file) for file in files]
+                             for dir, _, files in list(os.walk(path))]
-                        for dir, _, files in list(os.walk(path))]
        for item in sublist if os.path.splitext(item)[1] in supported_formats
    ]

--- a/examples/ami/sd0/local/ami_prepare.py
+++ b/examples/ami/sd0/local/ami_prepare.py
@@ -24,11 +24,11 @@ import json
 import logging
 import os
 import xml.etree.ElementTree as et
-from distutils.util import strtobool
 from ami_splits import get_AMI_split
 from dataio import load_pkl
 from dataio import save_pkl
+from distutils.util import strtobool
 logger = logging.getLogger(__name__)
 SAMPLERATE = 16000

--- a/paddlespeech/s2t/decoders/recog_bin.py
+++ b/paddlespeech/s2t/decoders/recog_bin.py
@@ -17,10 +17,10 @@ import logging
 import os
 import random
 import sys
-from distutils.util import strtobool
 import configargparse
 import numpy as np
+from distutils.util import strtobool
 def get_parser():

--- a/paddlespeech/s2t/utils/cli_utils.py
+++ b/paddlespeech/s2t/utils/cli_utils.py
@@ -14,9 +14,9 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 import sys
 from collections.abc import Sequence
-from distutils.util import strtobool as dist_strtobool
 import numpy
+from distutils.util import strtobool as dist_strtobool
 def strtobool(x):

--- a/paddlespeech/s2t/utils/utility.py
+++ b/paddlespeech/s2t/utils/utility.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains common utility functions."""
-import distutils.util
 import math
 import os
 import random
@@ -21,6 +20,7 @@ from contextlib import contextmanager
 from pprint import pformat
 from typing import List
+import distutils.util
 import numpy as np
 import paddle
 import soundfile

--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
@@ -18,11 +18,11 @@ A few sklearn functions are modified in this script as per requirement.
 """
 import argparse
 import warnings
-from distutils.util import strtobool
 import numpy as np
 import scipy
 import sklearn
+from distutils.util import strtobool
 from scipy import sparse
 from scipy.sparse.csgraph import connected_components
 from scipy.sparse.csgraph import laplacian as csgraph_laplacian

--- a/speechx/README.md
+++ b/speechx/README.md
@@ -5,7 +5,7 @@
 We develop under:
 * docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7
 * os - Ubuntu 16.04.7 LTS
-* gcc/g++ - 8.2.0
+* **gcc/g++/gfortran - 8.2.0**
 * cmake - 3.16.0
 > We make sure all things work fun under docker, and recommend using it to develop and deploy.
@@ -29,6 +29,8 @@ nvidia-docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspac
 2. Build `speechx` and `examples`.
+> Do not source venv.
 ```
 pushd /path/to/speechx
 ./build.sh

--- a/speechx/build.sh
+++ b/speechx/build.sh
@@ -2,8 +2,7 @@
 # the build script had verified in the paddlepaddle docker image.
 # please follow the instruction below to install PaddlePaddle image.
-# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
+# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html 
 boost_SOURCE_DIR=$PWD/fc_patch/boost-src
 if [ ! -d ${boost_SOURCE_DIR} ]; then wget -c https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz 
  tar xzfv boost_1_75_0.tar.gz
@@ -23,6 +22,6 @@ cd build
 cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
 #cmake .. 
-make -j1
+make -j10
 cd -
--- a/speechx/cmake/FindGFortranLibs.cmake
+++ b/speechx/cmake/FindGFortranLibs.cmake
+#.rst:
+# FindGFortranLibs
+# --------
+#  https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
+#  https://enccs.github.io/cmake-workshop/cxx-fortran/
+#
+# Find gcc Fortran compiler & library paths
+#
+# The module defines the following variables:
+#
+# ::
+#
+#
+#   GFORTRANLIBS_FOUND - true if system has gfortran
+#   LIBGFORTRAN_LIBRARIES - path to libgfortran
+#   LIBQUADMATH_LIBRARIES - path to libquadmath
+#   GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
+#   GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
+#   LIBGOMP_LIBRARIES - path to libgomp
+#   LIBGOMP_INCLUDE_DIR - directory containing omp.h header
+#   GFORTRAN_VERSION_STRING - version of gfortran found
+#
+# Honor the QUIET option of find_package(GFortranLibs ...). CMake exposes it
+# as <PackageName>_FIND_QUIETLY, where the package name is the one passed to
+# find_package_handle_standard_args() further down in this module.
+set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})
+if(NOT CMAKE_REQUIRED_QUIET)
+  message(STATUS "Looking for gfortran related libraries...")
+endif()
+enable_language(Fortran)
+if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
+  # Basically, call "gfortran -v" to dump compiler info to the string
+  # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths
+  message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
+  execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
+    GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
+  # For debugging
+  message(STATUS "'gfortran -v' returned:")
+  message(STATUS "${GFORTRAN_VERBOSE_STR}")
+  # Detect gfortran version
+  string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
+  string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
+  message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
+  unset(GFORTRAN_VER_STR)
+  set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
+  set(REPLACE_REGEX "([^\t\n ]+)")
+  # Find architecture for compiler
+  string(REGEX MATCH "Target: [^\t\n ]+"
+    GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
+  message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
+  string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
+    GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
+  message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
+  unset(GFORTRAN_ARCH_STR)
+  # Find install prefix, if it exists; if not, use default.
+  # The pattern stops at the first whitespace character so that the captured
+  # directory carries no trailing blank or newline into the paths that are
+  # later built from it.
+  string(REGEX MATCH "--prefix=[^\t\n ]+"
+    GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_PREFIX_STR)
+    message(STATUS "Detected default gfortran prefix")
+    set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
+  else()
+    # Keep only the directory part of "--prefix=<dir>".
+    string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
+      GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
+  endif()
+  message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
+  unset(GFORTRAN_PREFIX_STR)
+  # Find install exec-prefix, if it exists; if not, fall back to the prefix.
+  # string(REGEX MATCH) takes <regex> <output-variable> <input>...; it has no
+  # replacement-expression argument, so none is passed here. The pattern stops
+  # at the first whitespace character to keep the captured directory clean.
+  string(REGEX MATCH "--exec-prefix=[^\t\n ]+"
+    GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_EXEC_PREFIX_STR)
+    message(STATUS "Detected default gfortran exec-prefix")
+    set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
+  else()
+    # Keep only the directory part of "--exec-prefix=<dir>".
+    string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
+      GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
+  endif()
+  message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
+  unset(GFORTRAN_EXEC_PREFIX_STR)
+  # Find library directory and include directory, if library directory specified
+  # Look for an explicit --libdir=<dir> option in the compiler's verbose output.
+  string(REGEX MATCH "--libdir=[^\t\n ]+"
+    GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_LIB_DIR_STR)
+    # No --libdir: build both paths from the exec-prefix, the target
+    # architecture and the compiler version:
+    #   libraries: <exec-prefix>/lib/gcc/<arch>/<version>
+    #   headers:   <exec-prefix>/lib/gcc/<arch>/<version>/include
+    message(STATUS "Found --libdir flag -- not found")
+    message(STATUS "Using default gfortran library & include directory paths")
+    set(GFORTRAN_LIBRARIES_DIR
+      "${GFORTRAN_EXEC_PREFIX_DIR}/lib/gcc/${GFORTRAN_ARCH}/${GFORTRAN_VERSION_STRING}")
+    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/include")
+  else()
+    # --libdir given: libraries come from that directory itself, headers from
+    #   <libdir>/gcc/<arch>/<version>/include
+    message(STATUS "Found --libdir flag -- yes")
+    string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
+      GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
+    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
+  endif()
+  message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
+  message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
+  # The raw match is no longer needed once both directories are set.
+  unset(GFORTRAN_LIB_DIR_STR)
+  # There are lots of other build options for gcc & gfortran. For now, the
+  # options implemented above should cover a lot of common use cases.
+  # Clean up by deleting the output string from "gfortran -v"
+  unset(GFORTRAN_VERBOSE_STR)
+  # Find paths for libgfortran, libquadmath, libgomp
+  # libgomp needed for OpenMP support without Clang
+  find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  # Find OpenMP headers
+  find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
+else()
+  message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
+endif()
+include(FindPackageHandleStandardArgs)
+# Required: libgfortran, libquadmath, path for gfortran libraries
+# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
+find_package_handle_standard_args(GFortranLibs
+  REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR
+  VERSION_VAR GFORTRAN_VERSION_STRING)
+if(GFORTRANLIBS_FOUND)
+  message(STATUS "Looking for gfortran libraries -- found")
+  message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
+else()
+  message(STATUS "Looking for gfortran libraries -- not found")
+endif()
+mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES
+  LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR
+  GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR)
+# FindGFortranLibs.cmake ends here
\ No newline at end of file
--- a/speechx/cmake/external/openblas.cmake
+++ b/speechx/cmake/external/openblas.cmake
@@ -7,6 +7,27 @@ set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix)
 # OPENBLAS  https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575
 # ######################################################################################################################
 enable_language(Fortran)
+include(FortranCInterface)
+# # Clang doesn't have a Fortran compiler in its suite (yet),
+# # so detect libraries for gfortran; we need equivalents to
+# # libgfortran and libquadmath, which are implicitly
+# # linked by flags in CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES
+# include(FindGFortranLibs REQUIRED)
+# # Add directory containing libgfortran and libquadmath to
+# # linker. Should also contain libgomp, if not using
+# # Intel OpenMP runtime
+# link_directories(${GFORTRAN_LIBRARIES_DIR})
+# # gfortran dir in the docker.
+# link_directories(/usr/local/gcc-8.2/lib64)
+# # if you are working with C and Fortran
+# FortranCInterface_VERIFY()
+# # if you are working with C++ and Fortran
+# FortranCInterface_VERIFY(CXX)
 #TODO: switch to CPM
 include(GNUInstallDirs)
 ExternalProject_Add(

--- a/speechx/cmake/external/openfst.cmake
+++ b/speechx/cmake/external/openfst.cmake
 include(FetchContent)
+set(openfst_PREFIX_DIR ${fc_patch}/openfst)
 set(openfst_SOURCE_DIR ${fc_patch}/openfst-src)
 set(openfst_BINARY_DIR ${fc_patch}/openfst-build)
 ExternalProject_Add(openfst
  URL               https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip
  URL_HASH          SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6
-#   #PREFIX            ${openfst_PREFIX_DIR} 
+  PREFIX            ${openfst_PREFIX_DIR} 
-#   SOURCE_DIR        ${openfst_SOURCE_DIR}
+  SOURCE_DIR        ${openfst_SOURCE_DIR}
-#   BINARY_DIR        ${openfst_BINARY_DIR}
+  BINARY_DIR        ${openfst_BINARY_DIR}
  CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR}
                      "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
                      "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
@@ -16,4 +17,4 @@ ExternalProject_Add(openfst
  BUILD_COMMAND     make -j 4
 )
 link_directories(${openfst_PREFIX_DIR}/lib)
 include_directories(${openfst_PREFIX_DIR}/include)
\ No newline at end of file
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@@ -41,6 +41,7 @@ void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
 // pop feature chunk
 bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    kaldi::Timer timer;
    std::unique_lock<std::mutex> lock(mutex_);
    while (cache_.empty() && base_extractor_->IsFinished() == false) {
        ready_read_condition_.wait(lock);
@@ -64,10 +65,13 @@ bool FeatureCache::Compute() {
    // compute and feed
    Vector<BaseFloat> feature_chunk;
    bool result = base_extractor_->Read(&feature_chunk);
    std::unique_lock<std::mutex> lock(mutex_);
    while (cache_.size() >= max_size_) {
        ready_feed_condition_.wait(lock);
    }
+    // feed cache
    if (feature_chunk.Dim() != 0) {
        cache_.push(feature_chunk);
    }

--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -24,17 +24,24 @@ class FeatureCache : public FeatureExtractorInterface {
    explicit FeatureCache(
        int32 max_size = kint16max,
        std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
+    // Feed feats or waves
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    // feats dim = num_frames * feature_dim
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // feature cache only cache feature which from base extractor
    virtual size_t Dim() const { return base_extractor_->Dim(); }
    virtual void SetFinished() {
        base_extractor_->SetFinished();
        // read the last chunk data
        Compute();
    }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() {
        base_extractor_->Reset();
        while (!cache_.empty()) {
@@ -45,12 +52,14 @@ class FeatureCache : public FeatureExtractorInterface {
  private:
    bool Compute();
-    std::mutex mutex_;
    size_t max_size_;
-    std::queue<kaldi::Vector<BaseFloat>> cache_;
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
+    std::mutex mutex_;
+    std::queue<kaldi::Vector<BaseFloat>> cache_;
    std::condition_variable ready_feed_condition_;
    std::condition_variable ready_read_condition_;
    // DISALLOW_COPY_AND_ASSGIN(FeatureCache);
 };

--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ -21,17 +21,26 @@ namespace ppspeech {
 class FeatureExtractorInterface {
  public:
-    // accept input data, accept feature or raw waves which decided
+    // Feed inputs: features(2D saved in 1D) or waveforms(1D).
-    // by the base_extractor
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
-    // get the processed result
-    // the length of output = feature_row * feature_dim,
+    // Fetch processed data: features or waveforms.
-    // the Matrix is squashed into Vector
+    // For features(2D saved in 1D), the Matrix is squashed into Vector,
+    //    the length of output = feature_row * feature_dim.
+    // For waveforms(1D), samples saved in vector.
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs) = 0;
-    // the Dim is the feature dim
+    // Dim is the feature dim. For waveforms(1D), Dim is zero; otherwise it is
+    // feature-specific, e.g. 80 for fbank.
    virtual size_t Dim() const = 0;
+    // End Flag for Streaming Data.
    virtual void SetFinished() = 0;
+    // Whether the end of the streaming data has been reached.
    virtual bool IsFinished() const = 0;
+    // Reset to start state.
    virtual void Reset() = 0;
 };

--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@@ -23,12 +23,14 @@ namespace ppspeech {
 struct LinearSpectrogramOptions {
    kaldi::FrameExtractionOptions frame_opts;
-    kaldi::BaseFloat streaming_chunk;
+    kaldi::BaseFloat streaming_chunk;  // second
    LinearSpectrogramOptions() : streaming_chunk(0.36), frame_opts() {}
    void Register(kaldi::OptionsItf* opts) {
-        opts->Register(
+        opts->Register("streaming-chunk",
-            "streaming-chunk", &streaming_chunk, "streaming chunk size");
+                       &streaming_chunk,
+                       "streaming chunk size, default: 0.36 sec");
        frame_opts.Register(opts);
    }
 };

--- a/utils/DER.py
+++ b/utils/DER.py
@@ -26,9 +26,9 @@ import argparse
 import os
 import re
 import subprocess
-from distutils.util import strtobool
 import numpy as np
+from distutils.util import strtobool
 FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
 SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")

--- a/utils/addjson.py
+++ b/utils/addjson.py
@@ -10,8 +10,8 @@ import codecs
 import json
 import logging
 import sys
-from distutils.util import strtobool
+from distutils.util import strtobool
 from espnet.utils.cli_utils import get_commandline_args
 is_python2 = sys.version_info[0] == 2

--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
 #!/usr/bin/env python3
 import argparse
 import logging
-from distutils.util import strtobool
 import kaldiio
 import numpy
+from distutils.util import strtobool
 from paddlespeech.s2t.transform.cmvn import CMVN
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper

--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
 #!/usr/bin/env python3
 import argparse
 import logging
 from distutils.util import strtobool
 from paddlespeech.s2t.transform.transformation import Transformation

--- a/utils/merge_scp2json.py
+++ b/utils/merge_scp2json.py
@@ -5,9 +5,10 @@ import codecs
 import json
 import logging
 import sys
-from distutils.util import strtobool
 from io import open
+from distutils.util import strtobool
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 PY2 = sys.version_info[0] == 2