Merge pull request #1096 from Jackwaterveg/setup

[Setup] Separate sox and soxbindings from the requirements

Merge pull request #1096 from Jackwaterveg/setup
[Setup] Separate sox and soxbindings from the requirements
1ac9e781 · Hui Zhang · GitHub · 63e69997 · 5adaaaea · 1ac9e781
Showing with 47 additions and 23 deletions

paddlespeech/s2t/frontend/audio.py paddlespeech/s2t/frontend/audio.py +25 -13

paddlespeech/s2t/transform/perturb.py paddlespeech/s2t/transform/perturb.py +20 -8

setup.py setup.py +2 -2

未找到文件。
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@@ -21,7 +21,6 @@ import struct
 import numpy as np
 import resampy
 import soundfile
-import soxbindings as sox
 from scipy import signal
 from .utility import convert_samples_from_float32
@@ -98,7 +97,7 @@ class AudioSegment():
        :param file: Input audio filepath or file object.
        :type file: str|file
        :param start: Start time in seconds. If start is negative, it wraps
-                      around from the end. If not provided, this function 
+                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
@@ -199,7 +198,7 @@ class AudioSegment():
    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.
        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :return: Audio segment instance.
@@ -217,7 +216,7 @@ class AudioSegment():
        :type *segments: tuple of AudioSegment
        :return: Audio segment instance as concatenating results.
        :rtype: AudioSegment
-        :raises ValueError: If the number of segments is zero, or if the 
+        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any segments does not match.
        :raises TypeError: If any segment is not AudioSegment instance.
        """
@@ -251,7 +250,7 @@ class AudioSegment():
    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.
        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: str|file
@@ -297,7 +296,7 @@ class AudioSegment():
    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.
        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
@@ -309,7 +308,7 @@ class AudioSegment():
    def to(self, dtype='int16'):
        """Create a `dtype` audio content.
        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
@@ -323,8 +322,8 @@ class AudioSegment():
        """Apply gain in decibels to samples.
        Note that this is an in-place transformation.
-        :param gain: Gain in decibels to apply to samples. 
+        :param gain: Gain in decibels to apply to samples.
        :type gain: float|1darray
        """
        self._samples *= 10.**(gain / 20.)
@@ -333,7 +332,7 @@ class AudioSegment():
        """Change the audio speed by linear interpolation.
        Note that this is an in-place transformation.
        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
@@ -355,6 +354,19 @@ class AudioSegment():
        # self._samples = np.interp(new_indices, old_indices, self._samples)
        # sox, slow
+        try:
+            import soxbindings as sox
+        except:
+            try:
+                from paddlespeech.s2t.utils import dynamic_pip_install
+                package = "sox"
+                dynamic_pip_install.install(package)
+                package = "soxbindings"
+                dynamic_pip_install.install(package)
+                import soxbindings as sox
+            except:
+                raise RuntimeError("Can not install soxbindings on your system." )
        tfm = sox.Transformer()
        tfm.set_globals(multithread=False)
        tfm.speed(speed_rate)
@@ -405,7 +417,7 @@ class AudioSegment():
        :param prior_samples: Prior strength in number of samples.
        :type prior_samples: float
        :param startup_delay: Default 0.0s. If provided, this function will
-                              accrue statistics for the first startup_delay 
+                              accrue statistics for the first startup_delay
                              seconds before applying online normalization.
        :type startup_delay: float
        """
@@ -557,7 +569,7 @@ class AudioSegment():
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
-                               the impulse_segment has a different sample 
+                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        :raises ValueError: If the sample rate is not match between two
@@ -695,7 +707,7 @@ class AudioSegment():
    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.
        Audio sample type is usually integer or float-point. For integer
        type, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.

--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -16,7 +16,6 @@ import librosa
 import numpy
 import scipy
 import soundfile
-import soxbindings as sox
 from paddlespeech.s2t.io.reader import SoundHDF5File
@@ -115,10 +114,10 @@ class SpeedPerturbationSox():
    and sox-speed just to resample the input,
    i.e pitch and tempo are changed both.
-    To speed up or slow down the sound of a file, 
+    To speed up or slow down the sound of a file,
-    use speed to modify the pitch and the duration of the file. 
+    use speed to modify the pitch and the duration of the file.
-    This raises the speed and reduces the time. 
+    This raises the speed and reduces the time.
-    The default factor is 1.0 which makes no change to the audio. 
+    The default factor is 1.0 which makes no change to the audio.
    2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher.
    "Why use speed option instead of tempo -s in SoX for speed perturbation"
@@ -130,7 +129,7 @@ class SpeedPerturbationSox():
    speed option:
    sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9
-    If we use speed option like above, the pitch of audio also will be changed, 
+    If we use speed option like above, the pitch of audio also will be changed,
    but the tempo option does not change the pitch.
    """
@@ -146,6 +145,19 @@ class SpeedPerturbationSox():
        self.keep_length = keep_length
        self.state = numpy.random.RandomState(seed)
+        try:
+            import soxbindings as sox
+        except:
+            try:
+                from paddlespeech.s2t.utils import dynamic_pip_install
+                package = "sox"
+                dynamic_pip_install.install(package)
+                package = "soxbindings"
+                dynamic_pip_install.install(package)
+                import soxbindings as sox
+            except:
+                raise RuntimeError("Can not install soxbindings on your system." )
        if utt2ratio is not None:
            self.utt2ratio = {}
            # Use the scheduled ratio for each utterances
@@ -168,8 +180,8 @@ class SpeedPerturbationSox():
    def __repr__(self):
        if self.utt2ratio is None:
            return f"""{self.__class__.__name__}(
-                lower={self.lower}, 
+                lower={self.lower},
-                upper={self.upper}, 
+                upper={self.upper},
                keep_length={self.keep_length},
                sample_rate={self.sr})"""

--- a/setup.py
+++ b/setup.py
@@ -55,8 +55,6 @@ requirements = {
        "scipy",
        "sentencepiece~=0.1.96",
        "soundfile~=0.10",
-        "sox",
-        "soxbindings",
        "textgrid",
        "timer",
        "tqdm",
@@ -74,6 +72,8 @@ requirements = {
        "Pillow",
        "pybind11",
        "snakeviz",
+        "sox",
+        "soxbindings",
        "unidecode",
        "yq",
        "pre-commit",