diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh
deleted file mode 100755
index a9ae937d2a54fe4a4d5663b2f673e9ce49d3ea1c..0000000000000000000000000000000000000000
--- a/demos/speech_recognition/run.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
-
-# asr
-export CUDA_VISIBLE_DEVICES=0
-paddlespeech asr --input audio/119994.wav -v
-
-
-# asr + punc
-# paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
\ No newline at end of file
diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
index 6fccd19547b6c51d3bf33721802e42a174081296..d3992cb9fc3e46b2a8779d4e276223e1477570af 100644
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@@ -5,7 +5,7 @@ process:
     n_mels: 80
     n_shift: 160
     win_length: 400
-    dither: 0.0
+    dither: 0.1
   - type: cmvn_json
     cmvn_path: data/mean_std.json
   # these three processes are a.k.a. SpecAugument
diff --git a/examples/aishell/asr1/conf/tuning/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml
index f0a5ba6b5775f5f69e60112f0404a33acfc8d0a4..72ede9272b190289daac57946c7e44d5407ef0b0 100644
--- a/examples/aishell/asr1/conf/tuning/decode.yaml
+++ b/examples/aishell/asr1/conf/tuning/decode.yaml
@@ -3,9 +3,9 @@ decode_batch_size: 128
 error_rate_type: cer
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-decoding_chunk_size: 1 # decoding chunk size. Defaults to -1.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
        # <0: for decoding, use full chunk.
        # >0: for decoding, use fixed chunk size as set.
        # 0: used for training, it's prohibited here.
 num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-simulate_streaming: True # simulate streaming inference. Defaults to False.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index be7116a750e43f3d9ba84084d11467bfccbd6026..c54dae9cffd7bf3f1ad86833200531d64a4c6397 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -3,12 +3,12 @@
 source path.sh
 set -e
 
 gpus=0,1,2,3
-stage=5
-stop_stage=5
-conf_path=conf/chunk_conformer.yaml
+stage=0
+stop_stage=50
+conf_path=conf/conformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
-audio_file=audio/zh.wav
+audio_file=data/demo_01_03.wav
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -44,7 +44,7 @@ fi
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/chunk_conformer/checkpoints/multi_cn ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
 
 # Not supported at now!!!
diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py
index 92c1df7c4085d60ac2ecc25a22b80ef270bd51da..b781c4a8e5cc99590e179faf1c4c3989349d4216 100644
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -14,6 +14,3 @@
 import _locale
 
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
-
-
-
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 6bc86d8f8756a9b1b4f044abeeb7e8a16bf1f87f..86c3db89f82878e093d71ec9a56740fb05767b7b 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -128,12 +128,10 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     config = CfgNode(new_allowed=True)
-    
+
     if args.config:
-        print(f"load config: {args.config}")
         config.merge_from_file(args.config)
     if args.decode_cfg:
-        print(f"load decode cfg: {args.decode_cfg}")
         decode_confs = CfgNode(new_allowed=True)
         decode_confs.merge_from_file(args.decode_cfg)
         config.decode = decode_confs
diff --git a/paddlespeech/server/conf/ws_application.yaml b/paddlespeech/server/conf/ws_application.yaml
index b2eaf50019baef1bec53fabb16646c3b1e02a878..b958bdf69668382adceb10ea90aaf7cde7e0fe4f 100644
--- a/paddlespeech/server/conf/ws_application.yaml
+++ b/paddlespeech/server/conf/ws_application.yaml
@@ -4,7 +4,7 @@
 #                                SERVER SETTING                                 #
 #################################################################################
 host: 0.0.0.0
-port: 8096
+port: 8090
 
 # The task format in the engin_list is: <speech task>_<engine type>
 # task choices = ['asr_online', 'tts_online']
diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py
index 58506606edce48d635ec84e3108fb83320bf7480..661eb4dd9eb2c62a300343d19368b91629c43f31 100644
--- a/paddlespeech/server/tests/asr/online/websocket_client.py
+++ b/paddlespeech/server/tests/asr/online/websocket_client.py
@@ -105,7 +105,7 @@ class ASRAudioHandler:
 def main(args):
     logging.basicConfig(level=logging.INFO)
     logging.info("asr websocket client start")
-    handler = ASRAudioHandler("127.0.0.1", 8096)
+    handler = ASRAudioHandler("127.0.0.1", 8090)
     loop = asyncio.get_event_loop()
 
     # support to process single audio file
diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py
index 65c04f67fc9524315558d46625446c7976725911..ad4a1124ec01698245ee83803c8573835ab06214 100644
--- a/paddlespeech/server/ws/asr_socket.py
+++ b/paddlespeech/server/ws/asr_socket.py
@@ -93,7 +93,6 @@ async def websocket_endpoint(websocket: WebSocket):
             sample_rate = asr_engine.config.sample_rate
             x_chunk, x_chunk_lens = asr_engine.preprocess(samples, sample_rate)
-            print(x_chunk_lens)
             asr_engine.run(x_chunk, x_chunk_lens)
             asr_results = asr_engine.postprocess()
             asr_results = asr_engine.postprocess()
diff --git a/paddlespeech/vector/cluster/diarization.py b/paddlespeech/vector/cluster/diarization.py
index 5b215725761849398ff70235b48fd55d5be8c699..816ab0dee68bca949051c390051750f739a9c413 100644
--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
@@ -18,11 +18,11 @@ A few sklearn functions are modified in this script as per requirement.
""" import argparse import warnings +from distutils.util import strtobool import numpy as np import scipy import sklearn -from distutils.util import strtobool from scipy import sparse from scipy.sparse.csgraph import connected_components from scipy.sparse.csgraph import laplacian as csgraph_laplacian diff --git a/setup.py b/setup.py index 9a8bb66bb0d93d24ee172841512e53ad2edb1ee5..82ff6341265a407c62b2599e6e73493c8b9087e1 100644 --- a/setup.py +++ b/setup.py @@ -168,7 +168,7 @@ class DevelopCommand(develop): def run(self): develop.run(self) # must after develop.run, or pkg install by shell will not see - # self.execute(_post_install, (self.install_lib, ), msg="Post Install...") + self.execute(_post_install, (self.install_lib, ), msg="Post Install...") class InstallCommand(install): diff --git a/utils/DER.py b/utils/DER.py index 59bcbec473489e0f7930caed5838c152d3d0f874..d6ab695d8f498dd9aafebe6b43b645cc5de709e3 100755 --- a/utils/DER.py +++ b/utils/DER.py @@ -26,9 +26,9 @@ import argparse import os import re import subprocess +from distutils.util import strtobool import numpy as np -from distutils.util import strtobool FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)") SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")