diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md
index 1144cbb1f98c630f624a41e15d5e4a75044d7ab2..9031c2fe94cd5a61058ce33c9e68386790d0c245 100644
--- a/demos/audio_tagging/README.md
+++ b/demos/audio_tagging/README.md
@@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`).
 
 Here are sample files for this demo that can be downloaded:
 ```bash
-wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
 ```
 
 ### 3. Usage
diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb900a6da99b3cc1aa941f989892263ea08b8f4f
--- /dev/null
+++ b/demos/automatic_video_subtitiles/README.md
@@ -0,0 +1,52 @@
+# Automatic Video Subtitles
+
+## Introduction
+Automatic video subtitling generates subtitles for a given video by using an Automatic Speech Recognition (ASR) system.
+
+This demo generates subtitles from a video file; it can be done with a single command or a few lines in Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```bash
+pip install paddlespeech
+```
+
+### 2. Prepare Input
+Get a video file with speech in the specific language:
+```bash
+wget -c https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4
+```
+
+Extract a one-channel, 16000 Hz sample rate `.wav` from the video:
+```bash
+ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
+```
+
+
+### 3. Usage
+
+- Python API
+  ```python
+  import paddle
+  from paddlespeech.cli import ASRExecutor, TextExecutor
+
+  asr_executor = ASRExecutor()
+  text_executor = TextExecutor()
+
+  text = asr_executor(
+      audio_file='input.wav',
+      device=paddle.get_device())
+  result = text_executor(
+      text=text,
+      task='punc',
+      model='ernie_linear_p3_wudao',
+      device=paddle.get_device())
+  print('Text Result: \n{}'.format(result))
+  ```
+  Output:
+  ```bash
+  Text Result:
+  当我说我可以把三十年的经验变成一个准确的算法,他们说不可能。当我说我们十个人就能实现对十九个城市变电站七乘二十四小时的实时监管,他们说不可能。
+  ```
+
+automatic_video_subtitiles
diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py
new file mode 100644
index 0000000000000000000000000000000000000000..72e3c3a8597283916762d92dcc9f5e95c6261d2f
--- /dev/null
+++ b/demos/automatic_video_subtitiles/recognize.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+
+from paddlespeech.cli import ASRExecutor
+from paddlespeech.cli import TextExecutor
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--input", type=str, required=True)
+parser.add_argument("--device", type=str, default=paddle.get_device())
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == "__main__":
+    asr_executor = ASRExecutor()
+    text_executor = TextExecutor()
+
+    text = asr_executor(
+        audio_file=os.path.abspath(os.path.expanduser(args.input)),
+        device=args.device)
+    result = text_executor(
+        text=text,
+        task='punc',
+        model='ernie_linear_p3_wudao',
+        device=args.device)
+
+    print('ASR Result: \n{}'.format(text))
+    print('Text Result: \n{}'.format(result))
diff --git a/demos/automatic_video_subtitiles/run.sh b/demos/automatic_video_subtitiles/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9b9fd2ccc82d72312503e1884714d6ed44f2d8fc
--- /dev/null
+++ b/demos/automatic_video_subtitiles/run.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+video_url=https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4
+video_file=$(basename ${video_url})
+audio_file=$(echo ${video_file} | awk -F'.' '{print $1}').wav
+num_channels=1
+sr=16000
+
+# Download video
+if [ ! -f ${video_file} ]; then
+    wget -c ${video_url}
+fi
+
+# Extract audio from video
+if [ ! -f ${audio_file} ]; then
+    ffmpeg -i ${video_file} -ac ${num_channels} -ar ${sr} -vn ${audio_file}
+fi
+
+python -u recognize.py --input ${audio_file}
+exit 0
diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md
index 18d462d4c54ef75a091876a496e59a2feefe2ef2..d55d069a17cade4a86e44344b91c07e57725229b 100644
--- a/demos/punctuation_restoration/README.md
+++ b/demos/punctuation_restoration/README.md
@@ -27,7 +27,7 @@ Input of this demo should be a text of the specific language that can be passed
   Arguments:
   - `input`(required): Input raw text.
   - `task`: Choose subtask. Default: `punc`.
-  - `model`: Model type of text task. Default: `ernie_linear_wudao`.
+  - `model`: Model type of text task. Default: `ernie_linear_p7_wudao`.
   - `lang`: Choose model language.. Default: `zh`.
   - `config`: Config of text task. Use pretrained model when it is None. Default: `None`.
   - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
@@ -49,7 +49,7 @@ Input of this demo should be a text of the specific language that can be passed
   result = text_executor(
       text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭',
       task='punc',
-      model='ernie_linear_wudao',
+      model='ernie_linear_p7_wudao',
       lang='zh',
       config=None,
       ckpt_path=None,
@@ -68,6 +68,8 @@ Input of this demo should be a text of the specific language that can be passed
 
 Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api:
 
-| Model | Task | Language
-| :--- | :---: | :---:
-| ernie_linear_wudao| punc(Punctuation Restoration) | zh
+- Punctuation Restoration
+  | Model | Language | Number of Punctuation Characters
+  | :--- | :---: | :---:
+  | ernie_linear_p3_wudao| zh | 3(,。?)
+  | ernie_linear_p7_wudao| zh | 7(,。!?、:;)
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index c911653156c0bf6150f6a02530b57bf54b3d3c86..e13434649845c33fa0977004171878387b490000 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`), and the sample rate must be sam
 
 Here are sample files for this demo that can be downloaded:
 ```bash
-wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 ```
 
 ### 3. Usage
diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md
index caca05dd16543a3625006d07497c5893d3688147..e3fa18c61533d7f220acb6aad9768c5c0b2bcc4c 100644
--- a/demos/speech_translation/README.md
+++ b/demos/speech_translation/README.md
@@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`).
 
 Here are sample files for this demo that can be downloaded:
 ```bash
-wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 ```
 
 ### 3. Usage (not support for Windows now)
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index 13d170f57f5852f710109aff5c2987cb525aae5a..da9c5fe05710d9262e8ab8461020820720ed481d 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -34,9 +34,9 @@ pretrained_models = {
     # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
     # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
     # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "ernie_linear_wudao-punc-zh": {
+    "ernie_linear_p7_wudao-punc-zh": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_wudao-punc-zh.tar.gz',
+        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
         'md5':
         '12283e2ddde1797c5d1e57036b512746',
         'cfg_path':
@@ -46,14 +46,28 @@ pretrained_models = {
         'vocab_file':
         'punc_vocab.txt',
     },
+    "ernie_linear_p3_wudao-punc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
+        'md5':
+        '448eb2fdf85b6a997e7e652e80c51dd2',
+        'cfg_path':
+        'ckpt/model_config.json',
+        'ckpt_path':
+        'ckpt/model_state.pdparams',
+        'vocab_file':
+        'punc_vocab.txt',
+    },
 }
 
 model_alias = {
-    "ernie_linear": "paddlespeech.text.models:ErnieLinear",
+    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
+    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
 }
 
 tokenizer_alias = {
-    "ernie_linear": "paddlenlp.transformers:ErnieTokenizer",
+    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
+    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
 }
 
 
@@ -75,7 +89,7 @@ class TextExecutor(BaseExecutor):
         self.parser.add_argument(
             '--model',
             type=str,
-            default='ernie_linear_wudao',
+            default='ernie_linear_p7_wudao',
             choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
             help='Choose model type of text task.')
         self.parser.add_argument(
@@ -123,7 +137,7 @@ class TextExecutor(BaseExecutor):
 
     def _init_from_path(self,
                         task: str='punc',
-                        model_type: str='ernie_linear_wudao',
+                        model_type: str='ernie_linear_p7_wudao',
                         lang: str='zh',
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None,
@@ -182,7 +196,6 @@ class TextExecutor(BaseExecutor):
             Input preprocess and return paddle.Tensor stored in self.input.
             Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
         """
-        logger.info("Preprocessing input text: " + text)
         if self.task == 'punc':
             clean_text = self._clean_text(text)
             assert len(clean_text) > 0, f'Invalid input string: {text}'
@@ -263,7 +276,7 @@ class TextExecutor(BaseExecutor):
             self,
             text: str,
             task: str='punc',
-            model: str='ernie_linear_wudao',
+            model: str='ernie_linear_p7_wudao',
             lang: str='zh',
             config: Optional[os.PathLike]=None,
             ckpt_path: Optional[os.PathLike]=None,
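
Not part of the patch: a minimal usage sketch of the two punctuation models registered above. It reuses only the `TextExecutor` call signature that already appears in the punctuation_restoration README and `recognize.py` changes; the example sentence is the one from that README, and the `device` argument follows the new demo README's example.

```python
import paddle
from paddlespeech.cli import TextExecutor

text_executor = TextExecutor()

# 'ernie_linear_p3_wudao' restores 3 punctuation marks (,。?);
# 'ernie_linear_p7_wudao' (the new default) restores 7 (,。!?、:;).
result = text_executor(
    text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭',
    task='punc',
    model='ernie_linear_p3_wudao',
    lang='zh',
    config=None,
    ckpt_path=None,
    device=paddle.get_device())
print('Text Result: \n{}'.format(result))
```

Both model tags resolve to the same `ErnieLinear` class and `ErnieTokenizer` through `model_alias`/`tokenizer_alias`; they differ only in the pretrained package (checkpoint and punctuation vocabulary) that is downloaded.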