未验证 提交 4e9bca17 编写于 作者: Z zxcd 提交者: GitHub

[ASR] change optimizer and fix import error, test=asr (#3023)

* mv dataio.py to s2t.io.speechbrain.dataio

mv dataio.py to paddlespeech.s2t.io.speechbrain.dataio

* remove transformers import.

* change optimizer same with released model

* add paddlenlp version in RESULT.md.

* fix run.sh

* fix data.sh step_num.

* add adadelta optimizer config.

* fix wav2vec2 test_wav.sh run error.

* add tokenizer config.
上级 65c3217b
......@@ -190,9 +190,9 @@ tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
```
You can download the audio demo:
```bash
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
```
You need to prepare an audio file or use the audio demo above. Please confirm that the sample rate of the audio is 16 kHz. You can get the result for the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_01_03.wav
```
......@@ -4,6 +4,7 @@
* paddle version: develop (commit id: daea892c67e85da91906864de40ce9f6f1b893ae)
* paddlespeech version: develop (commit id: c14b4238b256693281e59605abff7c9435b3e2b2)
* paddlenlp version: 2.5.2
## Device
* python: 3.7
......
......@@ -83,7 +83,7 @@ dnn_neurons: 1024
freeze_wav2vec: False
dropout: 0.15
tokenizer: !apply:transformers.BertTokenizer.from_pretrained
tokenizer: !apply:paddlenlp.transformers.AutoTokenizer.from_pretrained
pretrained_model_name_or_path: bert-base-chinese
# bert-base-chinese tokens length
output_neurons: 21128
......
......@@ -107,6 +107,7 @@ vocab_filepath: data/lang_char/vocab.txt
###########################################
unit_type: 'char'
tokenizer: bert-base-chinese
mean_std_filepath:
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
......@@ -139,12 +140,10 @@ n_epoch: 80
accum_grad: 1
global_grad_clip: 5.0
model_optim: adadelta
model_optim: sgd
model_optim_conf:
lr: 1.0
weight_decay: 0.0
rho: 0.95
epsilon: 1.0e-8
wav2vec2_optim: adam
wav2vec2_optim_conf:
......@@ -165,3 +164,4 @@ log_interval: 1
checkpoint:
kbest_n: 50
latest_n: 5
############################################
# Network Architecture #
############################################
freeze_wav2vec2: False
normalize_wav: True
output_norm: True
init_type: 'kaiming_uniform' # Warning: presumably this initialization is needed for the model to converge
enc:
input_shape: 1024
dnn_blocks: 3
dnn_neurons: 1024
activation: True
normalization: True
dropout_rate: [0.15, 0.15, 0.0]
ctc:
enc_n_units: 1024
blank_id: 0
dropout_rate: 0.0
audio_augment:
speeds: [90, 100, 110]
spec_augment:
time_warp: True
time_warp_window: 5
time_warp_mode: bicubic
freq_mask: True
n_freq_mask: 2
time_mask: True
n_time_mask: 2
replace_with_zero: False
freq_mask_width: 30
time_mask_width: 40
wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams
############################################
# Wav2Vec2.0 #
############################################
# vocab_size: 1000000
hidden_size: 1024
num_hidden_layers: 24
num_attention_heads: 16
intermediate_size: 4096
hidden_act: gelu
hidden_dropout: 0.1
activation_dropout: 0.0
attention_dropout: 0.1
feat_proj_dropout: 0.1
feat_quantizer_dropout: 0.0
final_dropout: 0.0
layerdrop: 0.1
initializer_range: 0.02
layer_norm_eps: 1e-5
feat_extract_norm: layer
feat_extract_activation: gelu
conv_dim: [512, 512, 512, 512, 512, 512, 512]
conv_stride: [5, 2, 2, 2, 2, 2, 2]
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
conv_bias: True
num_conv_pos_embeddings: 128
num_conv_pos_embedding_groups: 16
do_stable_layer_norm: True
apply_spec_augment: False
mask_channel_length: 10
mask_channel_min_space: 1
mask_channel_other: 0.0
mask_channel_prob: 0.0
mask_channel_selection: static
mask_feature_length: 10
mask_feature_min_masks: 0
mask_feature_prob: 0.0
mask_time_length: 10
mask_time_min_masks: 2
mask_time_min_space: 1
mask_time_other: 0.0
mask_time_prob: 0.075
mask_time_selection: static
num_codevectors_per_group: 320
num_codevector_groups: 2
contrastive_logits_temperature: 0.1
num_negatives: 100
codevector_dim: 256
proj_codevector_dim: 256
diversity_loss_weight: 0.1
use_weighted_layer_sum: False
# pad_token_id: 0
# bos_token_id: 1
# eos_token_id: 2
add_adapter: False
adapter_kernel_size: 3
adapter_stride: 2
num_adapter_layers: 3
output_hidden_size: None
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
vocab_filepath: data/lang_char/vocab.txt
###########################################
# Dataloader #
###########################################
unit_type: 'char'
tokenizer: bert-base-chinese
mean_std_filepath:
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 5 # Different batch_size may cause large differences in results
maxlen_in: 51200000000 # if input length > maxlen_in, the batch size is automatically reduced
maxlen_out: 1500000 # if output length > maxlen_out, the batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 6
subsampling_factor: 1
num_encs: 1
dist_sampler: True
shortest_first: True
return_lens_rate: True
###########################################
# use speechbrain dataloader #
###########################################
use_sb_pipeline: True # whether to use the speechbrain pipeline. Default is True.
sb_pipeline_conf: conf/train_with_wav2vec.yaml
###########################################
# Training #
###########################################
n_epoch: 80
accum_grad: 1
global_grad_clip: 5.0
model_optim: adadelta
model_optim_conf:
lr: 1.0
weight_decay: 0.0
rho: 0.95
epsilon: 1.0e-8
wav2vec2_optim: adam
wav2vec2_optim_conf:
lr: 0.0001
weight_decay: 0.0
model_scheduler: newbobscheduler
model_scheduler_conf:
improvement_threshold: 0.0025
annealing_factor: 0.8
patient: 0
wav2vec2_scheduler: newbobscheduler
wav2vec2_scheduler_conf:
improvement_threshold: 0.0025
annealing_factor: 0.9
patient: 0
log_interval: 1
checkpoint:
kbest_n: 50
latest_n: 5
......@@ -21,7 +21,7 @@ import glob
import logging
import os
from paddlespeech.s2t.models.wav2vec2.io.dataio import read_audio
from paddlespeech.s2t.io.speechbrain.dataio import read_audio
logger = logging.getLogger(__name__)
......
#!/bin/bash
stage=-1
stop_stage=-1
stop_stage=3
dict_dir=data/lang_char
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
......
......@@ -8,9 +8,7 @@ echo "using $ngpu gpus..."
expdir=exp
datadir=data
train_set=train_960
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
train_set=train
config_path=$1
decode_config_path=$2
......@@ -75,7 +73,7 @@ for type in ctc_prefix_beam_search; do
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
......
......@@ -14,7 +14,7 @@ ckpt_prefix=$3
audio_file=$4
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
......
......@@ -15,11 +15,11 @@ resume= # xx e.g. 30
export FLAGS_cudnn_deterministic=1
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
audio_file=data/demo_002_en.wav
audio_file=data/demo_01_03.wav
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"git revert -v
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
......
......@@ -102,13 +102,11 @@ ssl_dynamic_pretrained_models = {
'params':
'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
},
},
"wav2vec2ASR_aishell1-zh-16k": {
'1.4': {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz',
'md5':
'9f0bc943adb822789bf61e674b229d17',
'150e51b8ea5d255ccce6b395de8d916a',
'cfg_path':
'model.yaml',
'ckpt_path':
......
......@@ -18,6 +18,7 @@ from pathlib import Path
import paddle
import soundfile
from paddlenlp.transformers import AutoTokenizer
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
......@@ -34,8 +35,13 @@ class Wav2vec2Infer():
self.config = config
self.audio_file = args.audio_file
self.text_feature = TextFeaturizer(
unit_type=config.unit_type, vocab=config.vocab_filepath)
if self.config.tokenizer:
self.text_feature = AutoTokenizer.from_pretrained(
self.config.tokenizer)
else:
self.text_feature = TextFeaturizer(
unit_type=config.unit_type, vocab=config.vocab_filepath)
paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
# model
......@@ -59,14 +65,14 @@ class Wav2vec2Infer():
audio, _ = soundfile.read(
self.audio_file, dtype="int16", always_2d=True)
logger.info(f"audio shape: {audio.shape}")
xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
decode_config = self.config.decode
result_transcripts, result_tokenids = self.model.decode(
xs,
text_feature=self.text_feature,
decoding_method=decode_config.decoding_method,
beam_size=decode_config.beam_size)
beam_size=decode_config.beam_size,
tokenizer=self.config.tokenizer, )
rsl = result_transcripts[0]
utt = Path(self.audio_file).name
logger.info(f"hyp: {utt} {rsl}")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册