diff --git a/examples/other/mfa/local/detect_oov.py b/examples/other/mfa/local/detect_oov.py old mode 100644 new mode 100755 index 4928e45346820f578c65976c7ec0e232fc41c990..cd259ccea57dd237c444579afab7f24c6361a9c0 --- a/examples/other/mfa/local/detect_oov.py +++ b/examples/other/mfa/local/detect_oov.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py b/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py old mode 100644 new mode 100755 index e63b5eb27d05e576e98cada80a64bbf8391f4f07..089ce8526c1dd9a35ca4174f4e078910e2678e98 --- a/examples/other/mfa/local/generate_lexicon.py +++ b/examples/other/mfa/local/generate_lexicon.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/other/mfa/local/reorganize_aishell3.py b/examples/other/mfa/local/reorganize_aishell3.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py old mode 100644 new mode 100755 index 0e0035bda59f6f33345672f0abf7b8575f50bd2e..dde4021034d7e2870d569e4926b3053a778b9d95 --- a/examples/other/mfa/local/reorganize_baker.py +++ b/examples/other/mfa/local/reorganize_baker.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/other/mfa/local/reorganize_ljspeech.py b/examples/other/mfa/local/reorganize_ljspeech.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/local/reorganize_vctk.py b/examples/other/mfa/local/reorganize_vctk.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/run.sh b/examples/other/mfa/run.sh index 40241683023670199cff13e9e5fd94b1e26f08a3..ca4777e93b6e3506677e72ba602b3c15d778cf42 100755 --- a/examples/other/mfa/run.sh +++ b/examples/other/mfa/run.sh @@ -1,29 +1,32 @@ -EXP_DIR=exp +exp=exp +data=data + +mkdir -p $exp +mkdir -p $data -mkdir -p $EXP_DIR LEXICON_NAME='simple' -if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then +MFA_DOWNLOAD_DIR=local/ + +if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then echo "generating lexicon..." - python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r --with-tone + python local/generate_lexicon.py "$exp/$LEXICON_NAME" --with-r --with-tone echo "lexicon done" fi -if [ ! -d $EXP_DIR/baker_corpus ]; then +if [ ! -d $exp/baker_corpus ]; then echo "reorganizing baker corpus..." - python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio - echo "reorganization done. Check output in $EXP_DIR/baker_corpus." + python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$exp/baker_corpus --resample-audio + echo "reorganization done. Check output in $exp/baker_corpus." echo "audio files are resampled to 16kHz" - echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus " + echo "transcription for each audio file is saved with the same namd in $exp/baker_corpus " fi echo "detecting oov..." -python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon" +python local/detect_oov.py $exp/baker_corpus $exp/"$LEXICON_NAME.lexicon" echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs." -MFA_DOWNLOAD_DIR=local/ - if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then echo "downloading mfa..." (cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz) @@ -37,11 +40,15 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then fi export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin" -if [ ! -d "$EXP_DIR/baker_alignment" ]; then + +if [ ! -d "$exp/baker_alignment" ]; then echo "Start MFA training..." - mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align + PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \ + LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \ + ./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \ + $exp/baker_corpus "$exp/$LEXICON_NAME.lexicon" $exp/baker_alignment -o $exp/baker_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align echo "training done!" - echo "results: $EXP_DIR/baker_alignment" - echo "model: $EXP_DIR/baker_model" + echo "results: $exp/baker_alignment" + echo "model: $exp/baker_model" fi diff --git a/examples/other/mfa/run_canton.sh b/examples/other/mfa/run_canton.sh index cef6a2f04488b1302abe4e76de639f3557604b13..851e42d03b04423454000f17e6ee91493adc6ef5 100755 --- a/examples/other/mfa/run_canton.sh +++ b/examples/other/mfa/run_canton.sh @@ -1,16 +1,15 @@ -EXP_DIR=exp +exp=exp -mkdir -p $EXP_DIR +mkdir -p $exp LEXICON_NAME='canton' -if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then +MFA_DOWNLOAD_DIR=local/ + +if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then echo "generating lexicon and training data..." - python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle + python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$exp/$LEXICON_NAME.lexicon" --output_wavlabs "$exp/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle echo "lexicon and training data done" fi - -MFA_DOWNLOAD_DIR=local/ - if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then echo "downloading mfa..." (cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz) @@ -24,11 +23,14 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then fi export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin" -if [ ! -d "$EXP_DIR/canton_alignment" ]; then +if [ ! -d "$exp/canton_alignment" ]; then echo "Start MFA training..." - mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/canton_alignment -o $EXP_DIR/canton_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align + PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \ + LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \ + ./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \ + "$exp/$LEXICON_NAME"_wavlabs "$exp/$LEXICON_NAME.lexicon" $exp/canton_alignment -o $exp/canton_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align echo "training done!" - echo "results: $EXP_DIR/canton_alignment" - echo "model: $EXP_DIR/canton_model" + echo "results: $exp/canton_alignment" + echo "model: $exp/canton_model" fi diff --git a/examples/other/spm/README.md b/examples/other/spm/README.md index fc4478ebbaf691183b2545385a2d26de7fe50fc9..5eee8baba87b6a6f9afc0574c1c232060aa2fbc5 100644 --- a/examples/other/spm/README.md +++ b/examples/other/spm/README.md @@ -33,6 +33,18 @@ ec5a9b24acc35469229e41256ceaf77d data/lang_char/input.txt ``` ``` +==> data/lang_char/input.txt <== +mister quilter is the apostle of the middle classes and we are glad to welcome his gospel +nor is mister quilter's manner less interesting than his matter +he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind +he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca +linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man +it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression +on the general principles of art mister quilter writes with equal lucidity +painting he tells us is of a different quality to mathematics and finish in art is adding more fact +as for etchings they are of two kinds british and foreign +he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer + ==> data/lang_char/input.bpe <== ▁mi ster ▁quilter ▁ is ▁the ▁a p ost le ▁o f ▁the ▁mi d d le ▁c las s es ▁ and ▁we ▁ar e ▁g l a d ▁ to ▁we l c om e ▁h is ▁g o s pe l ▁ n or ▁ is ▁mi ster ▁quilter ' s ▁ma nne r ▁ l ess ▁in ter es t ing ▁tha n ▁h is ▁ma t ter @@ -58,17 +70,6 @@ painting he tells us is of a different quality to mathematics and finish in art as for etchings they are of two kinds british and foreign he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer -==> data/lang_char/input.txt <== -mister quilter is the apostle of the middle classes and we are glad to welcome his gospel -nor is mister quilter's manner less interesting than his matter -he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind -he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca -linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man -it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression -on the general principles of art mister quilter writes with equal lucidity -painting he tells us is of a different quality to mathematics and finish in art is adding more fact -as for etchings they are of two kinds british and foreign -he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer ==> data/lang_char/train_unigram100_units.txt <== 0