diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e3fdfe6841c0c1f1f1653324854d01c9054ea6cf --- /dev/null +++ b/examples/chinese_g2p/README.md @@ -0,0 +1,5 @@ +# Download Baker dataset + +The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download the dataset. + +Download URL: https://test.data-baker.com/#/data/index/source. diff --git a/examples/chinese_g2p/local/convert_transcription.py b/examples/chinese_g2p/local/convert_transcription.py new file mode 100644 index 0000000000000000000000000000000000000000..b133ad2c57186b55e82ad0044f508bc537edb57a --- /dev/null +++ b/examples/chinese_g2p/local/convert_transcription.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def extract_pinyin(source, target, use_jieba=False):
    """Convert the text lines of a prosody label file to pinyin.

    Lines of ``source`` are consumed in pairs: the line at an even 0-based
    index holds ``<sentence_id> <text>``; the line at an odd index is
    skipped. For every text line, ``#<digit>`` markers are removed, the
    text is converted with ``lazy_pinyin`` and one tab-separated record
    ``<sentence_id>`` / space-joined syllables is written to ``target``.

    Args:
        source: Path of the label file, read as UTF-8.
        target: Path of the output file, written as UTF-8.
        use_jieba: If True, segment the text with ``jieba.lcut`` first and
            pass the resulting word list to ``lazy_pinyin``.

    Raises:
        ValueError: If a non-blank text line contains an id but no text.
    """
    # Compiled once; the pattern is the same for every line.
    prosody_mark = re.compile(r'#\d')

    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for i, line in enumerate(fin):
            if i % 2:
                # Odd lines are not text lines; nothing to convert.
                continue

            # maxsplit=1 keeps text that itself contains whitespace intact
            # instead of failing the two-value unpack below.
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if len(fields) != 2:
                raise ValueError(
                    f'{source}:{i + 1}: expected "<sentence_id> <text>", '
                    f'got {line!r}')

            sentence_id, raw_text = fields
            raw_text = prosody_mark.sub('', raw_text.strip())
            if use_jieba:
                raw_text = jieba.lcut(raw_text)
            syllables = lazy_pinyin(
                raw_text,
                errors='ignore',
                style=Style.TONE3,
                neutral_tone_with_five=True)
            transcription = ' '.join(syllables)
            fout.write(f'{sentence_id}\t{transcription}\n')
def extract_pinyin_lables(source, target):
    """Extract pinyin labels from Baker's prosody labeling.

    ``source`` is read two lines at a time: a text line whose first
    whitespace-separated field is the sentence id, followed by a line
    holding the pinyin transcription. For each pair one record
    ``<sentence_id>`` TAB ``<transcription>`` is written to ``target``.

    Pairs whose text line is blank are skipped, and a final text line that
    has no following transcription line is ignored, so the output never
    contains a truncated record.

    Args:
        source: Path of the prosody label file, read as UTF-8.
        target: Path of the output file, written as UTF-8.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        # zip over the same iterator yields consecutive (text, pinyin) pairs.
        for text_line, pinyin_line in zip(fin, fin):
            # Only the id is needed; maxsplit=1 avoids failing on text that
            # contains whitespace.
            fields = text_line.split(maxsplit=1)
            if not fields:
                continue
            sentence_id = fields[0]
            transcription = pinyin_line.strip()
            fout.write(f'{sentence_id}\t{transcription}\n')
#!/usr/bin/env bash
# Extract the prosody label file from the dataset archive and write three
# transcription files into ${exp_dir}: the pinyin labels shipped with the
# dataset, pypinyin output, and jieba + pypinyin output.
source path.sh

# Not referenced below; kept so they remain overridable via parse_options.sh.
stage=-1
stop_stage=100

exp_dir="exp"
data_dir="data"
# presumably maps --exp-dir/--data-dir style options onto the variables
# above; verify against utils/parse_options.sh
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
mkdir -p "${exp_dir}"

# Stop here if dataset preparation fails instead of processing missing files.
bash local/prepare_dataset.sh --exp-dir "${exp_dir}" --data-dir "${data_dir}" || exit 1

# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"
echo "Processing transcriptions..."

# NOTE(review): the ".py" extension on this label text file looks
# unintended; the name is kept so existing consumers are not broken.
python3 local/extract_pinyin_label.py "${exp_dir}/${filename}" "${exp_dir}/pinyin_baker.py" || exit 1
python3 local/convert_transcription.py "${exp_dir}/${filename}" "${exp_dir}/result_pypinyin.txt" || exit 1
python3 local/convert_transcription.py --use-jieba "${exp_dir}/${filename}" "${exp_dir}/result_pypinyin_with_jieba.txt" || exit 1

echo "done"
exit 0