From 075635d2b48594e3528b7ca66b36aca7ccd59339 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Wed, 19 May 2021 19:26:34 +0800 Subject: [PATCH] add a g2p example (#623) * add an example to convert transcription to pinyin with pypinyin and jieba * format code * 1. remove script for data downloading, since Baker dataset is not easily downloaded via terminal; 2. remove pypinyin as an extra requirement; it is already required by the main project; 3. clean code. * change output format --- examples/chinese_g2p/README.md | 5 ++ .../local/convert_transcription.py | 53 +++++++++++++++++++ .../chinese_g2p/local/extract_pinyin_label.py | 37 +++++++++++++ examples/chinese_g2p/local/prepare_dataset.sh | 32 +++++++++++ examples/chinese_g2p/path.sh | 8 +++ examples/chinese_g2p/requirements.txt | 1 + examples/chinese_g2p/run.sh | 22 ++++++++ 7 files changed, 158 insertions(+) create mode 100644 examples/chinese_g2p/README.md create mode 100644 examples/chinese_g2p/local/convert_transcription.py create mode 100644 examples/chinese_g2p/local/extract_pinyin_label.py create mode 100644 examples/chinese_g2p/local/prepare_dataset.sh create mode 100644 examples/chinese_g2p/path.sh create mode 100644 examples/chinese_g2p/requirements.txt create mode 100644 examples/chinese_g2p/run.sh diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md new file mode 100644 index 00000000..e3fdfe68 --- /dev/null +++ b/examples/chinese_g2p/README.md @@ -0,0 +1,5 @@ +# Download Baker dataset + +Baker dataset has to be downloaded manually and moved to 'data/', because you will have to pass the CAPTCHA from a browser to download the dataset. + +Download URL https://test.data-baker.com/#/data/index/source. 
diff --git a/examples/chinese_g2p/local/convert_transcription.py b/examples/chinese_g2p/local/convert_transcription.py
new file mode 100644
index 00000000..b133ad2c
--- /dev/null
+++ b/examples/chinese_g2p/local/convert_transcription.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import re
+
+import jieba
+from pypinyin import lazy_pinyin
+from pypinyin import Style
+
+
+def extract_pinyin(source, target, use_jieba=False):
+    """Convert the text lines of Baker's prosody label file to pinyin.
+
+    Lines of ``source`` at an even (0-based) index are parsed as a sentence
+    id followed by the sentence text; lines at an odd index are skipped.
+    For every parsed line, the sentence id, a tab and the space-joined
+    pinyin syllables of the text are written to ``target``.
+
+    Args:
+        source: path of the UTF-8 prosody label file to read.
+        target: path of the UTF-8 output file (overwritten if present).
+        use_jieba: if True, segment the text with ``jieba.lcut`` first and
+            hand the resulting word list to pypinyin instead of the raw
+            string.
+
+    Raises:
+        ValueError: if a non-blank even-indexed line has no text field.
+    """
+    # '#' followed by a digit; presumably Baker's prosody boundary marks.
+    prosody_mark = re.compile(r'#\d')
+    with open(source, 'rt', encoding='utf-8') as fin, \
+            open(target, 'wt', encoding='utf-8') as fout:
+        for i, line in enumerate(fin):
+            if i % 2 != 0:
+                continue
+            fields = line.strip().split(maxsplit=1)
+            if not fields:
+                # Tolerate blank lines (e.g. a trailing one) instead of
+                # failing on the tuple unpacking below.
+                continue
+            sentence_id, raw_text = fields
+            raw_text = prosody_mark.sub('', raw_text)
+            if use_jieba:
+                raw_text = jieba.lcut(raw_text)
+            # TONE3 appends the tone digit to each syllable; the neutral
+            # tone is written as 5; characters without pinyin are dropped.
+            syllables = lazy_pinyin(
+                raw_text,
+                errors='ignore',
+                style=Style.TONE3,
+                neutral_tone_with_five=True)
+            transcription = ' '.join(syllables)
+            fout.write(f'{sentence_id}\t{transcription}\n')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="convert baker transcriptions to pinyin with pypinyin")
+    parser.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    parser.add_argument(
+        "output", type=str, help="target file to write pinyin labels")
+    parser.add_argument(
+        "--use-jieba",
+        action='store_true',
+        help="use jieba for word segmentation.")
+    args = parser.parse_args()
+
extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
diff --git a/examples/chinese_g2p/local/extract_pinyin_label.py b/examples/chinese_g2p/local/extract_pinyin_label.py
new file mode 100644
index 00000000..be7b287f
--- /dev/null
+++ b/examples/chinese_g2p/local/extract_pinyin_label.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+
+def extract_pinyin_lables(source, target):
+    """Extract pinyin labels from Baker's prosody labeling.
+
+    ``source`` is consumed two lines at a time: the first whitespace-
+    separated field of the first line is the sentence id and the second
+    line, stripped, is the transcription. Each pair is written to
+    ``target`` as the sentence id, a tab and the transcription.
+
+    Args:
+        source: path of the UTF-8 prosody label file to read.
+        target: path of the UTF-8 output file (overwritten if present).
+    """
+    with open(source, 'rt', encoding='utf-8') as fin, \
+            open(target, 'wt', encoding='utf-8') as fout:
+        # zip(fin, fin) pulls two consecutive lines per iteration; a
+        # trailing line without a partner is dropped rather than written
+        # as a partial record.
+        for id_line, pinyin_line in zip(fin, fin):
+            fields = id_line.split(maxsplit=1)
+            if not fields:
+                continue  # blank id line: there is no id to label
+            # Only the id is needed; the text after it is not used.
+            fout.write(f'{fields[0]}\t{pinyin_line.strip()}\n')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
+    parser.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    parser.add_argument(
+        "output", type=str, help="target file to write pinyin labels")
+    args = parser.parse_args()
+    extract_pinyin_lables(args.input, args.output)
diff --git a/examples/chinese_g2p/local/prepare_dataset.sh b/examples/chinese_g2p/local/prepare_dataset.sh
new file mode 100644
index 00000000..fe9948ed
--- /dev/null
+++ b/examples/chinese_g2p/local/prepare_dataset.sh
@@ -0,0 +1,32 @@
+echo "Extracting
Prosody Labeling"
+
+exp_dir="exp"
+data_dir="data"
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
+# The archive has to be downloaded manually beforehand (see README.md).
+archive="${data_dir}/BZNSYP.rar"
+if [ ! -f "${archive}" ]; then
+    echo "Baker Dataset not found! Download it first to the data_dir."
+    exit 1
+fi
+# Refuse to continue with an archive that differs from the expected one.
+MD5='c4350563bf7dc298f7dd364b2607be83'
+md5_result=$(md5sum "${archive}" | awk '{print $1}')
+if [ "${md5_result}" != "${MD5}" ]; then
+    echo "MD5 mismatch! The Archive has been changed."
+    exit 1
+fi
+
+# 'unrar e' ignores archived paths, so the file lands in the current directory.
+label_file='ProsodyLabeling/000001-010000.txt'
+filename='000001-010000.txt'
+unrar e "${archive}" "${label_file}"
+# Move (not copy + delete) so a failed transfer never removes the only copy.
+mv -f "${filename}" "${exp_dir}/"
+
+if [ ! -f "${exp_dir}/${filename}" ]; then
+    echo "File extraction failed!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/chinese_g2p/path.sh b/examples/chinese_g2p/path.sh
new file mode 100644
index 00000000..b4c625f9
--- /dev/null
+++ b/examples/chinese_g2p/path.sh
@@ -0,0 +1,8 @@
+export MAIN_ROOT=${PWD}/../../
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
diff --git a/examples/chinese_g2p/requirements.txt b/examples/chinese_g2p/requirements.txt
new file mode 100644
index 00000000..c84f4227
--- /dev/null
+++ b/examples/chinese_g2p/requirements.txt
@@ -0,0 +1 @@
+jieba
diff --git a/examples/chinese_g2p/run.sh b/examples/chinese_g2p/run.sh
new file mode 100644
index 00000000..6bde2e26
--- /dev/null
+++ b/examples/chinese_g2p/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+source path.sh
+
+stage=-1
+stop_stage=100
+
+exp_dir="exp"
+data_dir="data"
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
+mkdir -p "${exp_dir}"
+bash local/prepare_dataset.sh --exp-dir "${exp_dir}" --data-dir "${data_dir}" || exit 1
+
+# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
+filename="000001-010000.txt"
+echo "Processing transcriptions..."
+# NOTE(review): the first output holds tab-separated labels despite its ".py" suffix; name kept as is.
+python3 local/extract_pinyin_label.py "${exp_dir}/${filename}" "${exp_dir}/pinyin_baker.py" || exit 1
+python3 local/convert_transcription.py "${exp_dir}/${filename}" "${exp_dir}/result_pypinyin.txt" || exit 1
+python3 local/convert_transcription.py --use-jieba "${exp_dir}/${filename}" "${exp_dir}/result_pypinyin_with_jieba.txt" || exit 1
+
+echo "done"
+exit 0
-- 
GitLab