From 075635d2b48594e3528b7ca66b36aca7ccd59339 Mon Sep 17 00:00:00 2001
From: Feiyu Chan <chenfeiyu@baidu.com>
Date: Wed, 19 May 2021 19:26:34 +0800
Subject: [PATCH] add a g2p example  (#623)

* add an example to convert transcription to pinyin with pypinyin and jieba

* format code

* 1. remove script for data downloading, since Baker dataset is not easily downloaded via terminal;
2. remove pypinyin as an extra requirement; it is already required by the main project;
3. clean code.

* change output format
---
 examples/chinese_g2p/README.md                |  5 ++
 .../local/convert_transcription.py            | 53 +++++++++++++++++++
 .../chinese_g2p/local/extract_pinyin_label.py | 37 +++++++++++++
 examples/chinese_g2p/local/prepare_dataset.sh | 32 +++++++++++
 examples/chinese_g2p/path.sh                  |  8 +++
 examples/chinese_g2p/requirements.txt         |  1 +
 examples/chinese_g2p/run.sh                   | 22 ++++++++
 7 files changed, 158 insertions(+)
 create mode 100644 examples/chinese_g2p/README.md
 create mode 100644 examples/chinese_g2p/local/convert_transcription.py
 create mode 100644 examples/chinese_g2p/local/extract_pinyin_label.py
 create mode 100644 examples/chinese_g2p/local/prepare_dataset.sh
 create mode 100644 examples/chinese_g2p/path.sh
 create mode 100644 examples/chinese_g2p/requirements.txt
 create mode 100644 examples/chinese_g2p/run.sh

diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md
new file mode 100644
index 00000000..e3fdfe68
--- /dev/null
+++ b/examples/chinese_g2p/README.md
@@ -0,0 +1,5 @@
+# Download Baker dataset
+
+The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.
+
+Download URL: <https://test.data-baker.com/#/data/index/source>
diff --git a/examples/chinese_g2p/local/convert_transcription.py b/examples/chinese_g2p/local/convert_transcription.py
new file mode 100644
index 00000000..b133ad2c
--- /dev/null
+++ b/examples/chinese_g2p/local/convert_transcription.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import re
+
+import jieba
+from pypinyin import lazy_pinyin
+from pypinyin import Style
+
+
+def extract_pinyin(source, target, use_jieba=False):
+    """Convert the text lines of Baker's prosody label file to pinyin.
+
+    Even-indexed lines ("<id> <text>") are converted, optionally after jieba
+    segmentation, and written as "<id><TAB><syllables>"; odd ones are skipped.
+    """
+    with open(source, 'rt', encoding='utf-8') as fin, \
+            open(target, 'wt', encoding='utf-8') as fout:
+        for i, line in enumerate(fin):
+            if i % 2 != 0:
+                continue
+            # maxsplit=1: whitespace inside the text must not break unpacking.
+            sentence_id, raw_text = line.strip().split(maxsplit=1)
+            raw_text = re.sub(r'#\d', '', raw_text)  # drop "#<digit>" marks
+            text = jieba.lcut(raw_text) if use_jieba else raw_text
+            syllables = lazy_pinyin(text, errors='ignore', style=Style.TONE3,
+                                    neutral_tone_with_five=True)
+            fout.write(f'{sentence_id}\t{" ".join(syllables)}\n')
+
+
+if __name__ == "__main__":
+    # Command-line entry point: INPUT OUTPUT [--use-jieba].
+    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
+    cli.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    cli.add_argument(
+        "output", type=str, help="target file to write pinyin lables")
+    cli.add_argument(
+        "--use-jieba", action='store_true',
+        help="use jieba for word segmentation.")
+    opts = cli.parse_args()
+    extract_pinyin(opts.input, opts.output, use_jieba=opts.use_jieba)
diff --git a/examples/chinese_g2p/local/extract_pinyin_label.py b/examples/chinese_g2p/local/extract_pinyin_label.py
new file mode 100644
index 00000000..be7b287f
--- /dev/null
+++ b/examples/chinese_g2p/local/extract_pinyin_label.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+
+def extract_pinyin_lables(source, target):
+    """Write "<sentence_id><TAB><pinyin>" for each text/pinyin line pair."""
+    with open(source, 'rt', encoding='utf-8') as fin, \
+            open(target, 'wt', encoding='utf-8') as fout:
+        for i, line in enumerate(fin):
+            if i % 2 == 0:
+                # Even lines: "<id> <text>"; keep the id only. The text is
+                # not unpacked because it may itself contain whitespace.
+                fout.write(line.split(maxsplit=1)[0] + '\t')
+            else:
+                fout.write(line.strip() + '\n')  # odd lines: Baker pinyin
+
+
+if __name__ == "__main__":  # command-line entry point: INPUT OUTPUT
+    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
+    cli.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    cli.add_argument(
+        "output", type=str, help="target file to write pinyin lables")
+    opts = cli.parse_args()
+    extract_pinyin_lables(opts.input, opts.output)
diff --git a/examples/chinese_g2p/local/prepare_dataset.sh b/examples/chinese_g2p/local/prepare_dataset.sh
new file mode 100644
index 00000000..fe9948ed
--- /dev/null
+++ b/examples/chinese_g2p/local/prepare_dataset.sh
@@ -0,0 +1,32 @@
+echo "Extracting Prosody Labeling"
+
+exp_dir="exp"
+data_dir="data"
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1
+
+archive="${data_dir}/BZNSYP.rar"
+if [ ! -f "${archive}" ]; then
+    echo "Baker Dataset not found! Download it first to the data_dir."
+    exit -1
+fi
+
+MD5='c4350563bf7dc298f7dd364b2607be83'
+md5_result=$(md5sum "${archive}" | awk '{print $1}')
+if [ "${md5_result}" != "${MD5}" ]; then
+    echo "MD5 mismatch! The Archive has been changed."
+    exit -1
+fi
+
+label_file='ProsodyLabeling/000001-010000.txt'
+filename='000001-010000.txt'
+# "unrar e" extracts without the archive path, i.e. into the current dir.
+unrar e "${archive}" "${label_file}"
+mv -f "${filename}" "${exp_dir}/"
+
+if [ ! -f "${exp_dir}/${filename}" ]; then
+    echo "File extraction failed!"
+    # A bare "exit" would return echo's status (0) and hide the failure.
+    exit -1
+fi
+
+exit 0
diff --git a/examples/chinese_g2p/path.sh b/examples/chinese_g2p/path.sh
new file mode 100644
index 00000000..b4c625f9
--- /dev/null
+++ b/examples/chinese_g2p/path.sh
@@ -0,0 +1,8 @@
+export MAIN_ROOT="${PWD}/../../"  # resolved from the caller's working dir
+
+# Expose MAIN_ROOT (and its utils/ directory) to the shell and to Python.
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export LC_ALL=C PYTHONIOENCODING=UTF-8
diff --git a/examples/chinese_g2p/requirements.txt b/examples/chinese_g2p/requirements.txt
new file mode 100644
index 00000000..c84f4227
--- /dev/null
+++ b/examples/chinese_g2p/requirements.txt
@@ -0,0 +1 @@
+jieba
diff --git a/examples/chinese_g2p/run.sh b/examples/chinese_g2p/run.sh
new file mode 100644
index 00000000..6bde2e26
--- /dev/null
+++ b/examples/chinese_g2p/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+source path.sh
+
+stage=-1
+stop_stage=100
+exp_dir="exp"
+data_dir="data"
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1
+mkdir -p "${exp_dir}"
+# Stop if the label file could not be prepared; every later step reads it.
+bash local/prepare_dataset.sh --exp-dir "${exp_dir}" --data-dir "${data_dir}" || exit -1
+
+# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
+filename="000001-010000.txt"
+echo "Processing transcriptions..."
+
+python3 local/extract_pinyin_label.py "${exp_dir}/${filename}" "${exp_dir}/pinyin_baker.txt"
+python3 local/convert_transcription.py "${exp_dir}/${filename}" "${exp_dir}/result_pypinyin.txt"
+python3 local/convert_transcription.py --use-jieba "${exp_dir}/${filename}" "${exp_dir}/result_pypinyin_with_jieba.txt"
+
+echo "done"
+exit 0
-- 
GitLab