diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 14bef01d2b129a04dc0aac21765321893c470ac8..a8cbb83793710d9971ff320d6968b743a13d5df1 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + + + translation_str = " ".join(translation.split()) + trancription_str = " ".join(trancription.split()) json_lines.append( json.dumps( { 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': " ".join(translation.split()), - 'text1': " ".join(trancription.split()) + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index ce58f539d81fb527eb3a3554b50ae6353525e4ab..d3acbd4486b3753e70fe7d0c3f71b4f1b3576583 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -9,7 +9,7 @@ stop_stage=100 nbpe=8000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -data_dir=./TED_EnZh +data_dir=./TED-En-Zh source ${MAIN_ROOT}/utils/parse_options.sh @@ -21,7 +21,7 @@ mkdir -p data if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ! -e ${data_dir} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" + echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "The tree of the directory should be:" echo "." @@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size for set in train dev test; do { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ + python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index a500f10c9c3af3bbf5f4ce53190f601b1030cf94..35b868718035da2ada642a51bbe72f9b6c056313 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -237,8 +237,8 @@ class SpeechCollatorBase(): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - text = item['text'] + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] audio, text = self.process_utterance(audio, text) audios.append(audio) # [T, D] @@ -381,9 +381,10 @@ class TripletSpeechCollator(SpeechCollator): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - translation = item['text'] - transcription = item['text1'] + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + audio, translation, transcription = self.process_utterance( audio, translation, transcription) diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 7007518da8a5cb2750744edb971cb121a17d54c4..c5df2d6bd0da869b5d5da1b4de32092e12cf9f48 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -122,7 +122,7 @@ class ManifestDataset(Dataset): min_output_len=min_output_len, max_output_input_ratio=max_output_input_ratio, min_output_input_ratio=min_output_input_ratio) - self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) def __len__(self): return len(self._manifest)