提交 b944418d 编写于 作者: H Hui Zhang

Support new-format data for ds2/st

上级 02c7ef31
...@@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix): ...@@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
continue continue
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
translation_str = " ".join(translation.split())
trancription_str = " ".join(trancription.split())
json_lines.append( json_lines.append(
json.dumps( json.dumps(
{ {
'utt': utt, 'utt': utt,
'feat': audio_path, 'feat': audio_path,
'feat_shape': (duration, ), # second 'feat_shape': (duration, ), # second
'text': " ".join(translation.split()), 'text': [translation_str, trancription_str],
'text1': " ".join(trancription.split())
}, },
ensure_ascii=False)) ensure_ascii=False))
......
...@@ -9,7 +9,7 @@ stop_stage=100 ...@@ -9,7 +9,7 @@ stop_stage=100
nbpe=8000 nbpe=8000
bpemode=unigram bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}" bpeprefix="data/bpe_${bpemode}_${nbpe}"
data_dir=./TED_EnZh data_dir=./TED-En-Zh
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
...@@ -21,7 +21,7 @@ mkdir -p data ...@@ -21,7 +21,7 @@ mkdir -p data
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${data_dir} ]; then if [ ! -e ${data_dir} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset" echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:" echo "The tree of the directory should be:"
echo "." echo "."
...@@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size # format manifest with tokenids, vocab size
for set in train dev test; do for set in train dev test; do
{ {
python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "spm" \ --unit_type "spm" \
--spm_model_prefix ${bpeprefix} \ --spm_model_prefix ${bpeprefix} \
......
...@@ -237,8 +237,8 @@ class SpeechCollatorBase(): ...@@ -237,8 +237,8 @@ class SpeechCollatorBase():
for idx, item in enumerate(batch): for idx, item in enumerate(batch):
utts.append(item['utt']) utts.append(item['utt'])
audio = item['feat'] audio = item['input'][0]['feat']
text = item['text'] text = item['output'][0]['text']
audio, text = self.process_utterance(audio, text) audio, text = self.process_utterance(audio, text)
audios.append(audio) # [T, D] audios.append(audio) # [T, D]
...@@ -381,9 +381,10 @@ class TripletSpeechCollator(SpeechCollator): ...@@ -381,9 +381,10 @@ class TripletSpeechCollator(SpeechCollator):
for idx, item in enumerate(batch): for idx, item in enumerate(batch):
utts.append(item['utt']) utts.append(item['utt'])
audio = item['feat'] audio = item['input'][0]['feat']
translation = item['text'] translation = item['output'][0]['text']
transcription = item['text1'] transcription = item['output'][1]['text']
audio, translation, transcription = self.process_utterance( audio, translation, transcription = self.process_utterance(
audio, translation, transcription) audio, translation, transcription)
......
...@@ -122,7 +122,7 @@ class ManifestDataset(Dataset): ...@@ -122,7 +122,7 @@ class ManifestDataset(Dataset):
min_output_len=min_output_len, min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio, max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio) min_output_input_ratio=min_output_input_ratio)
self._manifest.sort(key=lambda x: x["feat_shape"][0]) self._manifest.sort(key=lambda x: x["input"][0]["shape"][0])
def __len__(self): def __len__(self):
return len(self._manifest) return len(self._manifest)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册