#!/usr/bin/env python3
import argparse
import os
import random
from pathlib import Path
from typing import Collection
from typing import Dict
from typing import List
from typing import Tuple
from typing import Union

import librosa
import numpy as np
import paddle
import soundfile as sf
import torch
from ParallelWaveGAN.parallel_wavegan.utils.utils import download_pretrained_model

from align_english import alignment
from align_mandarin import alignment_zh
from dataset import get_segment_pos
from dataset import make_non_pad_mask
from dataset import pad_list
from dataset import pad_to_longformer_att_window
from dataset import phones_masking
# NOTE: the two imports below are assumptions: mlm_collate_fn calls
# phones_text_masking and get_segment_pos_reduce_duration, which are assumed
# to live in `dataset` alongside phones_masking.
from dataset import get_segment_pos_reduce_duration
from dataset import phones_text_masking
from model_paddle import build_model_from_file
from read_text import load_num_sequence_text
from read_text import read_2column_text
from sedit_arg_parser import parse_args
from utils import build_vocoder_from_file
from utils import evaluate_durations
from utils import get_voc_out
from utils import is_chinese

from paddlespeech.t2s.datasets.get_feats import LogMelFBank

random.seed(0)
np.random.seed(0)

PHONEME = 'tools/aligner/english_envir/english2phoneme/phoneme'
MODEL_DIR_EN = 'tools/aligner/english'
MODEL_DIR_ZH = 'tools/aligner/mandarin'


def plot_mel_and_vocode_wav(uid,
                            prefix,
                            clone_uid,
                            clone_prefix,
                            source_language,
                            target_language,
                            model_name,
                            wav_path,
                            full_origin_str,
                            old_str,
                            new_str,
                            use_pt_vocoder,
                            duration_preditor_path,
                            sid=None,
                            non_autoreg=True):
    wav_org, input_feat, output_feat, old_span_boundary, new_span_boundary, fs, hop_length = get_mlm_output(
        uid,
        prefix,
        clone_uid,
        clone_prefix,
        source_language,
        target_language,
        model_name,
        wav_path,
        old_str,
        new_str,
        duration_preditor_path,
        use_teacher_forcing=non_autoreg,
        sid=sid)

    masked_feat = output_feat[new_span_boundary[0]:new_span_boundary[
        1]].detach().float().cpu().numpy()

    if target_language == 'english':
        if use_pt_vocoder:
            output_feat = output_feat.detach().float().cpu().numpy()
            output_feat = torch.tensor(output_feat, dtype=torch.float)
            vocoder = load_vocoder('vctk_parallel_wavegan.v1.long')
            replaced_wav = vocoder(
                output_feat).detach().float().data.cpu().numpy()
        else:
            output_feat_np = output_feat.detach().float().cpu().numpy()
            replaced_wav = get_voc_out(output_feat_np, target_language)
    elif target_language == 'chinese':
        output_feat_np = output_feat.detach().float().cpu().numpy()
        replaced_wav_only_mask_fst2_voc = get_voc_out(masked_feat,
                                                      target_language)

    # convert mel-frame boundaries to sample boundaries
    old_time_boundary = [hop_length * x for x in old_span_boundary]
    new_time_boundary = [hop_length * x for x in new_span_boundary]

    if target_language == 'english':
        wav_org_replaced_paddle_voc = np.concatenate([
            wav_org[:old_time_boundary[0]],
            replaced_wav[new_time_boundary[0]:new_time_boundary[1]],
            wav_org[old_time_boundary[1]:]
        ])
        data_dict = {"origin": wav_org, "output": wav_org_replaced_paddle_voc}
    elif target_language == 'chinese':
        wav_org_replaced_only_mask_fst2_voc = np.concatenate([
            wav_org[:old_time_boundary[0]], replaced_wav_only_mask_fst2_voc,
            wav_org[old_time_boundary[1]:]
        ])
        data_dict = {
            "origin": wav_org,
            "output": wav_org_replaced_only_mask_fst2_voc,
        }

    return data_dict, old_span_boundary

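# A minimal sketch of the splice above (numbers are hypothetical): with
# hop_length = 300 and old/new mel span boundaries [92, 120] / [92, 174],
# samples wav_org[92 * 300:120 * 300] are cut out and the vocoded samples
# replaced_wav[92 * 300:174 * 300] are inserted in their place, so only the
# edited region is resynthesized while the rest of the recording is untouched.
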
def get_unk_phns(word_str):
    tmpbase = '/tmp/tp.'
    f = open(tmpbase + 'temp.words', 'w')
    f.write(word_str)
    f.close()
    os.system(PHONEME + ' ' + tmpbase + 'temp.words' + ' ' + tmpbase +
              'temp.phons')
    f = open(tmpbase + 'temp.phons', 'r')
    lines2 = f.readline().strip().split()
    f.close()
    phns = []
    for phn in lines2:
        phons = phn.replace('\n', '').replace(' ', '')
        seq = []
        j = 0
        while j < len(phons):
            if phons[j] > 'Z':
                if phons[j] == 'j':
                    seq.append('JH')
                elif phons[j] == 'h':
                    seq.append('HH')
                else:
                    seq.append(phons[j].upper())
                j += 1
            else:
                p = phons[j:j + 2]
                if p == 'WH':
                    seq.append('W')
                elif p in ['TH', 'SH', 'HH', 'DH', 'CH', 'ZH', 'NG']:
                    seq.append(p)
                elif p == 'AX':
                    seq.append('AH0')
                else:
                    seq.append(p + '1')
                j += 2
        phns.extend(seq)
    return phns


def words2phns(line):
    dictfile = MODEL_DIR_EN + '/dict'
    line = line.strip()
    words = []
    for pun in [',', '.', ':', ';', '!', '?', '"', '(', ')', '--', '---']:
        line = line.replace(pun, ' ')
    for wrd in line.split():
        if wrd[-1] == '-':
            wrd = wrd[:-1]
        if wrd[0] == "'":
            wrd = wrd[1:]
        if wrd:
            words.append(wrd)
    ds = set()
    word2phns_dict = {}
    with open(dictfile, 'r') as fid:
        for line in fid:
            word = line.split()[0]
            ds.add(word)
            if word not in word2phns_dict.keys():
                word2phns_dict[word] = " ".join(line.split()[1:])
    phns = []
    wrd2phns = {}
    for index, wrd in enumerate(words):
        if wrd == '[MASK]':
            wrd2phns[str(index) + "_" + wrd] = [wrd]
            phns.append(wrd)
        elif wrd.upper() not in ds:
            wrd2phns[str(index) + "_" + wrd.upper()] = get_unk_phns(wrd)
            phns.extend(get_unk_phns(wrd))
        else:
            wrd2phns[str(index) + "_" + wrd.upper()] = word2phns_dict[
                wrd.upper()].split()
            phns.extend(word2phns_dict[wrd.upper()].split())
    return phns, wrd2phns


def words2phns_zh(line):
    dictfile = MODEL_DIR_ZH + '/dict'
    line = line.strip()
    words = []
    for pun in [
            ',', '.', ':', ';', '!', '?', '"', '(', ')', '--', '---', u',',
            u'。', u':', u';', u'!', u'?', u'(', u')'
    ]:
        line = line.replace(pun, ' ')
    for wrd in line.split():
        if wrd[-1] == '-':
            wrd = wrd[:-1]
        if wrd[0] == "'":
            wrd = wrd[1:]
        if wrd:
            words.append(wrd)
    ds = set()
    word2phns_dict = {}
    with open(dictfile, 'r') as fid:
        for line in fid:
            word = line.split()[0]
            ds.add(word)
            if word not in word2phns_dict.keys():
                word2phns_dict[word] = " ".join(line.split()[1:])
    phns = []
    wrd2phns = {}
    for index, wrd in enumerate(words):
        if wrd == '[MASK]':
            wrd2phns[str(index) + "_" + wrd] = [wrd]
            phns.append(wrd)
        elif wrd.upper() not in ds:
            print("Found an out-of-vocabulary word; please enter valid text...")
        else:
            wrd2phns[str(index) + "_" + wrd] = word2phns_dict[wrd].split()
            phns.extend(word2phns_dict[wrd].split())
    return phns, wrd2phns


def load_vocoder(vocoder_tag="vctk_parallel_wavegan.v1.long"):
    vocoder_tag = vocoder_tag.replace("parallel_wavegan/", "")
    vocoder_file = download_pretrained_model(vocoder_tag)
    vocoder_config = Path(vocoder_file).parent / "config.yml"
    vocoder = build_vocoder_from_file(vocoder_config, vocoder_file, None,
                                      'cpu')
    return vocoder


def load_model(model_name):
    config_path = './pretrained_model/{}/config.yaml'.format(model_name)
    model_path = './pretrained_model/{}/model.pdparams'.format(model_name)
    mlm_model, args = build_model_from_file(
        config_file=config_path, model_file=model_path)
    return mlm_model, args


def read_data(uid, prefix):
    mfa_text = read_2column_text(prefix + '/text')[uid]
    mfa_wav_path = read_2column_text(prefix + '/wav.scp')[uid]
    if 'mnt' not in mfa_wav_path:
        mfa_wav_path = prefix.split('dump')[0] + mfa_wav_path
    return mfa_text, mfa_wav_path

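# Usage sketch for the G2P helpers above (assumes MODEL_DIR_EN/dict contains
# an entry for SING; the exact phonemes depend on that dictionary):
#   phns, wrd2phns = words2phns('sing [MASK]')
#   # phns     -> ['S', 'IH1', 'NG', '[MASK]']
#   # wrd2phns -> {'0_SING': ['S', 'IH1', 'NG'], '1_[MASK]': ['[MASK]']}
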
def get_align_data(uid, prefix):
    mfa_path = prefix + "mfa_"
    mfa_text = read_2column_text(mfa_path + 'text')[uid]
    mfa_start = load_num_sequence_text(
        mfa_path + 'start', loader_type='text_float')[uid]
    mfa_end = load_num_sequence_text(
        mfa_path + 'end', loader_type='text_float')[uid]
    mfa_wav_path = read_2column_text(mfa_path + 'wav.scp')[uid]
    return mfa_text, mfa_start, mfa_end, mfa_wav_path


def get_masked_mel_boundary(mfa_start, mfa_end, fs, hop_length,
                            span_tobe_replaced):
    align_start = paddle.to_tensor(mfa_start).unsqueeze(0)
    align_end = paddle.to_tensor(mfa_end).unsqueeze(0)
    align_start = paddle.floor(fs * align_start / hop_length).int()
    align_end = paddle.floor(fs * align_end / hop_length).int()
    if span_tobe_replaced[0] >= len(mfa_start):
        span_boundary = [align_end[0].tolist()[-1], align_end[0].tolist()[-1]]
    else:
        span_boundary = [
            align_start[0].tolist()[span_tobe_replaced[0]],
            align_end[0].tolist()[span_tobe_replaced[1] - 1]
        ]
    return span_boundary


def recover_dict(word2phns, tp_word2phns):
    dic = {}
    need_del_key = []
    exist_index = []
    sp_count = 0
    add_sp_count = 0
    for key in word2phns.keys():
        idx, wrd = key.split('_')
        if wrd == 'sp':
            sp_count += 1
            exist_index.append(int(idx))
        else:
            need_del_key.append(key)
    for key in need_del_key:
        del word2phns[key]
    cur_id = 0
    for key in tp_word2phns.keys():
        if cur_id in exist_index:
            dic[str(cur_id) + "_sp"] = 'sp'
            cur_id += 1
            add_sp_count += 1
        idx, wrd = key.split('_')
        dic[str(cur_id) + "_" + wrd] = tp_word2phns[key]
        cur_id += 1
    if add_sp_count + 1 == sp_count:
        dic[str(cur_id) + "_sp"] = 'sp'
        add_sp_count += 1
    assert add_sp_count == sp_count, "sp are not added in dic"
    return dic

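# Worked example for get_masked_mel_boundary (hypothetical values): with
# fs = 24000 and hop_length = 300, an aligned span covering 1.15 s to 1.50 s
# maps to mel frames floor(24000 * 1.15 / 300) = 92 through
# floor(24000 * 1.50 / 300) = 120, i.e. span_boundary == [92, 120].
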
def get_phns_and_spans(wav_path, old_str, new_str, source_language,
                       clone_target_language):
    append_new_str = (old_str == new_str[:len(old_str)])
    old_phns, mfa_start, mfa_end = [], [], []

    if source_language == "english":
        times2, word2phns = alignment(wav_path, old_str)
    elif source_language == "chinese":
        times2, word2phns = alignment_zh(wav_path, old_str)
        _, tp_word2phns = words2phns_zh(old_str)
        for key, value in tp_word2phns.items():
            idx, wrd = key.split('_')
            cur_val = " ".join(value)
            tp_word2phns[key] = cur_val
        word2phns = recover_dict(word2phns, tp_word2phns)
    else:
        assert source_language == "chinese" or source_language == "english", \
            "source_language is wrong..."

    for item in times2:
        mfa_start.append(float(item[1]))
        mfa_end.append(float(item[2]))
        old_phns.append(item[0])

    if append_new_str and (source_language != clone_target_language):
        is_cross_lingual_clone = True
    else:
        is_cross_lingual_clone = False

    if is_cross_lingual_clone:
        new_str_origin = new_str[:len(old_str)]
        new_str_append = new_str[len(old_str):]
        if clone_target_language == "chinese":
            new_phns_origin, new_origin_word2phns = words2phns(new_str_origin)
            new_phns_append, temp_new_append_word2phns = words2phns_zh(
                new_str_append)
        elif clone_target_language == "english":
            # original sentence
            new_phns_origin, new_origin_word2phns = words2phns_zh(
                new_str_origin)
            # cloned sentence
            new_phns_append, temp_new_append_word2phns = words2phns(
                new_str_append)
        else:
            assert clone_target_language == "chinese" or clone_target_language == "english", \
                "cloning is not supported for this language, please check it."

        new_phns = new_phns_origin + new_phns_append

        new_append_word2phns = {}
        length = len(new_origin_word2phns)
        for key, value in temp_new_append_word2phns.items():
            idx, wrd = key.split('_')
            new_append_word2phns[str(int(idx) + length) + '_' + wrd] = value
        new_word2phns = dict(
            list(new_origin_word2phns.items()) + list(
                new_append_word2phns.items()))
    else:
        if source_language == clone_target_language and clone_target_language == "english":
            new_phns, new_word2phns = words2phns(new_str)
        elif source_language == clone_target_language and clone_target_language == "chinese":
            new_phns, new_word2phns = words2phns_zh(new_str)
        else:
            assert source_language == clone_target_language, \
                "source language is not the same as target language..."

    span_tobe_replaced = [0, len(old_phns) - 1]
    span_tobe_added = [0, len(new_phns) - 1]
    left_index = 0
    new_phns_left = []
    sp_count = 0
    # find the left different index
    for key in word2phns.keys():
        idx, wrd = key.split('_')
        if wrd == 'sp':
            sp_count += 1
            new_phns_left.append('sp')
        else:
            idx = str(int(idx) - sp_count)
            if idx + '_' + wrd in new_word2phns:
                left_index += len(new_word2phns[idx + '_' + wrd])
                new_phns_left.extend(word2phns[key].split())
            else:
                span_tobe_replaced[0] = len(new_phns_left)
                span_tobe_added[0] = len(new_phns_left)
                break

    # scan word2phns and new_word2phns in reverse to find the right boundary
    right_index = 0
    new_phns_right = []
    sp_count = 0
    word2phns_max_index = int(list(word2phns.keys())[-1].split('_')[0])
    new_word2phns_max_index = int(list(new_word2phns.keys())[-1].split('_')[0])
    new_phns_middle = []
    if append_new_str:
        new_phns_right = []
        new_phns_middle = new_phns[left_index:]
        span_tobe_replaced[0] = len(new_phns_left)
        span_tobe_added[0] = len(new_phns_left)
        span_tobe_added[1] = len(new_phns_left) + len(new_phns_middle)
        span_tobe_replaced[1] = len(old_phns) - len(new_phns_right)
    else:
        for key in list(word2phns.keys())[::-1]:
            idx, wrd = key.split('_')
            if wrd == 'sp':
                sp_count += 1
                new_phns_right = ['sp'] + new_phns_right
            else:
                idx = str(new_word2phns_max_index - (word2phns_max_index - int(
                    idx) - sp_count))
                if idx + '_' + wrd in new_word2phns:
                    right_index -= len(new_word2phns[idx + '_' + wrd])
                    new_phns_right = word2phns[key].split() + new_phns_right
                else:
                    span_tobe_replaced[1] = len(old_phns) - len(
                        new_phns_right)
                    new_phns_middle = new_phns[left_index:right_index]
                    span_tobe_added[1] = len(new_phns_left) + len(
                        new_phns_middle)
                    if len(new_phns_middle) == 0:
                        span_tobe_added[1] = min(span_tobe_added[1] + 1,
                                                 len(new_phns))
                        span_tobe_added[0] = max(0, span_tobe_added[0] - 1)
                        span_tobe_replaced[0] = max(0,
                                                    span_tobe_replaced[0] - 1)
                        span_tobe_replaced[1] = min(span_tobe_replaced[1] + 1,
                                                    len(old_phns))
                    break
    new_phns = new_phns_left + new_phns_middle + new_phns_right

    return mfa_start, mfa_end, old_phns, new_phns, span_tobe_replaced, span_tobe_added


def duration_adjust_factor(original_dur, pred_dur, phns):
    factor_list = []
    for ori, pred, phn in zip(original_dur, pred_dur, phns):
        if pred == 0 or phn == 'sp':
            continue
        factor_list.append(ori / pred)
    factor_list = np.array(factor_list)
    factor_list.sort()
    if len(factor_list) < 5:
        return 1
    # trim the two smallest and two largest ratios before averaging
    length = 2
    return np.average(factor_list[length:-length])

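# duration_adjust_factor computes a trimmed mean of per-phone speed ratios.
# Hedged example: ratios [0.8, 0.9, 1.0, 1.1, 1.2, 4.0] sort to the same
# order, the two extremes on each side are dropped, and the factor is
# np.average([1.0, 1.1]) = 1.05; fewer than five usable ratios return 1.
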
def prepare_features_with_duration(uid,
                                   prefix,
                                   clone_uid,
                                   clone_prefix,
                                   source_language,
                                   target_language,
                                   mlm_model,
                                   old_str,
                                   new_str,
                                   wav_path,
                                   duration_preditor_path,
                                   sid=None,
                                   mask_reconstruct=False,
                                   duration_adjust=True,
                                   start_end_sp=False,
                                   train_args=None):
    wav_org, rate = librosa.load(
        wav_path, sr=train_args.feats_extract_conf['fs'])
    fs = train_args.feats_extract_conf['fs']
    hop_length = train_args.feats_extract_conf['hop_length']

    mfa_start, mfa_end, old_phns, new_phns, span_tobe_replaced, span_tobe_added = get_phns_and_spans(
        wav_path, old_str, new_str, source_language, target_language)

    if start_end_sp:
        if new_phns[-1] != 'sp':
            new_phns = new_phns + ['sp']

    if target_language == "english":
        old_durations = evaluate_durations(
            old_phns, target_language=target_language)
    elif target_language == "chinese":
        # for both en->zh and zh->zh, old durations are predicted in the
        # source language
        old_durations = evaluate_durations(
            old_phns, target_language=source_language)
    else:
        assert target_language == "chinese" or target_language == "english", \
            "duration prediction is not supported for this language..."

    original_old_durations = [e - s for e, s in zip(mfa_end, mfa_start)]

    if '[MASK]' in new_str:
        new_phns = old_phns
        span_tobe_added = span_tobe_replaced
        d_factor_left = duration_adjust_factor(
            original_old_durations[:span_tobe_replaced[0]],
            old_durations[:span_tobe_replaced[0]],
            old_phns[:span_tobe_replaced[0]])
        d_factor_right = duration_adjust_factor(
            original_old_durations[span_tobe_replaced[1]:],
            old_durations[span_tobe_replaced[1]:],
            old_phns[span_tobe_replaced[1]:])
        d_factor = (d_factor_left + d_factor_right) / 2
        new_durations_adjusted = [d_factor * i for i in old_durations]
    else:
        if duration_adjust:
            d_factor = duration_adjust_factor(original_old_durations,
                                              old_durations, old_phns)
            d_factor = d_factor * 1.25
        else:
            d_factor = 1

        if target_language == "english":
            new_durations = evaluate_durations(
                new_phns, target_language=target_language)
        elif target_language == "chinese":
            new_durations = evaluate_durations(
                new_phns, target_language=target_language)
        new_durations_adjusted = [d_factor * i for i in new_durations]

        # keep the original duration for boundary phones shared by both texts
        if span_tobe_replaced[0] < len(old_phns) and old_phns[
                span_tobe_replaced[0]] == new_phns[span_tobe_added[0]]:
            new_durations_adjusted[span_tobe_added[
                0]] = original_old_durations[span_tobe_replaced[0]]
        if span_tobe_replaced[1] < len(old_phns) and span_tobe_added[1] < len(
                new_phns):
            if old_phns[span_tobe_replaced[1]] == new_phns[span_tobe_added[
                    1]]:
                new_durations_adjusted[span_tobe_added[
                    1]] = original_old_durations[span_tobe_replaced[1]]

    new_span_duration_sum = sum(
        new_durations_adjusted[span_tobe_added[0]:span_tobe_added[1]])
    old_span_duration_sum = sum(
        original_old_durations[span_tobe_replaced[0]:span_tobe_replaced[1]])
    duration_offset = new_span_duration_sum - old_span_duration_sum
    new_mfa_start = mfa_start[:span_tobe_replaced[0]]
    new_mfa_end = mfa_end[:span_tobe_replaced[0]]
    for i in new_durations_adjusted[span_tobe_added[0]:span_tobe_added[1]]:
        if len(new_mfa_end) == 0:
            new_mfa_start.append(0)
            new_mfa_end.append(i)
        else:
            new_mfa_start.append(new_mfa_end[-1])
            new_mfa_end.append(new_mfa_end[-1] + i)
    new_mfa_start += [
        i + duration_offset for i in mfa_start[span_tobe_replaced[1]:]
    ]
    new_mfa_end += [
        i + duration_offset for i in mfa_end[span_tobe_replaced[1]:]
    ]

    # 3. get new wav
    if span_tobe_replaced[0] >= len(mfa_start):
        left_index = len(wav_org)
        right_index = left_index
    else:
        left_index = int(np.floor(mfa_start[span_tobe_replaced[0]] * fs))
        right_index = int(np.ceil(mfa_end[span_tobe_replaced[1] - 1] * fs))
    new_blank_wav = np.zeros(
        (int(np.ceil(new_span_duration_sum * fs)), ), dtype=wav_org.dtype)
    new_wav_org = np.concatenate(
        [wav_org[:left_index], new_blank_wav, wav_org[right_index:]])

    # 4. get old and new mel spans to be masked
    old_span_boundary = get_masked_mel_boundary(
        mfa_start, mfa_end, fs, hop_length, span_tobe_replaced)  # e.g. [92, 92]
    new_span_boundary = get_masked_mel_boundary(
        new_mfa_start, new_mfa_end, fs, hop_length,
        span_tobe_added)  # e.g. [92, 174]

    return new_wav_org, new_phns, new_mfa_start, new_mfa_end, old_span_boundary, new_span_boundary

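# Timeline bookkeeping above, sketched with hypothetical numbers: if the old
# span lasted 0.8 s and the adjusted new span sums to 1.1 s, then
# duration_offset = 0.3, every alignment time after the span shifts by
# +0.3 s, and the old span's samples are replaced by 1.1 s of silence in
# new_wav_org (the model later fills in the corresponding mel frames).
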
def prepare_features(uid,
                     prefix,
                     clone_uid,
                     clone_prefix,
                     source_language,
                     target_language,
                     mlm_model,
                     processor,
                     wav_path,
                     old_str,
                     new_str,
                     duration_preditor_path,
                     sid=None,
                     duration_adjust=True,
                     start_end_sp=False,
                     mask_reconstruct=False,
                     train_args=None):
    wav_org, phns_list, mfa_start, mfa_end, old_span_boundary, new_span_boundary = prepare_features_with_duration(
        uid,
        prefix,
        clone_uid,
        clone_prefix,
        source_language,
        target_language,
        mlm_model,
        old_str,
        new_str,
        wav_path,
        duration_preditor_path,
        sid=sid,
        duration_adjust=duration_adjust,
        start_end_sp=start_end_sp,
        mask_reconstruct=mask_reconstruct,
        train_args=train_args)

    speech = np.array(wav_org, dtype=np.float32)
    align_start = np.array(mfa_start)
    align_end = np.array(mfa_end)
    token_to_id = {item: i for i, item in enumerate(train_args.token_list)}
    # map out-of-vocabulary phones to the <unk> token id
    text = np.array(
        list(
            map(lambda x: token_to_id.get(x, token_to_id['<unk>']),
                phns_list)))
    span_boundary = np.array(new_span_boundary)
    batch = [('1', {
        "speech": speech,
        "align_start": align_start,
        "align_end": align_end,
        "text": text,
        "span_boundary": span_boundary
    })]

    return batch, old_span_boundary, new_span_boundary


def decode_with_model(uid,
                      prefix,
                      clone_uid,
                      clone_prefix,
                      source_language,
                      target_language,
                      mlm_model,
                      processor,
                      collate_fn,
                      wav_path,
                      old_str,
                      new_str,
                      duration_preditor_path,
                      sid=None,
                      decoder=False,
                      use_teacher_forcing=False,
                      duration_adjust=True,
                      start_end_sp=False,
                      train_args=None):
    fs, hop_length = train_args.feats_extract_conf[
        'fs'], train_args.feats_extract_conf['hop_length']

    batch, old_span_boundary, new_span_boundary = prepare_features(
        uid,
        prefix,
        clone_uid,
        clone_prefix,
        source_language,
        target_language,
        mlm_model,
        processor,
        wav_path,
        old_str,
        new_str,
        duration_preditor_path,
        sid,
        duration_adjust=duration_adjust,
        start_end_sp=start_end_sp,
        train_args=train_args)

    feats = collate_fn(batch)[1]
    if 'text_masked_position' in feats.keys():
        feats.pop('text_masked_position')
    for k, v in feats.items():
        feats[k] = paddle.to_tensor(v)
    rtn = mlm_model.inference(
        **feats,
        span_boundary=new_span_boundary,
        use_teacher_forcing=use_teacher_forcing)
    output = rtn['feat_gen']

    # stitch the generated segments, dropping an empty leading/trailing piece
    if 0 in output[0].shape and 0 not in output[-1].shape:
        output_feat = paddle.concat(
            output[1:-1] + [output[-1].squeeze()], axis=0).cpu()
    elif 0 not in output[0].shape and 0 in output[-1].shape:
        output_feat = paddle.concat(
            [output[0].squeeze()] + output[1:-1], axis=0).cpu()
    elif 0 in output[0].shape and 0 in output[-1].shape:
        output_feat = paddle.concat(output[1:-1], axis=0).cpu()
    else:
        output_feat = paddle.concat(
            [output[0].squeeze(0)] + output[1:-1] + [output[-1].squeeze(0)],
            axis=0).cpu()

    wav_org, rate = librosa.load(
        wav_path, sr=train_args.feats_extract_conf['fs'])
    return wav_org, None, output_feat, old_span_boundary, new_span_boundary, fs, hop_length

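# The one-utterance batch built in prepare_features, i.e.
#   [('1', {'speech': ..., 'align_start': ..., 'align_end': ...,
#           'text': ..., 'span_boundary': ...})],
# is exactly the (uttid, dict-of-ndarrays) shape that MLMCollateFn below
# consumes.
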
class MLMCollateFn:
    """Functor class wrapping mlm_collate_fn()."""

    def __init__(self,
                 feats_extract,
                 float_pad_value: Union[float, int]=0.0,
                 int_pad_value: int=-32768,
                 not_sequence: Collection[str]=(),
                 mlm_prob: float=0.8,
                 mean_phn_span: int=8,
                 attention_window: int=0,
                 pad_speech: bool=False,
                 sega_emb: bool=False,
                 duration_collect: bool=False,
                 text_masking: bool=False):
        self.mlm_prob = mlm_prob
        self.mean_phn_span = mean_phn_span
        self.feats_extract = feats_extract
        self.float_pad_value = float_pad_value
        self.int_pad_value = int_pad_value
        self.not_sequence = set(not_sequence)
        self.attention_window = attention_window
        self.pad_speech = pad_speech
        self.sega_emb = sega_emb
        self.duration_collect = duration_collect
        self.text_masking = text_masking

    def __repr__(self):
        return (f"{self.__class__}(float_pad_value={self.float_pad_value}, "
                f"int_pad_value={self.int_pad_value})")

    def __call__(self, data: Collection[Tuple[str, Dict[str, np.ndarray]]]
                 ) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
        return mlm_collate_fn(
            data,
            float_pad_value=self.float_pad_value,
            int_pad_value=self.int_pad_value,
            not_sequence=self.not_sequence,
            mlm_prob=self.mlm_prob,
            mean_phn_span=self.mean_phn_span,
            feats_extract=self.feats_extract,
            attention_window=self.attention_window,
            pad_speech=self.pad_speech,
            sega_emb=self.sega_emb,
            duration_collect=self.duration_collect,
            text_masking=self.text_masking)

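# Minimal usage sketch for the collate functor (feats_extract is assumed to
# be a LogMelFBank instance, as built in build_collate_fn below):
#   collate = MLMCollateFn(feats_extract, mlm_prob=0.8, mean_phn_span=8)
#   uttids, feats = collate(batch)
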
def mlm_collate_fn(
        data: Collection[Tuple[str, Dict[str, np.ndarray]]],
        float_pad_value: Union[float, int]=0.0,
        int_pad_value: int=-32768,
        not_sequence: Collection[str]=(),
        mlm_prob: float=0.8,
        mean_phn_span: int=8,
        feats_extract=None,
        attention_window: int=0,
        pad_speech: bool=False,
        sega_emb: bool=False,
        duration_collect: bool=False,
        text_masking: bool=False
) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
    """Concatenate ndarray-lists to arrays and convert them to paddle.Tensor.

    Examples:
        >>> from espnet2.samplers.constant_batch_sampler import ConstantBatchSampler
        >>> from espnet2.train.dataset import ESPnetDataset
        >>> sampler = ConstantBatchSampler(...)
        >>> dataset = ESPnetDataset(...)
        >>> keys = next(iter(sampler))
        >>> batch = [dataset[key] for key in keys]
        >>> batch = mlm_collate_fn(batch)
        >>> model(**batch)

    Note that the dict-keys of batch are propagated from
    that of the dataset as they are.
    """
    uttids = [u for u, _ in data]
    data = [d for _, d in data]

    assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching"
    assert all(not k.endswith("_lengths")
               for k in data[0]), f"*_lengths is reserved: {list(data[0])}"

    output = {}
    for key in data[0]:
        # NOTE(kamo):
        # Each model that finally accepts these values is responsible for
        # repainting the pad_value to the value desired for its task.
        if data[0][key].dtype.kind == "i":
            pad_value = int_pad_value
        else:
            pad_value = float_pad_value

        array_list = [d[key] for d in data]

        # Assume the first axis is length:
        # tensor_list: Batch x (Length, ...)
        tensor_list = [paddle.to_tensor(a) for a in array_list]
        # tensor: (Batch, Length, ...)
        tensor = pad_list(tensor_list, pad_value)
        output[key] = tensor

        # lens: (Batch,)
        if key not in not_sequence:
            lens = paddle.to_tensor(
                [d[key].shape[0] for d in data], dtype=paddle.int64)
            output[key + "_lengths"] = lens

    feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0]))
    feats = paddle.to_tensor(feats)
    feats_lengths = paddle.shape(feats)[0]
    feats = paddle.unsqueeze(feats, 0)
    if 'text' not in output:
        text = paddle.zeros_like(feats_lengths.unsqueeze(-1)) - 2
        text_lengths = paddle.zeros_like(feats_lengths) + 1
        max_tlen = 1
        align_start = paddle.zeros_like(text)
        align_end = paddle.zeros_like(text)
        align_start_lengths = paddle.zeros_like(feats_lengths)
        align_end_lengths = paddle.zeros_like(feats_lengths)
        sega_emb = False
        mean_phn_span = 0
        mlm_prob = 0.15
    else:
        text, text_lengths = output["text"], output["text_lengths"]
        align_start, align_start_lengths, align_end, align_end_lengths = output[
            "align_start"], output["align_start_lengths"], output[
                "align_end"], output["align_end_lengths"]
        align_start = paddle.floor(feats_extract.sr * align_start /
                                   feats_extract.hop_length).int()
        align_end = paddle.floor(feats_extract.sr * align_end /
                                 feats_extract.hop_length).int()
        max_tlen = max(text_lengths).item()
    max_slen = max(feats_lengths).item()
    speech_pad = feats[:, :max_slen]
    if attention_window > 0 and pad_speech:
        speech_pad, max_slen = pad_to_longformer_att_window(
            speech_pad, max_slen, max_slen, attention_window)
    max_len = max_slen + max_tlen
    if attention_window > 0:
        text_pad, max_tlen = pad_to_longformer_att_window(
            text, max_len, max_tlen, attention_window)
    else:
        text_pad = text
    text_mask = make_non_pad_mask(
        text_lengths.tolist(), text_pad, length_dim=1).unsqueeze(-2)
    if attention_window > 0:
        text_mask = text_mask * 2
    speech_mask = make_non_pad_mask(
        feats_lengths.tolist(), speech_pad[:, :, 0],
        length_dim=1).unsqueeze(-2)
    span_boundary = None
    if 'span_boundary' in output.keys():
        span_boundary = output['span_boundary']

    if text_masking:
        masked_position, text_masked_position, _ = phones_text_masking(
            speech_pad, speech_mask, text_pad, text_mask, align_start,
            align_end, align_start_lengths, mlm_prob, mean_phn_span,
            span_boundary)
    else:
        text_masked_position = np.zeros(text_pad.shape)
        masked_position, _ = phones_masking(
            speech_pad, speech_mask, align_start, align_end,
            align_start_lengths, mlm_prob, mean_phn_span, span_boundary)

    output_dict = {}
    if duration_collect and 'text' in output:
        reordered_index, speech_segment_pos, text_segment_pos, durations, feats_lengths = get_segment_pos_reduce_duration(
            speech_pad, text_pad, align_start, align_end,
            align_start_lengths, sega_emb, masked_position, feats_lengths)
        speech_mask = make_non_pad_mask(
            feats_lengths.tolist(),
            speech_pad[:, :reordered_index.shape[1], 0],
            length_dim=1).unsqueeze(-2)
        output_dict['durations'] = durations
        output_dict['reordered_index'] = reordered_index
    else:
        speech_segment_pos, text_segment_pos = get_segment_pos(
            speech_pad, text_pad, align_start, align_end,
            align_start_lengths, sega_emb)
    output_dict['speech'] = speech_pad
    output_dict['text'] = text_pad
    output_dict['masked_position'] = masked_position
    output_dict['text_masked_position'] = text_masked_position
    output_dict['speech_mask'] = speech_mask
    output_dict['text_mask'] = text_mask
    output_dict['speech_segment_pos'] = speech_segment_pos
    output_dict['text_segment_pos'] = text_segment_pos
    output_dict['speech_lengths'] = output["speech_lengths"]
    output_dict['text_lengths'] = text_lengths
    output = (uttids, output_dict)
    return output

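# Padding sketch for mlm_collate_fn (hypothetical two-utterance batch): text
# arrays of lengths 5 and 8 are right-padded with int_pad_value into a (2, 8)
# tensor and output['text_lengths'] becomes [5, 8]; at inference this script
# always passes a single utterance, so no padding is actually added.
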
def build_collate_fn(args: argparse.Namespace, train: bool, epoch=-1):
    feats_extract_class = LogMelFBank
    if args.feats_extract_conf['win_length'] is None:
        args.feats_extract_conf['win_length'] = args.feats_extract_conf[
            'n_fft']
    # LogMelFBank expects 'sr' where the training config says 'fs'
    args_dic = {}
    for k, v in args.feats_extract_conf.items():
        if k == 'fs':
            args_dic['sr'] = v
        else:
            args_dic[k] = v
    feats_extract = feats_extract_class(**args_dic)

    sega_emb = True if args.encoder_conf['input_layer'] == 'sega_mlm' else False
    if args.encoder_conf['selfattention_layer_type'] == 'longformer':
        attention_window = args.encoder_conf['attention_window']
        pad_speech = True if 'pre_speech_layer' in args.encoder_conf and args.encoder_conf[
            'pre_speech_layer'] > 0 else False
    else:
        attention_window = 0
        pad_speech = False
    if epoch == -1:
        mlm_prob_factor = 1
    else:
        mlm_probs = [1.0, 1.0, 0.7, 0.6, 0.5]
        mlm_prob_factor = 0.8  # mlm_probs[epoch // 100]
    if 'duration_predictor_layers' in args.model_conf.keys(
    ) and args.model_conf['duration_predictor_layers'] > 0:
        duration_collect = True
    else:
        duration_collect = False

    return MLMCollateFn(
        feats_extract,
        float_pad_value=0.0,
        int_pad_value=0,
        mlm_prob=args.model_conf['mlm_prob'] * mlm_prob_factor,
        mean_phn_span=args.model_conf['mean_phn_span'],
        attention_window=attention_window,
        pad_speech=pad_speech,
        sega_emb=sega_emb,
        duration_collect=duration_collect)


def get_mlm_output(uid,
                   prefix,
                   clone_uid,
                   clone_prefix,
                   source_language,
                   target_language,
                   model_name,
                   wav_path,
                   old_str,
                   new_str,
                   duration_preditor_path,
                   sid=None,
                   decoder=False,
                   use_teacher_forcing=False,
                   dynamic_eval=(0, 0),
                   duration_adjust=True,
                   start_end_sp=False):
    mlm_model, train_args = load_model(model_name)
    mlm_model.eval()
    processor = None
    collate_fn = build_collate_fn(train_args, False)

    return decode_with_model(
        uid,
        prefix,
        clone_uid,
        clone_prefix,
        source_language,
        target_language,
        mlm_model,
        processor,
        collate_fn,
        wav_path,
        old_str,
        new_str,
        duration_preditor_path,
        sid=sid,
        decoder=decoder,
        use_teacher_forcing=use_teacher_forcing,
        duration_adjust=duration_adjust,
        start_end_sp=start_end_sp,
        train_args=train_args)


def test_vctk(uid,
              clone_uid,
              clone_prefix,
              source_language,
              target_language,
              use_pt_vocoder,
              prefix='dump/raw/dev',
              model_name="conformer",
              old_str="",
              new_str="",
              prompt_decoding=False,
              dynamic_eval=(0, 0),
              task_name=None):
    duration_preditor_path = None
    spemd = None
    full_origin_str, wav_path = read_data(uid, prefix)

    if task_name == 'edit':
        new_str = new_str
    elif task_name == 'synthesize':
        new_str = full_origin_str + new_str
    else:
        new_str = full_origin_str + ' '.join(
            [ch for ch in new_str if is_chinese(ch)])

    print('new_str is ', new_str)

    if not old_str:
        old_str = full_origin_str

    results_dict, old_span = plot_mel_and_vocode_wav(
        uid, prefix, clone_uid, clone_prefix, source_language,
        target_language, model_name, wav_path, full_origin_str, old_str,
        new_str, use_pt_vocoder, duration_preditor_path, sid=spemd)
    return results_dict


if __name__ == "__main__":
    # parse config and args
    args = parse_args()
    data_dict = test_vctk(
        args.uid,
        args.clone_uid,
        args.clone_prefix,
        args.source_language,
        args.target_language,
        args.use_pt_vocoder,
        args.prefix,
        args.model_name,
        new_str=args.new_str,
        task_name=args.task_name)
    sf.write(args.output_name, data_dict['output'], samplerate=24000)
    print("finished...")
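# Example invocation (flag names follow the attributes read from
# sedit_arg_parser.parse_args; the values below are placeholders):
#   python3 inference.py --uid <utt_id> --prefix dump/raw/dev \
#       --source_language english --target_language english \
#       --model_name conformer --task_name edit \
#       --new_str "..." --output_name pred.wav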