import logging
from pathlib import Path
from typing import Dict
from typing import List
from typing import Union


# Maps each supported ``loader_type`` to (field delimiter, element converter).
_NUM_SEQUENCE_LOADERS = {
    "text_int": (" ", int),
    "text_float": (" ", float),
    "csv_int": (",", int),
    "csv_float": (",", float),
}


def read_2col_text(path: Union[Path, str]) -> Dict[str, str]:
    """Read a two-column text file into a dict.

    Each line is split on the first run of whitespace: the first token
    becomes the key and the rest of the line (trailing whitespace removed)
    becomes the value. A line holding only a key maps to an empty string.
    Blank or whitespace-only lines are skipped.

    Examples:
        wav.scp:
            key1 /some/path/a.wav
            key2 /some/path/b.wav

        >>> read_2col_text('wav.scp')
        {'key1': '/some/path/a.wav',
         'key2': '/some/path/b.wav'}

    Args:
        path: Location of the UTF-8 encoded text file.

    Returns:
        A dict mapping every key to its value string.

    Raises:
        RuntimeError: If the same key appears on more than one line.
    """
    data = {}
    with Path(path).open("r", encoding="utf-8") as f:
        for linenum, line in enumerate(f, 1):
            sps = line.rstrip().split(maxsplit=1)
            if not sps:
                # A blank line has no key at all; unpacking it would fail
                # with an opaque "not enough values to unpack" error.
                continue
            if len(sps) == 1:
                k, v = sps[0], ""
            else:
                k, v = sps
            if k in data:
                raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
            data[k] = v
    return data


def load_num_sequence_text(
    path: Union[Path, str], loader_type: str = "csv_int"
) -> Dict[str, List[Union[float, int]]]:
    """Read a text file whose lines are a key followed by a number sequence.

    Examples:
        text (loader_type="text_int"):
            key1 1 2 3
            key2 34 5 6

        >>> d = load_num_sequence_text('text', loader_type='text_int')
        >>> d["key1"]
        [1, 2, 3]

    Args:
        path: Location of the UTF-8 encoded text file.
        loader_type: One of "text_int", "text_float", "csv_int" or
            "csv_float". The "text_*" variants split the value on a single
            space, the "csv_*" variants on a comma; the suffix selects the
            element type.

    Returns:
        A dict mapping every key to a list of ints or floats.

    Raises:
        ValueError: If ``loader_type`` is not supported, or if an element
            cannot be converted to the requested number type.
        RuntimeError: If the same key appears on more than one line.
    """
    # Validate the loader type first so a bad argument fails before any I/O.
    try:
        delimiter, dtype = _NUM_SEQUENCE_LOADERS[loader_type]
    except KeyError:
        raise ValueError(f"Not supported loader_type={loader_type}") from None

    # path looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> return {'utta': [1, 0],
    #            'uttb': [3, 4, 5]}
    d = read_2col_text(path)

    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = [dtype(i) for i in v.split(delimiter)]
        except (TypeError, ValueError):
            # int()/float() raise ValueError on malformed text; log which
            # entry is broken, then let the original exception propagate.
            logging.error(
                'Error happened with path="%s", id="%s", value="%s"', path, k, v
            )
            raise
    return retval