# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
from collections import Counter
import os
import copy
import tempfile

from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.utility import UNK
from deepspeech.frontend.utility import BLANK
from deepspeech.frontend.utility import SOS
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
add_arg('vocab_path', str,
        'examples/librispeech/data/vocab.txt',
        "Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
        None,
        "Filepaths of manifests for building vocabulary. "
        "You can provide multiple manifest files.",
        nargs='+',
        required=True)
# bpe
add_arg('bpe_model_prefix', str,
        "bpe_model_%(bpe_mode)_%(count_threshold)",
        "bpe model prefix, only need when `unit_type` is bpe")
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
# yapf: enable
args = parser.parse_args()


def _read_vocab(vocab_path):
    """Load the vocabulary file into a ``token -> id`` mapping.

    Each line of the file is one token (surrounding whitespace stripped).
    Ids are assigned in order of first appearance; a token that appears
    again is re-assigned the current size of the mapping, exactly as the
    line-by-line ``vocab[token] = len(vocab)`` update dictates.

    Args:
        vocab_path: Path of the UTF-8 encoded vocabulary file.

    Returns:
        dict mapping each token string to its integer id.
    """
    vocab = dict()
    with open(vocab_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            vocab[line.strip()] = len(vocab)
    return vocab


def _split_units(text, unit_type):
    """Split a transcript into modeling units for non-BPE unit types.

    Args:
        text: The transcript string.
        unit_type: 'character' splits into single characters, 'word' splits
            on whitespace.

    Returns:
        list of token strings. Any other ``unit_type`` yields an empty
        list, which keeps the script's long-standing behavior of emitting
        empty token fields instead of failing.
    """
    if unit_type == 'character':
        return list(text)
    if unit_type == 'word':
        return text.split()
    return []


def main():
    """Annotate every manifest entry with token metadata and write it out.

    Reads the vocabulary from ``args.vocab_path``, tokenizes the ``text``
    field of every entry of every manifest in ``args.manifest_paths``
    according to ``args.unit_type`` and writes one JSON object per line to
    ``args.output_path`` with three added keys:

    * ``token``: list of token strings,
    * ``token_id``: list of vocabulary ids of those tokens,
    * ``token_shape``: ``(number of tokens, vocabulary size)``.

    Raises:
        KeyError: If a token is not present in the vocabulary.
    """
    print_arguments(args)

    vocab = _read_vocab(args.vocab_path)
    vocab_size = len(vocab)

    if args.unit_type == 'bpe':
        # Imported lazily so that sentencepiece is only required in bpe mode.
        import sentencepiece as spm

        # Load the model before the output file is opened: a missing model
        # must not truncate an already existing output file.
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_prefix + '.model')

        def tokenize(text):
            # An empty transcript has no pieces. Returning an empty list
            # mirrors what the character/word units produce for empty text
            # (the previous code crashed here on an undefined counter).
            text = text.strip()
            if not text:
                return []
            return list(sp.EncodeAsPieces(text))
    else:

        def tokenize(text):
            return _split_units(text, args.unit_type)

    # The context manager guarantees the output is flushed and closed even
    # when a manifest cannot be read or a token is missing from the vocab.
    with open(args.output_path, 'w', encoding='utf-8') as fout:
        for manifest_path in args.manifest_paths:
            for line_json in read_manifest(manifest_path):
                tokens = tokenize(line_json['text'])
                tokenids = [vocab[token] for token in tokens]
                line_json['token'] = tokens
                line_json['token_id'] = tokenids
                line_json['token_shape'] = (len(tokenids), vocab_size)
                fout.write(json.dumps(line_json) + '\n')


if __name__ == '__main__':
    main()