Commit c7d9b115 authored by H Hui Zhang

format

Parent caf72258
......@@ -12,6 +12,8 @@ exclude =
.git,
# python cache
__pycache__,
# third party
utils/compute-wer.py,
third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
......
......@@ -40,6 +40,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor']
@cli_register(
name='paddlespeech.asr', description='Speech to text infer command.')
class ASRExecutor(BaseExecutor):
......@@ -278,7 +279,8 @@ class ASRExecutor(BaseExecutor):
self._outputs["result"] = result_transcripts[0]
elif "conformer" in model_type or "transformer" in model_type:
logger.info(f"we will use the transformer like model : {model_type}")
logger.info(
f"we will use the transformer like model : {model_type}")
try:
result_transcripts = self.model.decode(
audio,
......
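For context, a minimal sketch of how this executor is typically driven, assuming the public PaddleSpeech CLI Python API (`paddlespeech.cli.asr.infer.ASRExecutor`); the file name and model name are illustrative placeholders:

```python
# Sketch only: assumes the ASRExecutor callable API from the PaddleSpeech CLI.
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
# "conformer"/"transformer" model names take the decode branch whose logging
# is being rewrapped in the hunk above; deepspeech-style models take the other branch.
text = asr(model='conformer_wenetspeech', lang='zh', sample_rate=16000,
           audio_file='./zh.wav')
print(text)
```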
......@@ -305,6 +305,7 @@ class ASRClientExecutor(BaseExecutor):
return res['asr_results']
@cli_client_register(
name='paddlespeech_client.cls', description='visit cls service')
class CLSClientExecutor(BaseExecutor):
......
......@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import paddle
from paddlespeech.cli.log import logger
from paddlespeech.s2t.utils.utility import log_add
......
......@@ -36,7 +36,7 @@ class ASRAudioHandler:
x_len = len(samples)
chunk_size = 85 * 16  # 85 ms at sample_rate = 16kHz (1360 samples)
if x_len % chunk_size!= 0:
if x_len % chunk_size != 0:
padding_len_x = chunk_size - x_len % chunk_size
else:
padding_len_x = 0
......
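The hunk above only fixes spacing, but the padding rule it touches is worth seeing end to end: the sample count is rounded up to a multiple of the chunk size. A self-contained sketch:

```python
import numpy as np

# Same rounding-up logic as ASRAudioHandler: zero-pad the waveform so its
# length is an exact multiple of chunk_size (85 * 16 = 1360 samples = 85 ms @ 16 kHz).
samples = np.zeros(20000, dtype="float32")  # stand-in waveform
chunk_size = 85 * 16
x_len = len(samples)
padding_len_x = (chunk_size - x_len % chunk_size) % chunk_size
padded = np.pad(samples, (0, padding_len_x))
assert len(padded) % chunk_size == 0
```

The trailing `% chunk_size` folds the if/else from the diff into a single expression.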
......@@ -20,11 +20,11 @@ A few sklearn functions are modified in this script as per requirement.
import argparse
import copy
import warnings
from distutils.util import strtobool
import numpy as np
import scipy
import sklearn
from distutils.util import strtobool
from scipy import linalg
from scipy import sparse
from scipy.sparse.csgraph import connected_components
......
......@@ -2,6 +2,7 @@
import argparse
from collections import Counter
def main(args):
counter = Counter()
with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
......@@ -20,21 +21,16 @@ def main(args):
fout.write(f"{word}\t{val}\n")
fout.flush()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).')
parser.add_argument(
'--has_key',
default=True,
help='whether each line starts with an utt key')
'--has_key', default=True, help='whether each line starts with an utt key')
parser.add_argument(
'--text',
required=True,
help='text path. line: utt1 中国 人 or 中国 人')
'--text', required=True, help='text path. line: utt1 中国 人 or 中国 人')
parser.add_argument(
'--lexicon',
required=True,
help='lexicon path. line:中国 中 国')
'--lexicon', required=True, help='lexicon path. line:中国 中 国')
args = parser.parse_args()
print(args)
......
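To make the script's intent concrete, here is a sketch of the word-to-character lexicon conversion it performs, assuming the elided body counts words with the `Counter` and writes one `word<TAB>char char ...` line per distinct word (the exact details sit in the collapsed lines above):

```python
from collections import Counter

lines = ["utt1 中国 人", "utt2 中国"]  # --has_key: first field is the utt id
counter = Counter()
for line in lines:
    counter.update(line.split()[1:])  # drop the utterance key
for word in counter:
    print(f"{word}\t{' '.join(word)}")  # e.g. 中国 -> "中国<TAB>中 国"
```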
#!/usr/bin/env python3
# modify from https://sites.google.com/site/homepageoffuyanwei/Home/remarksandexcellentdiscussion/page-2
class Word:
def __init__(self,text = '',freq = 0):
def __init__(self, text='', freq=0):
self.text = text
self.freq = freq
self.length = len(text)
class Chunk:
def __init__(self,w1,w2 = None,w3 = None):
def __init__(self, w1, w2=None, w3=None):
self.words = []
self.words.append(w1)
if w2:
......@@ -44,8 +45,8 @@ class Chunk:
sum += word.freq
return sum
class ComplexCompare:
class ComplexCompare:
def takeHightest(self, chunks, comparator):
i = 1
for j in range(1, len(chunks)):
......@@ -59,23 +60,27 @@ class ComplexCompare:
# The four functions below implement the four disambiguation rules of the mmseg algorithm; they are its core
def mmFilter(self, chunks):
def comparator(a,b):
def comparator(a, b):
return a.totalWordLength() - b.totalWordLength()
return self.takeHightest(chunks, comparator)
def lawlFilter(self,chunks):
def comparator(a,b):
def lawlFilter(self, chunks):
def comparator(a, b):
return a.averageWordLength() - b.averageWordLength()
return self.takeHightest(chunks,comparator)
def svmlFilter(self,chunks):
def comparator(a,b):
return self.takeHightest(chunks, comparator)
def svmlFilter(self, chunks):
def comparator(a, b):
return b.standardDeviation() - a.standardDeviation()
return self.takeHightest(chunks, comparator)
def logFreqFilter(self,chunks):
def comparator(a,b):
def logFreqFilter(self, chunks):
def comparator(a, b):
return a.wordFrequency() - b.wordFrequency()
return self.takeHightest(chunks, comparator)
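The four comparators above are meant as successive tie-breakers; a sketch of the standard mmseg chain, mirroring the `svmlFilter`/`logFreqFilter` calls visible in `Analysis` further down (each rule is only consulted while more than one candidate chunk survives):

```python
# Sketch of the usual mmseg disambiguation order; `cmp` is a ComplexCompare.
def disambiguate(chunks, cmp):
    chunks = cmp.mmFilter(chunks)           # rule 1: max total word length
    if len(chunks) > 1:
        chunks = cmp.lawlFilter(chunks)     # rule 2: max average word length
    if len(chunks) > 1:
        chunks = cmp.svmlFilter(chunks)     # rule 3: smallest length variation
    if len(chunks) > 1:
        chunks = cmp.logFreqFilter(chunks)  # rule 4: max single-char word frequency
    return chunks
```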
......@@ -83,6 +88,7 @@ class ComplexCompare:
dictWord = {}
maxWordLength = 0
def loadDictChars(filepath):
global maxWordLength
fsock = open(filepath)
......@@ -90,18 +96,22 @@ def loadDictChars(filepath):
freq, word = line.split()
word = word.strip()
dictWord[word] = (len(word), int(freq))
maxWordLength = len(word) if maxWordLength < len(word) else maxWordLength
maxWordLength = len(word) if maxWordLength < len(
word) else maxWordLength
fsock.close()
def loadDictWords(filepath):
global maxWordLength
fsock = open(filepath)
for line in fsock.readlines():
word = line.strip()
dictWord[word] = (len(word), 0)
maxWordLength = len(word) if maxWordLength < len(word) else maxWordLength
maxWordLength = len(word) if maxWordLength < len(
word) else maxWordLength
fsock.close()
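Both loaders expect plain-text dictionaries. Based on the `freq, word = line.split()` parse above, `chars.dic` holds one "freq word" pair per line and `words.dic` one bare word per line; the contents below are made up for illustration:

```python
# Illustrative chars.dic contents and the same parse as loadDictChars.
chars_dic = "1000 的\n500 中"
for line in chars_dic.splitlines():
    freq, word = line.split()
    print(word, int(freq))  # -> 的 1000, then 中 500
```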
# Check whether `word` is present in the dictionary dictWord
def getDictWord(word):
result = dictWord.get(word)
......@@ -109,14 +119,15 @@ def getDictWord(word):
return Word(word, result[1])
return None
# Load the dictionaries
def run():
from os.path import join, dirname
loadDictChars(join(dirname(__file__), 'data', 'chars.dic'))
loadDictWords(join(dirname(__file__), 'data', 'words.dic'))
class Analysis:
class Analysis:
def __init__(self, text):
self.text = text
self.cacheSize = 3
......@@ -134,11 +145,10 @@ class Analysis:
if not dictWord:
run()
def __iter__(self):
while True:
token = self.getNextToken()
if token == None:
if token is None:
raise StopIteration
yield token
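One caveat the `is None` fix does not address: since Python 3.7 (PEP 479), `raise StopIteration` inside a generator is converted to `RuntimeError`, so this `__iter__` crashes at end of input on modern interpreters. A PEP 479-safe sketch of the same method:

```python
# `return` ends a generator cleanly; `raise StopIteration` inside one
# raises RuntimeError on Python 3.7+.
def __iter__(self):
    while True:
        token = self.getNextToken()
        if token is None:
            return
        yield token
```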
......@@ -146,7 +156,7 @@ class Analysis:
return self.text[self.pos]
# Check whether the character is a Chinese character (Chinese punctuation excluded)
def isChineseChar(self,charater):
def isChineseChar(self, charater):
return 0x4e00 <= ord(charater) < 0x9fa6
# Check whether the character is ASCII
......@@ -163,8 +173,8 @@ class Analysis:
while self.pos < self.textLength:
if self.isChineseChar(self.getNextChar()):
token = self.getChineseWords()
else :
token = self.getASCIIWords()+'/'
else:
token = self.getASCIIWords() + '/'
if len(token) > 0:
return token
return None
......@@ -211,7 +221,7 @@ class Analysis:
chunks = self.complexCompare.svmlFilter(chunks)
if len(chunks) > 1:
chunks = self.complexCompare.logFreqFilter(chunks)
if len(chunks) == 0 :
if len(chunks) == 0:
return ''
# At this point only one segmentation remains
......@@ -242,13 +252,13 @@ class Analysis:
for word3 in words3:
# print(word3.length, word3.text)
if word3.length == -1:
chunk = Chunk(word1,word2)
chunk = Chunk(word1, word2)
# print("Ture")
else :
chunk = Chunk(word1,word2,word3)
else:
chunk = Chunk(word1, word2, word3)
chunks.append(chunk)
elif self.pos == self.textLength:
chunks.append(Chunk(word1,word2))
chunks.append(Chunk(word1, word2))
self.pos -= len(word2.text)
elif self.pos == self.textLength:
chunks.append(Chunk(word1))
......@@ -268,7 +278,7 @@ class Analysis:
words = []
index = 0
while self.pos < self.textLength:
if index >= maxWordLength :
if index >= maxWordLength:
break
if not self.isChineseChar(self.getNextChar()):
break
......@@ -288,18 +298,18 @@ class Analysis:
word.text = 'X'
words.append(word)
self.cache[self.cacheIndex] = (self.pos,words)
self.cache[self.cacheIndex] = (self.pos, words)
self.cacheIndex += 1
if self.cacheIndex >= self.cacheSize:
self.cacheIndex = 0
return words
if __name__=="__main__":
if __name__ == "__main__":
def cuttest(text):
#cut = Analysis(text)
tmp=""
tmp = ""
try:
for word in iter(Analysis(text)):
tmp += word
......@@ -375,6 +385,8 @@ if __name__=="__main__":
cuttest(u"好人使用了它就可以解决一些问题")
cuttest(u"是因为和国家")
cuttest(u"老年搜索还支持")
cuttest(u"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
cuttest(
u"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "
)
cuttest("2022年12月30日是星期几?")
cuttest("二零二二年十二月三十日是星期几?")
......@@ -26,9 +26,9 @@ import argparse
import os
import re
import subprocess
from distutils.util import strtobool
import numpy as np
from distutils.util import strtobool
FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")
......
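These lookaround patterns pull fields out of md-eval-style score reports. A self-contained check against synthetic lines (shaped only to satisfy the regexes, not copied from real md-eval output):

```python
import re

FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")

print(FILE_IDS.findall(
    "*** Performance analysis for Speaker Diarization for all.scp ***"))
print(SCORED_SPEAKER_TIME.findall("SCORED SPEAKER TIME =3009.13 secs"))
```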
This diff is collapsed.
import os
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import jsonlines
def trans_hyp(origin_hyp,
trans_hyp = None,
trans_hyp_sclite = None):
def trans_hyp(origin_hyp, trans_hyp=None, trans_hyp_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
......@@ -24,12 +34,11 @@ def trans_hyp(origin_hyp,
if trans_hyp_sclite is not None:
with open(trans_hyp_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" +")" + "\n"
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
def trans_ref(origin_ref,
trans_ref = None,
trans_ref_sclite = None):
def trans_ref(origin_ref, trans_ref=None, trans_ref_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
......@@ -49,42 +58,48 @@ def trans_ref(origin_ref,
if trans_ref_sclite is not None:
with open(trans_ref_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" +")" + "\n"
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser(prog='format hyp file for computing CER/WER', add_help=True)
parser = argparse.ArgumentParser(
prog='format hyp file for computing CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp',
type=str,
default = None,
help='origin hyp file')
'--origin_hyp', type=str, default=None, help='origin hyp file')
parser.add_argument(
'--trans_hyp', type=str, default = None, help='hyp file for calculating CER/WER')
'--trans_hyp',
type=str,
default=None,
help='hyp file for calculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite', type=str, default = None, help='hyp file for calculating CER/WER by sclite')
'--trans_hyp_sclite',
type=str,
default=None,
help='hyp file for calculating CER/WER by sclite')
parser.add_argument(
'--origin_ref',
type=str,
default = None,
help='origin ref file')
'--origin_ref', type=str, default=None, help='origin ref file')
parser.add_argument(
'--trans_ref', type=str, default = None, help='ref file for calculating CER/WER')
'--trans_ref',
type=str,
default=None,
help='ref file for calculating CER/WER')
parser.add_argument(
'--trans_ref_sclite', type=str, default = None, help='ref file for calculating CER/WER by sclite')
'--trans_ref_sclite',
type=str,
default=None,
help='ref file for calculating CER/WER by sclite')
parser_args = parser.parse_args()
if parser_args.origin_hyp is not None:
trans_hyp(
origin_hyp = parser_args.origin_hyp,
trans_hyp = parser_args.trans_hyp,
trans_hyp_sclite = parser_args.trans_hyp_sclite, )
origin_hyp=parser_args.origin_hyp,
trans_hyp=parser_args.trans_hyp,
trans_hyp_sclite=parser_args.trans_hyp_sclite, )
if parser_args.origin_ref is not None:
trans_ref(
origin_ref = parser_args.origin_ref,
trans_ref = parser_args.trans_ref,
trans_ref_sclite = parser_args.trans_ref_sclite, )
origin_ref=parser_args.origin_ref,
trans_ref=parser_args.trans_ref,
trans_ref_sclite=parser_args.trans_ref_sclite, )
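A short usage sketch of the two entry points, matching the reformatted keyword signatures; the module name and file names here are hypothetical placeholders:

```python
# Hypothetical module/file names; signatures match the diff above.
from format_rsl import trans_hyp, trans_ref

trans_hyp(
    origin_hyp="decode.jsonl",      # jsonlines model output
    trans_hyp="hyp.text",           # plain text for CER/WER
    trans_hyp_sclite="hyp.trn")     # 'transcript(utt.wav)' lines for sclite
trans_ref(
    origin_ref="ref.jsonl",
    trans_ref="ref.text",
    trans_ref_sclite="ref.trn")
```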
......@@ -82,7 +82,10 @@ def main(args):
lexicon_table.add(word)
out_n += 1
print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
print(
f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}"
)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
......