未验证 提交 2aef6958 编写于 作者: Z Zhao Yuting 提交者: GitHub

Create preprocess.py

If the tokens in your text file are not separated by spaces, use this script to generate a new file in which a space is inserted between every pair of adjacent characters, so that each character becomes its own token.
上级 5a58a274
import argparse
import os
def process_sentence(line):
    r"""Return *line* with a single space inserted between adjacent characters.

    Every character is treated as its own token, including whitespace and a
    trailing newline already present in *line*; for example ``"ab\n"``
    becomes ``"a b \n"``. An empty string is returned unchanged.

    Args:
        line: The text whose characters should be separated by spaces.

    Returns:
        The characters of *line* joined by single spaces.
    """
    # str.join yields exactly what the manual "res += ' ' + ch" loop built,
    # but in a single pass instead of repeated string concatenation.
    return ' '.join(line)
def main(argv=None):
    """Space-separate every character of each line of a text file.

    Reads the file given by ``-input_file`` line by line, passes each line
    through ``process_sentence`` and writes the result to ``-output_file``.
    When finished, prints a completion message and the number of lines read.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Input filename")
    # Both paths are mandatory: without them open() would be called with
    # None and fail with an unhelpful TypeError instead of a usage message.
    parser.add_argument('-input_file', required=True,
                        help='path of the text file to read')
    parser.add_argument('-output_file', required=True,
                        help='path of the space-separated file to write')
    # Defaults to None so the platform's default encoding is used unless the
    # caller explicitly asks for another one (e.g. utf-8).
    parser.add_argument('-encoding', default=None,
                        help='text encoding of both files '
                             '(default: platform default)')
    args = parser.parse_args(argv)

    sentence_cnt = 0
    with open(args.input_file, 'r', encoding=args.encoding) as f, \
            open(args.output_file, 'w', encoding=args.encoding) as write_f:
        # Iterating the file yields each line (newline included) until EOF,
        # which is what the readline()/break loop did by hand.
        for line in f:
            sentence_cnt += 1
            write_f.write(process_sentence(line))
    print('preprocess over')
    print('total sentences number:', sentence_cnt)


if __name__ == "__main__":
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册