未验证 提交 ff5dd783 编写于 作者: W wuzewu 提交者: GitHub

Fix the unicode in tokenization (#27)

上级 d6db70bd
......@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import collections
import unicodedata
......@@ -70,7 +71,7 @@ def printable_text(text):
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file)
fin = codecs.open(vocab_file, "r", "UTF-8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册