未验证 提交 ff5dd783 编写于 作者: W wuzewu 提交者: GitHub

Fix the unicode in tokenization (#27)

上级 d6db70bd
...@@ -18,6 +18,7 @@ from __future__ import absolute_import ...@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import codecs
import collections import collections
import unicodedata import unicodedata
...@@ -70,7 +71,7 @@ def printable_text(text): ...@@ -70,7 +71,7 @@ def printable_text(text):
def load_vocab(vocab_file): def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary.""" """Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict() vocab = collections.OrderedDict()
fin = open(vocab_file) fin = codecs.open(vocab_file, "r", "UTF-8")
for num, line in enumerate(fin): for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t") items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2: if len(items) > 2:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册