Unverified commit ad4c773b, authored by Nyakku Shigure, committed by GitHub

[CodeStyle][py2] remove `compat` module (to_text) (#47036)

* [CodeStyle][py2] remove `compat` module (to_text)

* remove some unnecessary decode

* remove to_text definition and unittest

* Revert "remove to_text definition and unittest"

This reverts commit a6b69cb8dca8b9b031ce10ea32d1040e7e0dd267.

* remove an assertion

* empty commit
Parent 62c0abac
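The mechanical pattern applied across every file below: `paddle.compat.to_text` decoded `bytes` to `str` (recursing into lists and sets) and passed `str` through unchanged, which on Python 3 reduces to a plain `bytes.decode()` at each call site. A minimal sketch of that behavior, written fresh here for illustration (the real `paddle.compat.to_text` covered more container types):

```python
def to_text(obj, encoding='utf-8'):
    # Illustrative stand-in for the removed helper:
    # decode bytes, map over lists/sets, pass str through.
    if isinstance(obj, bytes):
        return obj.decode(encoding)
    if isinstance(obj, (list, set)):
        return type(obj)(to_text(o, encoding) for o in obj)
    return obj

line = b"hello world\n"
assert to_text(line.strip()) == line.strip().decode()  # the rewrite used below
```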
@@ -23,7 +23,6 @@ to initialize SRL model.
import tarfile
import gzip
import paddle.dataset.common
import paddle.compat as cpt
import paddle.utils.deprecated as deprecated
from six.moves import zip, range
@@ -90,8 +89,8 @@ def corpus_reader(data_path, words_name, props_name):
labels = []
one_seg = []
for word, label in zip(words_file, props_file):
word = cpt.to_text(word.strip())
label = cpt.to_text(label.strip().split())
word = word.strip().decode()
label = label.strip().decode().split()
if len(label) == 0: # end of sentence
for i in range(len(one_seg[0])):
......
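The reordering in this hunk is safe: `bytes.strip()` and `str.strip()` agree on ASCII whitespace, and splitting after decoding matches the old element-wise decode of a list of `bytes`. A quick sanity check with illustrative values, not taken from the dataset:

```python
word = b"  plays \n"
assert word.strip().decode() == "plays"

label = b"(A0*) (V*)\n"
# Old: decode each bytes token of label.strip().split();
# new: decode once, then split the text.
assert [t.decode() for t in label.strip().split()] == label.strip().decode().split()
```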
@@ -29,7 +29,6 @@ import paddle.utils.deprecated as deprecated
import re
import functools
import six
import paddle.compat as cpt
__all__ = []
@@ -112,7 +111,7 @@ def __initialize_meta_info__():
categories_set = set()
with package.open('ml-1m/movies.dat') as movie_file:
for i, line in enumerate(movie_file):
line = cpt.to_text(line, encoding='latin')
line = line.decode(encoding='latin')
movie_id, title, categories = line.strip().split('::')
categories = categories.split('|')
for c in categories:
@@ -137,7 +136,7 @@ def __initialize_meta_info__():
USER_INFO = dict()
with package.open('ml-1m/users.dat') as user_file:
for line in user_file:
line = cpt.to_text(line, encoding='latin')
line = line.decode(encoding='latin')
uid, gender, age, job, _ = line.strip().split("::")
USER_INFO[int(uid)] = UserInfo(index=uid,
gender=gender,
@@ -152,7 +151,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating:
for line in rating:
line = cpt.to_text(line, encoding='latin')
line = line.decode(encoding='latin')
if (np.random.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid)
......
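`'latin'` is Python's codec alias for ISO-8859-1, under which every byte value maps to exactly one code point, so this decode can never raise, unlike UTF-8 on the same data. A small illustration (not from the commit):

```python
blob = bytes(range(256))                 # every possible byte value
text = blob.decode(encoding='latin')     # 'latin' == 'latin_1' == ISO-8859-1
assert len(text) == 256                  # one code point per byte, no errors

try:
    blob.decode('utf-8')                 # the same bytes are not valid UTF-8
except UnicodeDecodeError:
    pass
```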
@@ -24,7 +24,6 @@ import six
import tarfile
import paddle.dataset.common
import paddle.compat as cpt
import paddle.utils.deprecated as deprecated
__all__ = []
@@ -52,7 +51,7 @@ def __read_to_dict(tar_file, dict_size):
out_dict = dict()
for line_count, line in enumerate(fd):
if line_count < size:
out_dict[cpt.to_text(line.strip())] = line_count
out_dict[line.strip().decode()] = line_count
else:
break
return out_dict
@@ -84,7 +83,7 @@ def reader_creator(tar_file, file_name, dict_size):
]
for name in names:
for line in f.extractfile(name):
line = cpt.to_text(line)
line = line.decode()
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
......
@@ -34,7 +34,6 @@ import tarfile
from collections import defaultdict
import paddle
import paddle.compat as cpt
import paddle.utils.deprecated as deprecated
__all__ = []
@@ -54,7 +53,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"):
line = cpt.to_text(line)
line = line.decode()
line_split = line.strip().split("\t")
if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1]
@@ -83,9 +82,9 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = cpt.to_text(line.strip())
word_dict[idx] = line.strip().decode()
else:
word_dict[cpt.to_text(line.strip())] = idx
word_dict[line.strip().decode()] = idx
return word_dict
@@ -116,7 +115,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name):
line = cpt.to_text(line)
line = line.decode()
line_split = line.strip().split("\t")
if len(line_split) != 2:
continue
......
@@ -14,7 +14,6 @@
import os
import paddle
import paddle.compat as cpt
from ..ps.utils.public import *
from paddle.framework import core
from paddle.distributed.passes.pass_base import PassBase, register_pass
@@ -707,7 +706,7 @@ class PsGpuPass(PassBase):
if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse":
continue
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, cpt.to_text(set()), [])
op.desc, set(), [])
for op_desc in grad_op_desc:
new_op_desc = program.global_block().desc._insert_op(
insert_index + 1)
......
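The second argument to `core.get_grad_op_desc` is the set of no-grad variable names; `cpt.to_text(set())` only decoded the (nonexistent) elements of an empty set, so passing `set()` straight through is behavior-preserving. A pure-Python check of that equivalence, using a hypothetical stand-in for the removed helper:

```python
def to_text_set(s):
    # hypothetical stand-in for cpt.to_text applied to a set of names
    return {x.decode() if isinstance(x, bytes) else x for x in s}

assert to_text_set(set()) == set()                       # the case in this pass
assert to_text_set({b"w@GRAD", "b@GRAD"}) == {"w@GRAD", "b@GRAD"}
```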
@@ -423,10 +423,8 @@ def _some_in_set_(cands, s):
"""
if len(cands) == 0:
return False
literal_set = cpt.to_text(s)
literal_cands = cpt.to_text(cands)
for c in literal_cands:
if c in literal_set:
for c in cands:
if c in s:
return True
return False
@@ -437,7 +435,6 @@ def _strip_grad_suffix_(name):
e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y
"""
name = cpt.to_text(name)
pos = name.find(core.grad_var_suffix())
new_name = name[:pos] if pos != -1 else name
new_pos = name.rfind('grad/')
@@ -449,7 +446,7 @@ def _append_grad_suffix_(name):
Append grad suffix to the given variable name
e.g. x ==> x@GRAD
"""
return cpt.to_text(name) + core.grad_var_suffix()
return name + core.grad_var_suffix()
def _accumulate_gradients_by_sum_op_(var_name,
@@ -967,7 +964,7 @@ def _append_backward_ops_with_checkpoints_(block,
"invoke op: %s" %
_pretty_op_desc_(op.desc, "with_sub_block"))
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, cpt.to_text(no_grad_dict[block.idx]), [])
op.desc, no_grad_dict[block.idx], [])
# record the mapping between fwd and bwd
if grad_op_id_to_fwd_op is not None:
@@ -993,7 +990,7 @@ def _append_backward_ops_with_checkpoints_(block,
"invoke op: %s" %
_pretty_op_desc_(op.desc, "with_sub_block"))
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, cpt.to_text(no_grad_dict[block.idx]), [])
op.desc, no_grad_dict[block.idx], [])
# record the mapping between fwd and bwd
if grad_op_id_to_fwd_op is not None:
@@ -1055,7 +1052,7 @@ def _append_backward_ops_with_checkpoints_(block,
# 3.c. add backward ops for all ops in current segment
for op_desc in reversed(added_descs):
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op_desc, cpt.to_text(no_grad_dict[block.idx]), [])
op_desc, no_grad_dict[block.idx], [])
# record the mapping between fwd and bwd
if grad_op_id_to_fwd_op is not None:
@@ -1239,7 +1236,7 @@ def _append_backward_ops_(block,
# Getting op's corresponding grad_op
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
op.desc, no_grad_dict[block.idx], grad_sub_block_list)
# record the mapping between fwd and bwd
if grad_op_id_to_fwd_op is not None:
@@ -1841,7 +1838,7 @@ def append_backward(loss,
params_and_grads = []
op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
for param in parameters:
if cpt.to_text(param) not in grad_info_map:
if param not in grad_info_map:
continue
grad_info = grad_info_map[param]
grad_block = grad_info[1]
......
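With `name` guaranteed to be `str`, `_strip_grad_suffix_` and `_append_grad_suffix_` reduce to plain string surgery. A stand-alone sketch of the core logic, assuming `core.grad_var_suffix()` returns `'@GRAD'` as the docstring examples imply (the real function also handles a `grad/` prefix, elided from this hunk):

```python
GRAD_SUFFIX = '@GRAD'  # assumed value of core.grad_var_suffix()

def strip_grad_suffix(name: str) -> str:
    pos = name.find(GRAD_SUFFIX)
    return name[:pos] if pos != -1 else name

assert strip_grad_suffix('x@GRAD') == 'x'
assert strip_grad_suffix('y@GRAD@RENAME@1') == 'y'
assert 'x' + GRAD_SUFFIX == 'x@GRAD'   # what _append_grad_suffix_ now returns
```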
@@ -17,7 +17,6 @@ import os
import six
import sys
import warnings
from .. import compat as cpt
from . import framework
from .framework import _get_paddle_place, _get_paddle_place_list
from .framework import cuda_places, cpu_places, xpu_places
@@ -418,10 +417,10 @@ class CompiledProgram(object):
for node in self._graph.nodes():
if node.is_var() and node.var() is not None and node.var().persistable() and \
node.var().type() != core.VarDesc.VarType.RAW:
name = cpt.to_text(node.name())
name = node.name()
if self._program is not None and _should_broadcast_or_not_exists(
self._program, name):
self._persistable_vars.append(cpt.to_text(node.name()))
self._persistable_vars.append(node.name())
places = list(map(_place_obj, places))
@@ -433,9 +432,9 @@ class CompiledProgram(object):
return core.ParallelExecutor(
places, self._persistable_vars,
cpt.to_text(self._loss_name) if self._loss_name else six.u(''),
self._scope, self._local_scopes, self._exec_strategy,
self._build_strategy, self._graph)
self._loss_name if self._loss_name else six.u(''), self._scope,
self._local_scopes, self._exec_strategy, self._build_strategy,
self._graph)
def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config)
......
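The `six.u('')` fallback survives the cleanup: on Python 3 `six.u` is the identity on `str` literals, so the expression just normalizes a falsy `_loss_name` to the empty string before it reaches `core.ParallelExecutor`. A quick check, assuming `six` is importable (this file still imports it):

```python
import six

loss_name = None
loss = loss_name if loss_name else six.u('')
assert loss == '' and isinstance(loss, str)   # six.u('') is just '' on Python 3
```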
@@ -18,7 +18,6 @@ try:
from tqdm import tqdm
except:
from .utils import tqdm
from ..... import compat as cpt
from .... import core
from ....framework import IrGraph
from ....framework import IrNode
@@ -400,7 +399,7 @@ class QuantizationTransformPass(object):
def _create_global_step(self, graph):
if self._weight_quantize_type == 'range_abs_max' or \
self._activation_quantize_type == 'range_abs_max':
counter_name = cpt.to_text('@STEP_COUNTER@')
counter_name = '@STEP_COUNTER@'
for node in graph.all_var_nodes():
if node.name() == counter_name:
self._global_step = node
@@ -1339,7 +1338,7 @@ class ConvertToInt8Pass(object):
def _convert_to_int8(self, graph, var_node):
int8_var_node_name = var_node.name() + ".int8"
int8_var_node = graph.create_persistable_node(
name=cpt.to_text(int8_var_node_name),
name=int8_var_node_name,
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=core.VarDesc.VarType.INT8)
......
@@ -125,7 +125,6 @@ def _append_loaded_suffix(name):
e.g. x ==> x.load_0, x.load_0 ==> x.load_0.load_0
"""
suffix = LOADED_VAR_SUFFIX
name = cpt.to_text(name)
new_name = unique_name.generate_with_ignorable_key('.'.join((name, suffix)))
return new_name
......
@@ -1390,7 +1390,6 @@ class Variable(object):
self.error_clip = error_clip
is_new_var = False
name = cpt.to_text(name)
self.desc = self.block.desc.find_var(name.encode())
if self.desc is None:
@@ -1757,8 +1756,7 @@ class Variable(object):
if with_details:
additional_attr = ("error_clip", )
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
res_str += "%s: %s\n" % (attr_name, getattr(self, attr_name))
return res_str
@@ -1900,7 +1898,7 @@ class Variable(object):
dtype='float32')
print("name of current Var is: {}".format(new_variable.name))
"""
return cpt.to_text(self.desc.name())
return self.desc.name()
@property
def grad_name(self):
@@ -2807,7 +2805,7 @@ class Operator(object):
elif isinstance(arg, six.binary_type):
in_arg_names.append(arg.decode())
elif isinstance(arg, (Variable, core.VarBase)):
in_arg_names.append(cpt.to_text(arg.name))
in_arg_names.append(arg.name)
else:
raise TypeError(
"The type of '%s' in operator %s should be "
@@ -2843,7 +2841,7 @@ class Operator(object):
if isinstance(arg, six.string_types):
out_arg_names.append(arg)
else:
out_arg_names.append(cpt.to_text(arg.name))
out_arg_names.append(arg.name)
# TODO(minqiyang): could we remove variable's op in static mode?
if not _non_static_mode():
if isinstance(arg, six.string_types):
@@ -3660,8 +3658,8 @@ class Block(object):
Rename variable in vars and ops' inputs and outputs
Args:
name(str): the name that need to be renamed.
new_name(str): the name that need to rename to.
name(bytes): the name that need to be renamed.
new_name(bytes): the name that need to rename to.
Raises:
ValueError: If this block doesn't have this the giving name,
@@ -3671,8 +3669,8 @@ class Block(object):
Returns:
Variable: the Variable with the giving name.
"""
name = cpt.to_text(name)
new_name = cpt.to_text(new_name)
name = name.decode()
new_name = new_name.decode()
if not self.has_var(name):
raise ValueError("var %s is not in current block" % name)
@@ -6643,8 +6641,7 @@ class Parameter(Variable):
additional_attr = ("trainable", "optimize_attr", "regularizer",
"do_model_average", "need_clip")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
res_str += "%s: %s\n" % (attr_name, getattr(self, attr_name))
else:
res_str = Variable.to_string(self, throw_on_error, False)
return res_str
......
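The pattern in this file is a clean `str`/`bytes` boundary: Python-side variable names stay `str`, the C++ desc layer exchanges `bytes`, so names are encoded on the way in (`find_var(name.encode())`) and decoded on the way out (`_rename_var` now documents `bytes` parameters and decodes them itself). A toy illustration of the convention, with hypothetical names and no Paddle objects involved:

```python
name = "fc_0.w_0"                  # Python-side names are str
desc_key = name.encode()           # encode at the C++ desc boundary
assert desc_key.decode() == name   # decode when bytes come back out
```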
@@ -23,7 +23,6 @@ from functools import reduce
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
import paddle.compat as cpt
from paddle.fluid.transpiler.details.program_utils import delete_ops
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
@@ -593,7 +592,7 @@ def ps_gpu_pass(program):
if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse":
continue
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, cpt.to_text(set()), [])
op.desc, set(), [])
for op_desc in grad_op_desc:
new_op_desc = program.global_block().desc.append_op()
new_op_desc.copy_from(op_desc)
......
@@ -42,7 +42,6 @@ from .reader import *
from . import dataloader
from .dataloader import *
from . import core
from .. import compat as cpt
from paddle.utils import deprecated
from paddle.fluid.framework import static_only
@@ -1660,7 +1659,7 @@ def _save_persistable_nodes(executor, dirname, graph):
persistable_nodes = []
all_persistable_nodes = graph.all_persistable_nodes()
for node in all_persistable_nodes:
name = cpt.to_text(node.name())
name = node.name()
if name not in persistable_node_names:
persistable_node_names.add(name)
persistable_nodes.append(node)
@@ -1695,7 +1694,7 @@ def _load_persistable_nodes(executor, dirname, graph):
persistable_nodes = []
all_persistable_nodes = graph.all_persistable_nodes()
for node in all_persistable_nodes:
name = cpt.to_text(node.name())
name = node.name()
if name not in persistable_node_names:
persistable_node_names.add(name)
persistable_nodes.append(node)
......
@@ -2431,11 +2431,10 @@ class ConditionalBlock(object):
for inner_input_name in params:
inner_var = parent_block._find_var_recursive(inner_input_name)
if inner_var:
param_list.append(cpt.to_text(inner_var.name))
param_list.append(inner_var.name)
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
conditional_block_op.desc, cpt.to_text(set()),
[grad_sub_block.desc])
conditional_block_op.desc, set(), [grad_sub_block.desc])
# append op_desc in grad_op_descs to target_block
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
......
@@ -26,7 +26,6 @@ import tarfile
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
import paddle.compat as cpt
const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001))
const_bias_attr = const_para_attr
@@ -856,7 +855,7 @@ class DataReader(object):
f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname):
line = cpt.to_text(line)
line = line.decode()
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src
and len(fields) == 2) or (self._only_src
@@ -869,7 +868,7 @@ class DataReader(object):
with open(fpath, "rb") as f:
for line in f:
line = cpt.to_text(line)
line = line.decode()
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src
and len(fields) == 2) or (self._only_src
@@ -881,7 +880,7 @@ class DataReader(object):
word_dict = {}
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
line = cpt.to_text(line)
line = line.decode()
if reverse:
word_dict[idx] = line.strip("\n")
else:
......
@@ -17,7 +17,6 @@ import tarfile
import numpy as np
from paddle.io import Dataset
import paddle.compat as cpt
from paddle.dataset.common import _check_exists_and_download
__all__ = []
@@ -182,8 +181,8 @@ class Conll05st(Dataset):
labels = []
one_seg = []
for word, label in zip(words_file, props_file):
word = cpt.to_text(word.strip())
label = cpt.to_text(label.strip().split())
word = word.strip().decode()
label = label.strip().decode().split()
if len(label) == 0: # end of sentence
for i in range(len(one_seg[0])):
......
@@ -17,7 +17,6 @@ import zipfile
import re
from paddle.io import Dataset
import paddle.compat as cpt
from paddle.dataset.common import _check_exists_and_download
__all__ = []
@@ -161,7 +160,7 @@ class Movielens(Dataset):
categories_set = set()
with package.open('ml-1m/movies.dat') as movie_file:
for i, line in enumerate(movie_file):
line = cpt.to_text(line, encoding='latin')
line = line.decode(encoding='latin')
movie_id, title, categories = line.strip().split('::')
categories = categories.split('|')
for c in categories:
@@ -180,7 +179,7 @@ class Movielens(Dataset):
with package.open('ml-1m/users.dat') as user_file:
for line in user_file:
line = cpt.to_text(line, encoding='latin')
line = line.decode(encoding='latin')
uid, gender, age, job, _ = line.strip().split("::")
self.user_info[int(uid)] = UserInfo(index=uid,
gender=gender,
@@ -193,7 +192,7 @@ class Movielens(Dataset):
with zipfile.ZipFile(self.data_file) as package:
with package.open('ml-1m/ratings.dat') as rating:
for line in rating:
line = cpt.to_text(line, encoding='latin')
line = line.decode(encoding='latin')
if (np.random.random() < self.test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid)
......
@@ -17,7 +17,6 @@ import numpy as np
import six
from paddle.io import Dataset
import paddle.compat as cpt
from paddle.dataset.common import _check_exists_and_download
__all__ = []
@@ -111,7 +110,7 @@ class WMT14(Dataset):
out_dict = dict()
for line_count, line in enumerate(fd):
if line_count < size:
out_dict[cpt.to_text(line.strip())] = line_count
out_dict[line.strip().decode()] = line_count
else:
break
return out_dict
@@ -140,7 +139,7 @@ class WMT14(Dataset):
]
for name in names:
for line in f.extractfile(name):
line = cpt.to_text(line)
line = line.decode()
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
......
@@ -22,7 +22,6 @@ from collections import defaultdict
import paddle
from paddle.io import Dataset
import paddle.compat as cpt
from paddle.dataset.common import _check_exists_and_download
__all__ = []
@@ -152,16 +151,16 @@ class WMT16(Dataset):
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = cpt.to_text(line.strip())
word_dict[idx] = line.strip().decode()
else:
word_dict[cpt.to_text(line.strip())] = idx
word_dict[line.strip().decode()] = idx
return word_dict
def _build_dict(self, dict_path, dict_size, lang):
word_dict = defaultdict(int)
with tarfile.open(self.data_file, mode="r") as f:
for line in f.extractfile("wmt16/train"):
line = cpt.to_text(line)
line = line.decode()
line_split = line.strip().split("\t")
if len(line_split) != 2: continue
sen = line_split[0] if self.lang == "en" else line_split[1]
@@ -195,7 +194,7 @@ class WMT16(Dataset):
self.trg_ids_next = []
with tarfile.open(self.data_file, mode="r") as f:
for line in f.extractfile("wmt16/{}".format(self.mode)):
line = cpt.to_text(line)
line = line.decode()
line_split = line.strip().split("\t")
if len(line_split) != 2:
continue
......