From e6ae1e4ffce3a39f21a2ca7d0a7d2e9883f83528 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 8 Aug 2018 15:52:43 +0800 Subject: [PATCH] Replace the dependency of paddle.v2 dataset --- python/paddle/dataset/cifar.py | 2 +- python/paddle/dataset/wmt14.py | 10 ++++++---- python/paddle/dataset/wmt16.py | 9 +++++---- .../tests/unittests/test_data_balance.py | 2 +- .../tests/unittests/test_preprocessor.py | 4 ++-- .../fluid/tests/unittests/test_profiler.py | 2 +- .../tests/unittests/test_protobuf_descs.py | 19 ++++++++++--------- .../tests/unittests/test_reader_reset.py | 2 +- .../tests/unittests/test_recordio_reader.py | 4 ++-- 9 files changed, 29 insertions(+), 25 deletions(-) diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py index e399b5215f5..f6b4ff8fbd0 100644 --- a/python/paddle/dataset/cifar.py +++ b/python/paddle/dataset/cifar.py @@ -53,7 +53,7 @@ def reader_creator(filename, sub_name, cycle=False): yield (sample / 255.0).astype(numpy.float32), int(label) def reader(): - with tarfile.open(filename, mode='rb') as f: + with tarfile.open(filename, mode='r') as f: names = (each_item.name for each_item in f if sub_name in each_item.name) diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index 250fd03ffb9..7488e21f1fb 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -19,10 +19,12 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and parse training set and test set into paddle reader creators. """ +import six import tarfile import gzip import paddle.dataset.common +import paddle.fluid.compat as cpt __all__ = [ 'train', @@ -40,8 +42,8 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' 'wmt_shrinked_data/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 -URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' -MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' +URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz' +MD5_MODEL = '0791583d57d5beb693b9414c5b36798c' START = "" END = "" @@ -54,7 +56,7 @@ def __read_to_dict(tar_file, dict_size): out_dict = dict() for line_count, line in enumerate(fd): if line_count < size: - out_dict[line.strip()] = line_count + out_dict[cpt.to_literal_str(line.strip())] = line_count else: break return out_dict @@ -85,7 +87,7 @@ def reader_creator(tar_file, file_name, dict_size): ] for name in names: for line in f.extractfile(name): - line_split = line.strip().split('\t') + line_split = line.strip().split(six.b('\t')) if len(line_split) != 2: continue src_seq = line_split[0] # one source sequence diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 186f9476d81..cd34b523eb7 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -35,6 +35,7 @@ import gzip from collections import defaultdict import paddle.dataset.common +import paddle.fluid.compat as cpt __all__ = [ "train", @@ -82,16 +83,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False): dict_path = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)) if not os.path.exists(dict_path) or ( - len(open(dict_path, "r").readlines()) != dict_size): + len(open(dict_path, "rb").readlines()) != dict_size): __build_dict(tar_file, dict_size, dict_path, lang) word_dict = {} - with open(dict_path, "r") as fdict: + with open(dict_path, "rb") as fdict: for idx, line in enumerate(fdict): if reverse: - word_dict[idx] = line.strip() + word_dict[idx] = cpt.to_literal_str(line.strip()) else: - word_dict[line.strip()] = idx + word_dict[cpt.to_literal_str(line.strip())] = idx return word_dict diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index d3c7b6e7146..09edf05fd7b 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -14,7 +14,7 @@ import unittest import paddle.fluid as fluid -import paddle as paddle +import paddle import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py index cbf1a7e0c50..6a82746c61a 100644 --- a/python/paddle/fluid/tests/unittests/test_preprocessor.py +++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py @@ -15,9 +15,9 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle.dataset.mnist as mnist class TestPreprocessor(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 9f8d33f9bbf..705d01165ac 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -93,7 +93,7 @@ class TestProfiler(unittest.TestCase): "profiler is enabled only with GPU") def test_all_profiler(self): self.net_profiler('All', '/tmp/profile_out') - with open('/tmp/profile_out', 'r') as f: + with open('/tmp/profile_out', 'rb') as f: self.assertGreater(len(f.read()), 0) diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 621dd681345..2176db71b95 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -14,6 +14,7 @@ import unittest import paddle.fluid.core as core +import paddle.fluid.compat as cpt from paddle.fluid.framework import Program @@ -108,7 +109,7 @@ class TestVarDesc(unittest.TestCase): def test_shape(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_var') + var = block.var(cpt.to_bytes('my_var')) var.set_type(core.VarDesc.VarType.SELECTED_ROWS) src_shape = [3, 2, 10, 8] var.set_shape(src_shape) @@ -119,7 +120,7 @@ class TestVarDesc(unittest.TestCase): def test_multiple_shape(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_reader') + var = block.var(cpt.to_bytes('my_reader')) var.set_type(core.VarDesc.VarType.READER) src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]] var.set_shapes(src_shapes) @@ -130,7 +131,7 @@ class TestVarDesc(unittest.TestCase): def test_dtype(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_var') + var = block.var(cpt.to_bytes('my_var')) var.set_type(core.VarDesc.VarType.LOD_TENSOR) var.set_dtype(core.VarDesc.VarType.INT32) self.assertEqual(core.VarDesc.VarType.INT32, var.dtype()) @@ -139,7 +140,7 @@ class TestVarDesc(unittest.TestCase): def test_multiple_dtype(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_reader') + var = block.var(cpt.to_bytes('my_reader')) var.set_type(core.VarDesc.VarType.READER) src_types = [ core.VarDesc.VarType.INT32, core.VarDesc.VarType.FP64, @@ -152,7 +153,7 @@ class TestVarDesc(unittest.TestCase): def test_multiple_lod_level(self): program_desc = core.ProgramDesc() block = program_desc.block(0) - var = block.var('my_reader') + var = block.var(cpt.to_bytes('my_reader')) var.set_type(core.VarDesc.VarType.READER) src_types = [3, 1, 2] var.set_lod_levels(src_types) @@ -166,12 +167,12 @@ class TestBlockDesc(unittest.TestCase): self.assertIsNotNone(program_desc) block = program_desc.block(0) self.assertIsNotNone(block) - var1 = block.var("var1") - var2 = block.var("var2") - var3 = block.var("var3") + var1 = block.var(cpt.to_bytes("var1")) + var2 = block.var(cpt.to_bytes("var2")) + var3 = block.var(cpt.to_bytes("var3")) all_vars = block.all_vars() self.assertEqual(set(all_vars), {var1, var2, var3}) - var2_re = block.find_var("var2") + var2_re = block.find_var(cpt.to_bytes("var2")) self.assertEqual(var2_re, var2) def test_add_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index d3ab991c847..698612acf4a 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle.fluid as fluid -import paddle as paddle +import paddle import numpy as np import unittest diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 69a522e273d..09c3167152f 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -15,8 +15,8 @@ import unittest import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle +import paddle.dataset.mnist as mnist class TestRecordIO(unittest.TestCase): -- GitLab