# Reconstructed from the patch "add fs list_files_info (#36224)"
# (commit ca16e8fd7bd1bf27abb9b2cea053b9f98eddea76), whose line breaks were
# lost. The two definitions below are the code that patch adds.


# Added to class HDFSClient in python/paddle/distributed/fleet/utils/fs.py.
def list_files_info(self, path_list):
    """
    List the given paths with a single `ls` command and report each
    entry's path and size.

    Args:
        path_list(list): paths to list; they are joined with spaces into
            one command line.

    Returns:
        list: one ``{'path': str, 'size': int}`` dict per parsable output
            line; ``[]`` when `path_list` is empty or the command
            produced no output lines.
    """
    if not path_list:
        return []

    # Listing every path in one command is faster than running
    # 'hadoop ls' once per path.
    cmd = ("ls " + " ".join(path_list) +
           " | awk '{if ($8 != \"\") {print $5\" \"$8 }}'")
    # The status returned by _run_cmd is not consulted; only the output
    # lines are parsed.
    _, lines = self._run_cmd(cmd)
    if not lines:
        logger.warning("list_files empty, path[%s]", path_list)
        return []

    file_list = []
    for line in lines:
        # awk prints "<field 5> <field 8>", consumed here as "<size> <path>".
        fields = line.split(' ')
        if len(fields) < 2:
            continue
        try:
            file_size = int(fields[0])
        except ValueError:
            # The size column is not a number, so this is not a listing
            # entry: skip it instead of aborting the whole listing.
            continue
        file_list.append({'path': fields[1], 'size': file_size})

    return file_list


# Added to class FSTestBase in
# python/paddle/fluid/tests/unittests/hdfs_test_utils.py. The same patch calls
# it as `self._test_list_files_info(fs)` from class FSTest2 in
# python/paddle/fluid/tests/unittests/test_hdfs2.py.
def _test_list_files_info(self, fs):
    """Exercise fs.list_files_info with an empty list and with one path,
    before and after that path is touched."""
    flag = "./list_files_info.flag"
    fs.list_files_info([])
    fs.list_files_info([flag])
    # NOTE(review): the original passed the one-element list to touch() and
    # delete(); this assumes both take a single path string -- confirm
    # against the FS interface.
    fs.touch(flag, exist_ok=True)
    fs.list_files_info([flag])
    fs.delete(flag)