From 8dc43fa811d2110598d2894e420bd9af4cecf03d Mon Sep 17 00:00:00 2001
From: yanliang567 <82361606+yanliang567@users.noreply.github.com>
Date: Tue, 24 May 2022 10:47:58 +0800
Subject: [PATCH] Add test for numpy files in diff folders (#17175)

Signed-off-by: yanliang567
---
 .../python_client/bulk_load/test_bulk_load.py | 86 ++++++++++++++-----
 1 file changed, 63 insertions(+), 23 deletions(-)

diff --git a/tests/python_client/bulk_load/test_bulk_load.py b/tests/python_client/bulk_load/test_bulk_load.py
index a3f48637f..2d00b37a5 100644
--- a/tests/python_client/bulk_load/test_bulk_load.py
+++ b/tests/python_client/bulk_load/test_bulk_load.py
@@ -847,11 +847,10 @@ class TestBulkLoad(TestcaseBase):
                              "limit": 1})
 
     @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("auto_id", [True])
+    @pytest.mark.parametrize("auto_id", [True, False])
     @pytest.mark.parametrize("dim", [6])
-    @pytest.mark.parametrize("entities", [10])
-    @pytest.mark.parametrize("file_nums", [2])  # 32, max task nums 32? need improve
-    @pytest.mark.xfail(reason="only one numpy file imported successfully, issue #16992")
+    @pytest.mark.parametrize("entities", [1000])
+    @pytest.mark.parametrize("file_nums", [10])
     def test_multi_numpy_files_from_diff_folders(self, auto_id, dim, entities, file_nums):
         """
         collection schema 1: [pk, float_vector]
@@ -859,18 +858,10 @@ class TestBulkLoad(TestcaseBase):
         data file: .npy files in different folders
         Steps:
         1. create collection
         2. import data
-        3. if row_based: verify import failed
-        4. if column_based:
-            4.1 verify the data entities equal the import data
-            4.2 verify search and query successfully
+        3. verify importing numpy files in a loop succeeds
         """
         row_based = False  # numpy files supports only column based
-        data_fields = [df.vec_field]
-        if not auto_id:
-            data_fields.append(df.pk_field)
-        files = prepare_bulk_load_numpy_files(rows=entities, dim=dim,
-                                              data_fields=data_fields,
-                                              file_nums=file_nums, force=True)
+        self._connect()
         c_name = cf.gen_unique_str()
         fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True),
@@ -882,16 +873,22 @@ class TestBulkLoad(TestcaseBase):
         self.collection_wrap.create_index(field_name=df.vec_field, index_params=index_params)
         # load collection
         self.collection_wrap.load()
-        t0 = time.time()
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=row_based,
-                                                  files=files)
+
+        data_fields = [df.vec_field]
+        if not auto_id:
+            data_fields.append(df.pk_field)
+        for i in range(file_nums):
+            files = prepare_bulk_load_numpy_files(rows=entities, dim=dim,
+                                                  data_fields=data_fields,
+                                                  file_nums=1, force=True)
+            task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
+                                                      row_based=row_based,
+                                                      files=files)
         success, states = self.utility_wrap.\
             wait_for_bulk_load_tasks_completed(task_ids=task_ids,
                                                target_state=BulkLoadStates.BulkLoadPersisted,
                                                timeout=30)
-        tt = time.time() - t0
-        log.info(f"bulk load state:{success} in {tt}")
+        log.info(f"bulk load state:{success}")
 
         assert success
         log.info(f" collection entities: {self.collection_wrap.num_entities}")
@@ -1734,6 +1731,51 @@ class TestBulkLoadInvalidParams(TestcaseBase):
         # res, _ = self.collection_wrap.query(expr=f"{float_field} in [1.0]", output_fields=[float_field])
         # assert res[0].get(float_field, 0) == 1.0
 
+    @pytest.mark.tags(CaseLabel.L3)
+    @pytest.mark.parametrize("auto_id", [True, False])
+    @pytest.mark.parametrize("dim", [6])
+    @pytest.mark.parametrize("entities", [10])
+    @pytest.mark.parametrize("file_nums", [2])
+    def test_multi_numpy_files_from_diff_folders_in_one_request(self, auto_id, dim, entities, file_nums):
+        """
+        collection schema 1: [pk, float_vector]
+        data file: .npy files in different folders
+        Steps:
+        1. create collection
+        2. import data
+        3. verify the import fails with errors
+        """
+        row_based = False  # numpy files support only column-based import
+        data_fields = [df.vec_field]
+        if not auto_id:
+            data_fields.append(df.pk_field)
+        files = prepare_bulk_load_numpy_files(rows=entities, dim=dim,
+                                              data_fields=data_fields,
+                                              file_nums=file_nums, force=True)
+        self._connect()
+        c_name = cf.gen_unique_str()
+        fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True),
+                  cf.gen_float_vec_field(name=df.vec_field, dim=dim)]
+        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
+        self.collection_wrap.init_collection(c_name, schema=schema)
+
+        t0 = time.time()
+        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
+                                                  row_based=row_based,
+                                                  files=files)
+        success, states = self.utility_wrap. \
+            wait_for_bulk_load_tasks_completed(task_ids=task_ids,
+                                               target_state=BulkLoadStates.BulkLoadPersisted,
+                                               timeout=30)
+        tt = time.time() - t0
+        log.info(f"bulk load state:{success} in {tt}")
+
+        assert not success
+        failed_reason = "duplicate file"
+        for state in states.values():
+            assert state.state_name == "BulkLoadFailed"
+            assert failed_reason in state.infos.get("failed_reason", "")
+        assert self.collection_wrap.num_entities == 0
 
 
     # TODO: string data on float field
@@ -1810,6 +1852,4 @@ class TestBulkLoadAdvanced(TestcaseBase):
                              "limit": 1})
         # self.collection_wrap.query(expr=f"id in {ids}")
 
-    """Validate data consistency and availability during import"""
-
-
+    """Validate data consistency and availability during import"""
\ No newline at end of file
--
GitLab
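
As background for the tests above, a minimal sketch of the column-based file layout that a helper like prepare_bulk_load_numpy_files presumably produces: one .npy array per field, with the same file names repeated across folders. The make_numpy_import_folder helper, the folder paths, and the "float_vector.npy"/"pk.npy" file names are illustrative assumptions, not part of this patch; only plain numpy calls are used.

    import os
    import numpy as np

    def make_numpy_import_folder(folder, rows, dim, with_pk):
        # Hypothetical stand-in for prepare_bulk_load_numpy_files: writes one
        # column-based .npy file per field into `folder` and returns the paths.
        os.makedirs(folder, exist_ok=True)
        rng = np.random.default_rng()
        paths = []
        vec_path = os.path.join(folder, "float_vector.npy")  # assumed field file name
        np.save(vec_path, rng.random((rows, dim), dtype=np.float32))  # shape (rows, dim)
        paths.append(vec_path)
        if with_pk:  # explicit primary keys are only needed when auto_id=False
            pk_path = os.path.join(folder, "pk.npy")
            np.save(pk_path, np.arange(rows, dtype=np.int64))
            paths.append(pk_path)
        return paths

    # Ten folders holding identically named files, mirroring the loop-import test:
    all_files = [make_numpy_import_folder(f"/tmp/bulk_load/folder_{i}", 1000, 6, True)
                 for i in range(10)]

Importing one such folder per bulk_load request, as the reworked loop test does, gives each task a distinct source, while handing several of these identically named files to a single request is exactly what the new negative test expects to be rejected with a "duplicate file" failed_reason.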