From 4dc175c9660ff1f7bd8e3db1c792cdf650b5ef5f Mon Sep 17 00:00:00 2001
From: weishengyu
Date: Sat, 4 Sep 2021 18:47:08 +0800
Subject: [PATCH] add writer_hard dataset and sampler

---
 ppcls/data/dataloader/__init__.py            |  4 +
 ppcls/data/dataloader/writer_hard_dataset.py | 38 +++++++++
 ppcls/data/dataloader/writer_hard_sampler.py | 82 ++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 ppcls/data/dataloader/writer_hard_dataset.py
 create mode 100644 ppcls/data/dataloader/writer_hard_sampler.py

diff --git a/ppcls/data/dataloader/__init__.py b/ppcls/data/dataloader/__init__.py
index 5d6fe91f..1a08f989 100644
--- a/ppcls/data/dataloader/__init__.py
+++ b/ppcls/data/dataloader/__init__.py
@@ -4,3 +4,7 @@ from ppcls.data.dataloader.common_dataset import create_operators
 from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild
 from ppcls.data.dataloader.logo_dataset import LogoDataset
 from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset
+from ppcls.data.dataloader.writer_hard_sampler import WriterHardSampler
+from ppcls.data.dataloader.mix_dataset import MixDataset
+from ppcls.data.dataloader.mix_sampler import MixSampler
+from ppcls.data.dataloader.writer_hard_dataset import WriterHardDataset
diff --git a/ppcls/data/dataloader/writer_hard_dataset.py b/ppcls/data/dataloader/writer_hard_dataset.py
new file mode 100644
index 00000000..83aacb59
--- /dev/null
+++ b/ppcls/data/dataloader/writer_hard_dataset.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import os
+
+from .common_dataset import CommonDataset
+
+
+class WriterHardDataset(CommonDataset):
+    def _load_anno(self, seed=None):
+        assert os.path.exists(self._cls_path)
+        assert os.path.exists(self._img_root)
+        self.images = []
+        self.labels = []
+
+        with open(self._cls_path) as fd:
+            self.anno_list = fd.readlines()
+            if seed is not None:
+                np.random.RandomState(seed).shuffle(self.anno_list)
+            for l in self.anno_list:
+                l = l.strip().split(" ")
+                self.images.append(os.path.join(self._img_root, l[0]))
+                self.labels.append(int(l[1]))
+                assert os.path.exists(self.images[-1])
diff --git a/ppcls/data/dataloader/writer_hard_sampler.py b/ppcls/data/dataloader/writer_hard_sampler.py
new file mode 100644
index 00000000..856ffd26
--- /dev/null
+++ b/ppcls/data/dataloader/writer_hard_sampler.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from collections import defaultdict
+import numpy as np
+import copy
+import random
+from paddle.io import DistributedBatchSampler
+
+from ppcls.data.dataloader.writer_hard_dataset import WriterHardDataset
+
+
+class WriterHardSampler(DistributedBatchSampler):
+    """
+    Sample N anchors per batch; for each anchor, sample 2 positives that share
+    the anchor's person id and 1 hard negative that shares its text id, so the batch size is N*4.
+    Args:
+    - dataset (WriterHardDataset): dataset whose anno_list lines are "img_path person_id text_id".
+    - batch_size (int): number of examples in a batch, must be a multiple of 4.
+    - args: extra keyword arguments, accepted and ignored.
+    """
+
+    def __init__(self, dataset, batch_size, **args):
+        super(WriterHardSampler, self).__init__(dataset, batch_size)
+        self.dataset = dataset
+        self.batch_size = batch_size
+        assert not self.batch_size % 4, "batch_size of WriterHardSampler should be a multiple of 4"
+        assert isinstance(dataset, WriterHardDataset), "WriterHardSampler only supports WriterHardDataset"
+        self.num_pids_per_batch = self.batch_size // 4
+        self.anchor_list = []
+        self.person_id_map = {}  # person_id -> indices of samples without a text id (positives)
+        self.text_id_map = {}  # text_id -> indices of non-anchor samples with that text id (hard negatives)
+        anno_list = dataset.anno_list
+        for i, anno_i in enumerate(anno_list):
+            _, person_id, text_id = anno_i.strip().split(" ")
+            if text_id != "-1":
+                if random.random() < 0.5:  # half of the text-labeled samples become anchors
+                    self.anchor_list.append([i, person_id, text_id])
+                else:
+                    if text_id in self.text_id_map:
+                        self.text_id_map[text_id].append(i)
+                    else:
+                        self.text_id_map[text_id] = [i]
+            else:
+                if person_id in self.person_id_map:
+                    self.person_id_map[person_id].append(i)
+                else:
+                    self.person_id_map[person_id] = [i]
+
+        assert len(self.anchor_list) >= self.num_pids_per_batch, "need at least batch_size // 4 anchors"
+
+    def __iter__(self):
+        random.shuffle(self.anchor_list)
+        for i in range(len(self)):
+            batch_indices = []
+            for j in range(self.batch_size // 4):
+                anchor = self.anchor_list[i * self.batch_size // 4 + j]
+                anchor_index = anchor[0]
+                anchor_person_id = anchor[1]
+                anchor_text_id = anchor[2]
+                person_indices = random.sample(self.person_id_map[anchor_person_id], 2)
+                text_index = random.choice(self.text_id_map[anchor_text_id])
+                batch_indices.append(anchor_index)
+                batch_indices += person_indices
+                batch_indices.append(text_index)
+            yield batch_indices
+
+    def __len__(self):
+        return len(self.anchor_list) * 4 // self.batch_size
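
Usage sketch: a minimal example of wiring the new dataset and sampler to a paddle
DataLoader. It assumes CommonDataset's constructor takes image_root, cls_label_path and
transform_ops, as the other PaddleClas datasets do; the paths, transform list, batch size
and worker count below are illustrative placeholders, not values from this patch.

    from paddle.io import DataLoader

    from ppcls.data.dataloader.writer_hard_dataset import WriterHardDataset
    from ppcls.data.dataloader.writer_hard_sampler import WriterHardSampler

    # Illustrative preprocessing ops; the real config would define these.
    transform_ops = [{"DecodeImage": {"to_rgb": True, "channel_first": False}},
                     {"ResizeImage": {"size": 224}}]

    # Placeholder annotation file; each line is "img_path person_id text_id",
    # where text_id is "-1" for samples that only carry a writer (person) id.
    dataset = WriterHardDataset(
        image_root="./dataset/writer/",                     # assumed argument name
        cls_label_path="./dataset/writer/train_list.txt",   # assumed argument name
        transform_ops=transform_ops)

    # batch_size must be a multiple of 4: each group is anchor + 2 positives + 1 hard negative.
    sampler = WriterHardSampler(dataset, batch_size=32)

    loader = DataLoader(dataset, batch_sampler=sampler, num_workers=4)
    for batch in loader:
        pass  # samples are grouped as [anchor, pos, pos, hard negative] * (batch_size // 4)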