未验证 提交 c4fcd143 编写于 作者: L littletomatodonkey 提交者: GitHub

refine dynamic sampling (#1256)

上级 169b629b
...@@ -32,12 +32,10 @@ class SimpleDataSet(Dataset): ...@@ -32,12 +32,10 @@ class SimpleDataSet(Dataset):
self.delimiter = dataset_config.get('delimiter', '\t') self.delimiter = dataset_config.get('delimiter', '\t')
label_file_list = dataset_config.pop('label_file_list') label_file_list = dataset_config.pop('label_file_list')
data_source_num = len(label_file_list) data_source_num = len(label_file_list)
if data_source_num == 1: ratio_list = dataset_config.get("ratio_list", [1.0])
ratio_list = [1.0] if isinstance(ratio_list, (float, int)):
else: ratio_list = [float(ratio_list)] * len(data_source_num)
ratio_list = dataset_config.pop('ratio_list')
assert sum(ratio_list) == 1, "The sum of the ratio_list should be 1."
assert len( assert len(
ratio_list ratio_list
) == data_source_num, "The length of ratio_list should be the same as the file_list." ) == data_source_num, "The length of ratio_list should be the same as the file_list."
...@@ -45,62 +43,32 @@ class SimpleDataSet(Dataset): ...@@ -45,62 +43,32 @@ class SimpleDataSet(Dataset):
self.do_shuffle = loader_config['shuffle'] self.do_shuffle = loader_config['shuffle']
logger.info("Initialize indexs of datasets:%s" % label_file_list) logger.info("Initialize indexs of datasets:%s" % label_file_list)
self.data_lines_list, data_num_list = self.get_image_info_list( self.data_lines = self.get_image_info_list(label_file_list, ratio_list)
label_file_list) self.data_idx_order_list = list(range(len(self.data_lines)))
self.data_idx_order_list = self.dataset_traversal( if mode.lower() == "train":
data_num_list, ratio_list, batch_size) self.shuffle_data_random()
self.shuffle_data_random()
self.ops = create_operators(dataset_config['transforms'], global_config) self.ops = create_operators(dataset_config['transforms'], global_config)
def get_image_info_list(self, file_list): def get_image_info_list(self, file_list, ratio_list):
if isinstance(file_list, str): if isinstance(file_list, str):
file_list = [file_list] file_list = [file_list]
data_lines_list = [] data_lines = []
data_num_list = [] for idx, file in enumerate(file_list):
for file in file_list:
with open(file, "rb") as f: with open(file, "rb") as f:
lines = f.readlines() lines = f.readlines()
data_lines_list.append(lines) lines = random.sample(lines,
data_num_list.append(len(lines)) round(len(lines) * ratio_list[idx]))
return data_lines_list, data_num_list data_lines.extend(lines)
return data_lines
def dataset_traversal(self, data_num_list, ratio_list, batch_size):
    """Build an interleaved read order across several datasets.

    The datasets are visited round-robin. In each round, dataset ``i``
    contributes up to ``max(round(batch_size * ratio_list[i]), 1)``
    consecutive sample indices, continuing from where it stopped in the
    previous round. Rounds repeat until every dataset has been fully
    consumed; a dataset that runs out early simply stops contributing.

    Args:
        data_num_list: Number of samples in each dataset.
        ratio_list: Per-dataset fraction of ``batch_size`` to take per
            round; indexed in parallel with ``data_num_list``.
        batch_size: Multiplied by each ratio to get the per-round quota.

    Returns:
        A list of ``(dataset_idx, sample_idx)`` tuples in read order.
        Every sample of every dataset appears exactly once.

    Raises:
        IndexError: If ``ratio_list`` is shorter than ``data_num_list``.
    """
    dataset_num = len(data_num_list)

    # Per-round quota for each dataset. The floor of 1 guarantees progress
    # even when batch_size * ratio rounds down to 0.
    select_num_list = [
        max(round(batch_size * ratio_list[dno]), 1)
        for dno in range(dataset_num)
    ]

    data_idx_order_list = []
    # Next unread sample index for each dataset.
    cur_index_sets = [0] * dataset_num

    # Keep doing rounds while at least one dataset still has unread samples.
    while any(
            cur < total
            for cur, total in zip(cur_index_sets, data_num_list)):
        for dataset_idx, total in enumerate(data_num_list):
            start = cur_index_sets[dataset_idx]
            # Clip the quota at the end of the dataset; an exhausted
            # dataset yields an empty range here.
            stop = min(start + select_num_list[dataset_idx], total)
            data_idx_order_list.extend(
                (dataset_idx, sample_idx)
                for sample_idx in range(start, stop))
            cur_index_sets[dataset_idx] = stop
    return data_idx_order_list
def shuffle_data_random(self): def shuffle_data_random(self):
if self.do_shuffle: if self.do_shuffle:
for dno in range(len(self.data_lines_list)): random.shuffle(self.data_lines)
random.shuffle(self.data_lines_list[dno])
return return
def __getitem__(self, idx): def __getitem__(self, idx):
dataset_idx, file_idx = self.data_idx_order_list[idx] file_idx = self.data_idx_order_list[idx]
data_line = self.data_lines_list[dataset_idx][file_idx] data_line = self.data_lines[file_idx]
try: try:
data_line = data_line.decode('utf-8') data_line = data_line.decode('utf-8')
substr = data_line.strip("\n").split(self.delimiter) substr = data_line.strip("\n").split(self.delimiter)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册